blairzheng committed ec03973 (1 parent: e6088ac): add approx_gauss; add eigenvalue; replace contraction with non_expanding
Files changed:
- 7.1.png +0 -0
- App.py +11 -7
- C1.png +0 -0
- C2.png +0 -0
- D1.png +0 -0
- D2.png +0 -0
- D3.png +0 -0
- D4.png +0 -0
- D5.png +0 -0
- DPMInteractive.py +9 -4
- ExtraBlock.js +8 -5
- Misc.py +1 -1
- RenderMarkdown.py +35 -9
- RenderMarkdownEn.py +337 -82
- RenderMarkdownZh.py +313 -72
- data.json +0 -0
- fig2.png +0 -0
- fig3.png +0 -0
- fig4.png +0 -0
7.1.png ADDED
App.py CHANGED

@@ -13,8 +13,8 @@ from DPMInteractive import fixed_point_init_change, fixed_point_apply_iterate
 from DPMInteractive import forward_plot_part, backward_plot_part, fit_plot_part, fixed_plot_part
 from RenderMarkdown import md_introduction_block, md_transform_block, md_likelihood_block, md_posterior_block
 from RenderMarkdown import md_forward_process_block, md_backward_process_block, md_fit_posterior_block
-from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_cond_kl_block,
-from RenderMarkdown import md_reference_block, md_about_block
+from RenderMarkdown import md_posterior_transform_block, md_deconvolution_block, md_cond_kl_block, md_approx_gauss_block
+from RenderMarkdown import md_non_expanding_block, md_stationary_block, md_reference_block, md_about_block
 from Misc import g_css, js_head, js_load

@@ -145,7 +145,7 @@ def forward_block(seq_info_state):
 seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
 st_alpha = gr_number("start alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
 et_alpha = gr_number("end alpha", 0.001, 0.999, 0.98, 0.001, 3, min_width=80)
-step = gr.Slider(label="step", value=7, minimum=
+step = gr.Slider(label="step", value=7, minimum=1, maximum=15, step=1, min_width=80)
 apply_button = gr.Button(value="apply", min_width=80)

 node_plot = gr.Plot(label="latent variable's pdf", show_label=False)

@@ -236,7 +236,7 @@ def contraction_block():
 with gr.Row():
 ctr_init_seed = gr_number("random seed", 0, 1E6, 100, 1, 0, min_width=80)
 ctr_alpha = gr_number("alpha", 0.001, 0.999, 0.95, 0.001, 3, min_width=80)
-
+lambda_2 = gr_number("second largest eigenvalue", 0, 0, 1.0, 0.0001, 4, min_width=80)

 with gr.Row():
 inp_plot = gr.Plot(label="input variable pdf", min_width=80, show_label=False)

@@ -265,11 +265,11 @@ def contraction_block():
 power_mat_plot = gr.Plot(show_label=False)

 ctr_init_inputs = [ctr_init_seed, ctr_alpha, two_inputs_seed]
-ctr_init_outputs = [inp_plot, x_state, x_pdf_state, pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot]
+ctr_init_outputs = [inp_plot, x_state, x_pdf_state, pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot, lambda_2]
 ctr_init_seed.change(contraction_init_change, ctr_init_inputs, ctr_init_outputs, show_progress="minimal")

 ctr_alpha_inputs = [x_state, x_pdf_state, ctr_alpha, two_inputs_seed]
-ctr_alpha_outputs = [pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot]
+ctr_alpha_outputs = [pos_plot, out_plot, z_state, xcz_pdf_state, inp_out_plot, lambda_2]
 ctr_alpha.change(contraction_alpha_change, ctr_alpha_inputs, ctr_alpha_outputs, show_progress="minimal")

 ctr_apply_inputs, ctr_apply_outputs = [x_state, x_pdf_state, xcz_pdf_state, two_inputs_seed], [inp_out_plot]

@@ -348,7 +348,11 @@ def run_app():

 md_cond_kl_block()

-
+md_approx_gauss_block()
+
+md_non_expanding_block()
+
+md_stationary_block()

 md_reference_block()
C1.png ADDED
C2.png ADDED
D1.png ADDED
D2.png ADDED
D3.png ADDED
D4.png ADDED
D5.png ADDED
DPMInteractive.py CHANGED

@@ -697,15 +697,15 @@ def contraction_init_change(seed, alpha, two_inputs_seed):
 x_pdf = hijack(seed, x, x_pdf)

 # test
-x_pdf[x_pdf < 0.01] = 0
+# x_pdf[x_pdf < 0.01] = 0

 x_pdf = x_pdf / (x_pdf * g_res).sum()  # normalized to 1
 fig = plot_pdf(x, x_pdf, title="input variable pdf", titlesize=9)

 info = contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed)
-fig_xcz, fig_z, z, xcz_pdf, fig_inp_out = info
+fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2 = info

-return fig, x, x_pdf, fig_xcz, fig_z, z, xcz_pdf, fig_inp_out
+return fig, x, x_pdf, fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2


 def contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed):

@@ -721,9 +721,14 @@ def contraction_alpha_change(x, x_pdf, alpha, two_inputs_seed):
 fig_xcz = plot_2d_pdf(x, z, xcz_pdf, None, label="$q(x|z)$",
 title=xcz_title, titlesize=9, xlabel="z domain(cond)", ylabel="x domain")

+xcz = xcz_pdf/xcz_pdf.sum(axis=0, keepdims=True)
+evals = np.linalg.eigvals(xcz)
+evals = sorted(np.absolute(evals), reverse=True)
+lambda_2 = evals[1]
+
 fig_inp_out = contraction_apply(x, x_pdf, xcz_pdf, two_inputs_seed)

-return fig_xcz, fig_z, z, xcz_pdf, fig_inp_out
+return fig_xcz, fig_z, z, xcz_pdf, fig_inp_out, lambda_2


 def change_two_inputs_seed():
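As a rough illustration of what the new `lambda_2` value measures: the posterior density matrix q(x|z) is normalized column-by-column into a column-stochastic Markov transition matrix, and the magnitude of its second-largest eigenvalue bounds how fast repeated posterior transforms converge. The sketch below mirrors that computation on a made-up toy matrix; it is illustrative only and not code from the repository.

```python
import numpy as np

def second_largest_eigenvalue(xcz_pdf):
    """Normalize a posterior density matrix q(x|z) (one column per z) into a
    column-stochastic transition matrix and return |lambda_2|."""
    xcz = xcz_pdf / xcz_pdf.sum(axis=0, keepdims=True)   # each column sums to 1
    evals = np.linalg.eigvals(xcz)
    evals = sorted(np.abs(evals), reverse=True)
    return evals[1]                                      # evals[0] == 1 for a stochastic matrix

# toy 3-state posterior matrix; column j plays the role of q(x | z = j)
toy = np.array([[0.7, 0.2, 0.1],
                [0.2, 0.6, 0.2],
                [0.1, 0.2, 0.7]])
print(second_largest_eigenvalue(toy))   # ~0.6; smaller values mean faster convergence
```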
ExtraBlock.js CHANGED

@@ -2,8 +2,9 @@


 async function write_markdown() {
-let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
-"
+let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
+"backward_process", "fit_posterior", "posterior_transform", "deconvolution",
+"cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];
 // names = names.slice(-1)

 let data = await fetch("file/data.json").then(response => response.json());

@@ -24,8 +25,9 @@


 async function insert_markdown() {
-let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
-"
+let names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
+"backward_process", "fit_posterior", "posterior_transform", "deconvolution",
+"cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];

 let data = await fetch("file/data.json").then(response => response.json());

@@ -54,7 +56,8 @@

 function control_language() {
 const names = ["introduction", "transform", "likelihood", "posterior", "forward_process",
-
+"backward_process", "fit_posterior", "posterior_transform", "deconvolution",
+"cond_kl", "approx_gauss", "non_expanding", "stationary", "reference", "about"];

 var is_zh = document.getElementById("switch_language").checked;
 for (let i = 0; i < names.length; i++) {
Misc.py CHANGED

@@ -18,7 +18,7 @@ g_css = """
 .first_md span{font-size: 140%; font-weight: bold; color: orange}
 .normal span{font-size: 100%; font-weight: normal; color: black}
 .second span{font-size: 100%; font-weight: bold; color: blue}
-.mds div{margin-top: 10px; margin-bottom: 20px; margin-left:10px; margin-right:10px; font-size:16px}
+.mds div{margin-top: 10px; margin-bottom: 20px; margin-left:10px; margin-right:10px; font-size:16px;}
 .gps div{margin-top: 10px; margin-bottom: 20px;}

 .switchbar {position: relative; display: inline-block; width: 60px; height: 30px; margin-left: 10px; margin-right: 10px}
RenderMarkdown.py CHANGED

@@ -3,13 +3,13 @@ import gradio as gr

 from RenderMarkdownZh import md_introduction_zh, md_transform_zh, md_likelihood_zh, md_posterior_zh
 from RenderMarkdownZh import md_forward_process_zh, md_backward_process_zh, md_fit_posterior_zh
-from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_cond_kl_zh,
-from RenderMarkdownZh import md_reference_zh, md_about_zh
+from RenderMarkdownZh import md_posterior_transform_zh, md_deconvolution_zh, md_cond_kl_zh, md_approx_gauss_zh
+from RenderMarkdownZh import md_non_expanding_zh, md_stationary_zh, md_reference_zh, md_about_zh

 from RenderMarkdownEn import md_introduction_en, md_transform_en, md_likelihood_en, md_posterior_en
 from RenderMarkdownEn import md_forward_process_en, md_backward_process_en, md_fit_posterior_en
-from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_cond_kl_en,
-from RenderMarkdownEn import md_reference_en, md_about_en
+from RenderMarkdownEn import md_posterior_transform_en, md_deconvolution_en, md_cond_kl_en, md_approx_gauss_en
+from RenderMarkdownEn import md_non_expanding_en, md_stationary_en, md_reference_en, md_about_en


 def md_introduction_block(md_type="offline"):

@@ -137,14 +137,40 @@ def md_cond_kl_block(md_type="offline"):
 return


-def
+def md_approx_gauss_block(md_type="offline"):
 if md_type == "offline":
-title = "Appendix B
-gr.Accordion(label=title, elem_classes="first_md", elem_id="
+title = "Appendix B When does the Posterior Approximate to Gaussian ?"
+gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss")
 elif md_type == "zh":
-
+md_approx_gauss_zh()
 elif md_type == "en":
-
+md_approx_gauss_en()
+else:
+raise NotImplementedError
+return
+
+
+def md_non_expanding_block(md_type="offline"):
+if md_type == "offline":
+title = "Appendix C Posterior Transform is a Non-expanding Mapping"
+gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding")
+elif md_type == "zh":
+md_non_expanding_zh()
+elif md_type == "en":
+md_non_expanding_en()
+else:
+raise NotImplementedError
+return
+
+
+def md_stationary_block(md_type="offline"):
+if md_type == "offline":
+title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
+gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary")
+elif md_type == "zh":
+md_stationary_zh()
+elif md_type == "en":
+md_stationary_en()
 else:
 raise NotImplementedError
 return
RenderMarkdownEn.py CHANGED

Old version (hunk headers and removed lines; unchanged context appears in the new version below):

@@ -82,15 +82,21 @@ def md_posterior_en():
-When $z$
-q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad
-&=

@@ -98,19 +104,27 @@ def md_posterior_en():
-<li>
-<li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, and $q(x|z)$ for different $z$ almost become identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>

@@ -153,18 +167,22 @@ def md_forward_process_en():
-Comparing the forms of Equation 4.8 and Equation 2.1, it can be found that their forms are completely consistent.
-If considering
-It can be seen that, after applying two transforms, the transformed distributions $q(z_T|x)$ are the same. Thus, $q(z_T)$ is also the same.

@@ -193,7 +211,7 @@ def md_backward_process_en():
-<li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values becomes a series of Dirac delta functions and zero functions. In this state, as long as the <em>support

@@ -201,11 +219,13 @@ def md_backward_process_en():
-Since
-<b>

@@ -296,15 +316,17 @@ def md_posterior_transform_en():

@@ -312,7 +334,7 @@ def md_posterior_transform_en():
-According to the

@@ -325,23 +347,34 @@ def md_posterior_transform_en():
-The relationship between the converged distribution and the input distribution q(x) cannot be rigorously proven at present.
-Wherein, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q_i(x)$ is any arbitrary input distribution, and $q_o(x)$ is the output distribution obtained after transforming $q_i(z)$.
-From the above discussion,

@@ -370,7 +403,8 @@ def md_cond_kl_en():

@@ -385,7 +419,8 @@ def md_cond_kl_en():

@@ -434,12 +469,12 @@ def md_cond_kl_en():
-<span id="
-<span id="

@@ -451,72 +486,273 @@ def md_cond_kl_en():
-def
-title = "Appendix B
-with gr.Accordion(label=title, elem_classes="first_md", elem_id="
-The
-<ol style="list-style-type:decimal">
-<li> When $q(x)$ is greater than 0, the posterior transform matrix $q(x|z)$ will be greater than 0 too. Therefore, this matrix is the transition matrix of an $\textcolor{red}{\text{irreducible}}\ \textcolor{green}{\text{aperiodic}}$ Markov Chain. According to the conclusion of the literature <a href="#mc_basic_p6">[13]</a>, this transformation is a contraction mapping with respect to Total Variance metric. Therefore, according to the Banach fixed-point theorem, this transformation has a unique fixed point(converged point). </li>
-<li> When $q(x)$ is partially greater than 0, and the support of $q(x)$ (the region where $q(x)$ is greater than 0) consists only one connected component (Figure 2), several conclusions can be drawn from equation (3.4):
-<li> When $z$ and $x$ are within the support set, since both $q(x)$ and GaussFun are greater than 0, the diagonal elements of the transfer matrix $\{q(x|z)|z=x\}$ are greater than 0. This means that the state within the support set is $\textcolor{green}{\text{aperiodic}}$. </li>
-Therefore, it can be concluded that when the state is confined within the support set and two extension regions, $\lim_{n\to\infty}{q(x|z)^n}$ will converge to a fixed matrix, and each column vector is identical. Hence, for any input distribution, if posterior transforms are continuously applied, it will eventually converge to a fixed distribution, which is equal to the column vector of the converged matrix. Based on the conclusion from the literature <a href=\"#fp_converse\">[9]</a>, when a iterative transform converges to a unique fixed point, this transform is a Contraction Mapping with respect to a certain metric.
-</p>
-<li> When $q(x)$ is partially greater than 0, and multiple connected component exist in the support set of $q(x)$, and the maximum distance of each connected component <b>cannot</b> be covered by the support set of corresponding GaussFun, the states within each component <b>constitute multiple Communicate Classes</b>, as shown in Figure 4. Under such circumstances, as $n\to\infty$, $q(x|z)^n$ will also converge to a fixed matrix, but not all the column vectors are identical. Therefore, the posterior transforma is not a strict contraction mapping. However, when the state of the input distribution is confined to a single Communicate Class and its corresponding extension, the posterior transform is also a contraction mapping with a unique convergence point. </li>
-<center> Figure 4: Two components which <b>cannot</b> communicate with each other </center>
 \begin{align}
-&= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
-&= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
-&\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
-&= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
-&= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
 \end{align}

@@ -524,7 +760,6 @@ def md_reference_en():
 with gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference"):
 <a id="dpm" href="https://arxiv.org/abs/1503.03585"> [1] Deep Unsupervised Learning Using Nonequilibrium Thermodynamics </a>

@@ -542,21 +777,37 @@ def md_reference_en():
 <a id="mc_limit" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Book%3A_Introductory_Probability_(Grinstead_and_Snell)/11%3A_Markov_Chains/11.04%3A_Fundamental_Limit_Theorem_for_Regular_Chains"> [7] Fundamental Limit Theorem for Regular Chains </a>
 <a id="mc_basic_p6" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [8] Markov Chain:Basic Theory - Proposition 6 </a>
 <a id="fp_converse" href="https://arxiv.org/abs/1702.07339"> [9] A Converse to Banach's Fixed Point Theorem and its CLS Completeness </a>
 <a id="ce_kl" href="https://en.wikipedia.org/wiki/Cross-entropy#Cross-entropy_minimization"> [10] Cross-entropy minimization </a>
 <a id="deconv_1" href="https://thewolfsound.com/deconvolution-inverse-convolution/"> [11] Deconvolution Using Frequency-Domain Division </a>
 <a id="deconv_2" href="https://www.strollswithmydog.com/deconvolution-by-division-in-the-frequency-domain/"> [12] deconvolution-by-division-in-the-frequency-domain </a>
 <a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
 <a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
 <a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>

@@ -603,7 +854,11 @@ def run_app():
 md_cond_kl_en()
-
 md_reference_en()
New version of md_posterior_en():

 q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
 \end{align}

+When $z$ is fixed, $q(z)$ is a constant, so $q(x|z)$ is a probability density function with respect to $x$, and its shape depends only on $q(z|x)q(x)$.
 \begin{align}
 q(x|z) \propto q(z|x)q(x) \qquad where\ z\ is\ fixed \tag{3.2}
 \end{align}

+In fact, $q(z)=\int q(z|x)q(x)dx$, which means that $q(z)$ is the sum over $x$ of the function $q(z|x)q(x)$. Therefore, dividing $q(z|x)q(x)$ by $q(z)$ is equivalent to normalizing $q(z|x)q(x)$.
+\begin{align}
+q(x|z) = \operatorname{Normalize}\big(q(z|x)q(x)\big) \tag{3.3}
+\end{align}
+
 From Equation 2.1, we can see that $q(z|x)$ is a Gaussian distribution, so we have
 \begin{align}
+q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &\text{where z is fixed} \notag \newline
+&= \frac{1}{\sqrt{\alpha}}\frac{1}{\sqrt{2\pi\frac{1-\alpha}{\alpha}}}\exp{\frac{-(\frac{z}{\sqrt{\alpha}}-x)^2}{2\frac{1-\alpha}{\alpha}}}\ q(x)& \notag \newline
+&= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
 \end{align}

 It can be observed that the <b>GaussFun</b> part is a Gaussian function of $x$, with a mean of $\frac{z}{\sqrt{\alpha}}$ and a standard deviation of $\sqrt{\frac{1-\alpha}{\alpha}}$, so the shape of $q(x|z)$ is determined by **the product of GaussFun and q(x)**.

 According to the characteristics of <em>multiplication</em>, the characteristics of the shape of the $q(x|z)$ function can be summarized.

 <ul>
+<li>The support set of $q(x|z)$ should be contained within the support set of GaussFun. The support set of GaussFun is a hypersphere, centered at the mean $\mu$ with a radius of approximately 3 times the standard deviation $\sigma$. </li>
+
+<li>When the variance of the Gaussian function is small (small noise), or when $q(x)$ changes linearly, the shape of $q(x|z)$ will approximate to the Gaussian function, and have a simpler function form, which is convenient for modeling and learning.</li>

 <li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
 </ul>

+<a href="#approx_gauss">Appendix B</a> provides a more rigorous analysis. When $\sigma$ satisfies certain conditions, $q(x|z)$ approximates a Gaussian distribution.
+
 The specifics can be seen in <a href="#demo_2">Demo 2</a>. The fourth figure presents the shape of the posterior $q(x|z)$, which shows an irregular shape and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to be uniform and straight. Readers can adjust different $\alpha$ values and observe the relationship between the shape of the posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two functions and normalizing it, which is the posterior probability $q(x|z=fixed)$ under a fixed z condition. Readers can adjust different values of z to observe how the fluctuation of $q(x)$ affects the shape of the posterior probability $q(x|z)$.

 The posterior $q(x|z)$ under two special states are worth considering.
 <ul>
+<li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, and GaussFun almost becomes a uniform distribution over a very large support set; the result of multiplying $q(x)$ by such a uniform distribution is still $q(x)$. Therefore, $q(x|z)$ for different $z$ almost become identical, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>

 <li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>. The $q(x|z)$ for different $z$ values contract into a series of <em>Dirac delta functions</em> with different offsets equal to $z$. However, there are some exceptions. When there are regions where $q(x)$ is zero, the corresponding $q(x|z)$ will no longer be a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
 </ul>

+There is one point to note. When $\alpha \to 0$, the mean of GaussFun corresponding to larger $z$ values ($\mu = \frac{z}{\sqrt{\alpha}}$) also increases sharply. This means that GaussFun is located farther from the support of $q(x)$. In this case, the "uniformity" of the part of GaussFun corresponding to the support of $q(x)$ will slightly decrease, thereby slightly reducing the similarity between $q(x|z)$ and $q(x)$. However, this effect diminishes further as $\alpha$ decreases. Readers can observe this effect in <a href="#demo_2">Demo 2</a>. Set $\alpha$ to 0.001, and you will see a slight difference between $q(x|z=-2)$ and $q(x)$, but no noticeable difference between $q(x|z=0)$ and $q(x)$.
+
+Regarding the "uniformity" of the Gaussian function, there are two characteristics: the larger the standard deviation, the greater the uniformity; the farther away from the mean, the smaller the uniformity.
 """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
 return
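The normalization view in equations 3.3-3.4 above can be checked numerically on a grid: each q(x|z) is just GaussFun(x; mu=z/sqrt(alpha), sigma=sqrt((1-alpha)/alpha)) multiplied by q(x) and renormalized. A rough illustrative sketch, assuming a made-up grid and toy q(x) (not the repository's code; `posterior_columns` is an assumed name):

```python
import numpy as np

def posterior_columns(x, q_x, z_vals, alpha):
    """Build q(x|z) for each z: Normalize(GaussFun(x) * q(x)), per equations 3.3-3.4."""
    dx = x[1] - x[0]
    mu = z_vals / np.sqrt(alpha)                   # mean of GaussFun for each z
    sigma = np.sqrt((1 - alpha) / alpha)           # standard deviation of GaussFun
    gauss = np.exp(-(x[None, :] - mu[:, None])**2 / (2 * sigma**2))
    cols = gauss * q_x[None, :]                    # GaussFun * q(x)
    cols /= cols.sum(axis=1, keepdims=True) * dx   # normalize each q(x|z) to integrate to 1
    return cols                                    # shape: (len(z_vals), len(x))

x = np.linspace(-3, 3, 601)
q_x = np.exp(-0.5 * ((x - 1) / 0.3)**2) + 0.5 * np.exp(-0.5 * ((x + 1) / 0.2)**2)
q_x /= q_x.sum() * (x[1] - x[0])                   # toy data density
post = posterior_columns(x, q_x, np.array([-1.0, 0.0, 1.0]), alpha=0.95)
print(post.shape, post[0].sum() * (x[1] - x[0]))   # each q(x|z) integrates to ~1
```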
New version of md_forward_process_en():

 q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
 \end{align}

+Comparing the forms of Equation 4.8 and Equation 2.1, it can be found that their forms are completely consistent.
+
+If we only focus on the relationship between the initial and final random variables, then a sequence of t small transforms can be replaced by one large transform, and the $\alpha$ of the large transform is the product of the $\alpha$ of each small transform, because the joint probability distributions corresponding to both types of transforms are the same.
+
+Readers can perform an experiment in <a href="#demo_3_1">Demo 3.1</a> using the same input distribution $q(x)$ but with two different transform methods: 1) using three transformations, each with $\alpha$ equal to 0.95; 2) using a single transform with $\alpha$ set to 0.857375. Perform the transformations separately and then compare the two resulting distributions. You will see that the two distributions are identical.
+
 In the DDPM[\[2\]](#ddpm) paper, the authors used 1000 steps (T=1000) to transform the data distribution $q(x)$ to $q(z_T)$. The probability distribution of $q(z_T|x)$ is as follows:
 \begin{align}
 q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
 \end{align}

+If only considering the joint distribution $q(x, z_T)$, a single transform can also be used as a substitute, which is as follows:
 \begin{align}
 Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
 \end{align}
+It can be seen that, after applying either of the two transforms, the conditional distributions $q(z_T|x)$ are the same. Thus, $q(x,z_T)$ is also the same.
 """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_en")
 return
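The equivalence described above (t small transforms versus one transform with the product of the alphas) is easy to verify by simulation. A minimal sketch, independent of the repository's code and with made-up numbers:

```python
import numpy as np

rng = np.random.default_rng(0)
n = 200_000
x = rng.uniform(-1, 1, n)                 # samples from an arbitrary data distribution

# three small transforms, each with alpha = 0.95
z = x.copy()
for alpha in (0.95, 0.95, 0.95):
    z = np.sqrt(alpha) * z + np.sqrt(1 - alpha) * rng.standard_normal(n)

# one large transform with alpha_bar = 0.95**3 = 0.857375
alpha_bar = 0.95 ** 3
z_single = np.sqrt(alpha_bar) * x + np.sqrt(1 - alpha_bar) * rng.standard_normal(n)

# the two results follow the same distribution (compare mean and variance here)
print(z.mean(), z_single.mean())          # both ~0
print(z.var(), z_single.var())            # both ~ alpha_bar * Var(x) + (1 - alpha_bar)
```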
New version of md_backward_process_en():

 In <a href="#posterior">Section 3</a>, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
 <ul>
 <li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of linear weighted sum are almost the same. In this state, no matter how the input changes, the output of the transformation is always $q(x)$.</li>
+<li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values becomes a series of Dirac delta functions and zero functions. In this state, as long as the <em>support</em> of the input distribution is included in the <em>support</em> of $q(x)$, the output of the transformation will remain the same as the input.</li>
 </ul>

 In <a href="#forward_process">Section 4</a>, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
 \begin{align}
 Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
 \end{align}

+Since $\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. If we constrain the support of $q(x)$ within the unit hypersphere ($\lVert x \rVert_2 < 1$), then for $z_T$ in the range $[-2, +2]$, each corresponding $q(x|z_T)$ is very similar to $q(x)$. In this state, for the posterior transform of $q(x|z_T)$, regardless of the shape of the input distribution, as long as its support set is within the range $[-2,+2]$, the output distribution will be $q(x)$.

+<b>Furthermore, we can conclude that in the DPM model, if the support of $q(x)$ is finite and the signal-to-noise ratio of the final variable $Z_T$ is sufficiently low (the noise is sufficiently large), the process of restoring $q(x)$ can start from any distribution; it doesn't necessarily have to use the standard normal distribution.</b>

 Readers can conduct a similar experiment themselves. In <a href="#demo_3_1">Demo 3.1</a>, set <em>start_alpha</em> to 0.25, <em>end_alpha</em> to 0.25, and <em>step</em> to 7. At this point, $q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061} \epsilon$, which is roughly equivalent to DDPM's $q(z_T)$. Click on <b>apply</b> to perform the forward transform (plotted using $\textcolor{blue}{\text{blue curves}}$), which prepares for the subsequent restoring process. In <a href="#demo_3_2">Demo 3.2</a>, set the <em>noise_ratio</em> to 1, introducing 100% noise into the <em>tail distribution</em> $q(z_7)$. Changing the value of <em>nose_random_seed</em> will change the distribution of noise. Deselect <em>backward_pdf</em> to reduce screen clutter. Click on <b>apply</b> to restore $q(x)$ through the posterior transform. You will see that, no matter what the shape of input $q(z_7)$ may be, the restored $q(x)$ is always exactly the same as the original $q(x)$. The JS Divergence is zero. The restoration process is plotted using a $\textcolor{red}{\text{red curve}}$.
+
+There is another point worth noting. In deep learning tasks, it is common to scale each dimension of the input within the range [-1, 1], which means within a unit hypercube. The maximum Euclidean distance between any two points in a unit hypercube increases with the dimensionality. For example, in one dimension the maximum distance is $2$, in two dimensions $2\sqrt{2}$, in three dimensions $2\sqrt{3}$, and in n dimensions $2\sqrt{n}$. Therefore, for data with higher dimensions, the variable $Z_T$ needs an even lower signal-to-noise ratio (even more noise) to allow the starting distribution of the recovery process to be arbitrary.
 """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
 return
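The restore-from-any-distribution behaviour described above can be reproduced with a small discrete experiment: build the posterior transform as a column-stochastic matrix from equation 3.4 and apply it to an arbitrary input. A hedged sketch with a toy grid and toy distributions (`posterior_transform_matrix` is an assumed helper name, not from the repository):

```python
import numpy as np

def posterior_transform_matrix(x, q_x, alpha):
    """Discrete posterior transform: column j holds q(x | z = x[j]) built from
    equation 3.4, so that q_out = Q @ q_in approximates equation 7.1."""
    sigma = np.sqrt((1 - alpha) / alpha)
    mu = x / np.sqrt(alpha)                           # GaussFun mean for each z
    gauss = np.exp(-(x[:, None] - mu[None, :])**2 / (2 * sigma**2))
    cols = gauss * q_x[:, None]                       # GaussFun * q(x), per column
    return cols / cols.sum(axis=0, keepdims=True)     # column-stochastic matrix

x = np.linspace(-2, 2, 401)
q_x = np.exp(-0.5 * ((x - 0.8) / 0.2)**2) + np.exp(-0.5 * ((x + 0.8) / 0.3)**2)
q_x /= q_x.sum()                                      # toy data distribution (probability vector)

Q = posterior_transform_matrix(x, q_x, alpha=0.001)   # tiny alpha: very heavy noise
q_in = np.ones_like(x) / len(x)                       # arbitrary input distribution
q_out = Q @ q_in
print(np.abs(q_out - q_x).sum())                      # close to 0: one transform already returns ~q(x)
```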
New version of md_posterior_transform_en():

 gr.Markdown(
 r"""
+</br>
+<h3 style="font-size:18px"> Non-expanding Mapping and Stationary Distribution </h3>
 \begin{align}
 q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
 \end{align}
+
+According to Corollary 1 and Corollary 2 in <a href="#non_expanding">Appendix C</a>, the posterior transform is a <b>non-expanding mapping</b>. This means that for any two probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, after the posterior transform, the resulting distributions $q_{o1}(x)$ and $q_{o2}(x)$ will have a distance that is <b>always less than or equal to</b> the distance between $q_{i1}(z)$ and $q_{i2}(z)$. The distance here can be measured using KL Divergence or Total Variance.
 \begin{align}
+d(q_{o1}(x),\ q_{o2}(x)) \le d(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
 \end{align}
+According to the analysis in <a href="#non_expanding">Appendix C</a>, the aforementioned equality does not hold in most cases and the posterior transform becomes a <b>shrinking mapping</b>. Furthermore, <b>the smaller $\alpha$ is (the more noise), the smaller $d(q_{o1},q_{o2})$ will be compared to $d(q_{i1},q_{i2})$</b>.

 Readers can refer to <a href="#demo_4_1">Demo 4.1</a>, where the first three figures present a transform process. The first figure is an arbitrary data distribution $q(x)$, the third figure is the transformed probability distribution, and the second figure is the posterior probability distribution $q(x|z)$. You can change the random seed to generate a new data distribution $q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.

...

 Readers can change the input random seed to toggle different inputs. It can be observed from the figures that $div_{out}$ is always smaller than $div_{in}$ for any input. Additionally, if you change the value of $\alpha$, you will see that the smaller the $\alpha$ (larger noise), the smaller the ratio of $div_{out}/div_{in}$, indicating a larger rate of contraction.

+According to the analysis in <a href="#stationary">Appendix D</a>: the posterior transform can be seen as a one-step jump of a Markov chain, and <b>when $q(x)$ and $\alpha$ meet certain conditions, this Markov chain will converge to a unique stationary distribution</b>. Additionally, numerous experiments have shown that <b>the stationary distribution is very similar to the data distribution $q(x)$, and the smaller $\alpha$ is, the more similar the stationary distribution is to $q(x)$</b>. Specifically, according to the conclusion in <a href="#backward_process">Section 5</a>, <b>when $\alpha \to 0$, after one step of transform, the output distribution will be $q(x)$, so the stationary distribution must be $q(x)$</b>.

 Readers can refer to <a href="#demo_4_2">Demo 4.2</a>, which illustrates an example of applying the posterior transform iteratively. Choose an appropriate number of iterations and click on the <em>Apply</em> button, and the iteration process will be drawn step by step. Each subplot shows the transformed output distribution ($\textcolor{green}{\text{green curve}}$) from each transform, with the reference distribution $q(x)$ expressed as a $\textcolor{blue}{\text{blue curve}}$, as well as the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and will eventually stabilize near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it'll take longer.

...

 \end{align}
 In order to better understand the property of the transform, the matrix $(Q_{x|z})^n$ is also plotted in <a href="#demo_4_2">Demo 4.2</a>. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ will become a constant vector, that is, all components of the vector will be the same, which will appear as a horizontal line in the density plot.

+For a one-dimensional discrete Markov chain, the convergence rate is inversely related to the absolute value of the second largest eigenvalue of the transition probability matrix ($\lvert \lambda_2 \rvert$). The smaller $\lvert \lambda_2 \rvert$ is, the faster the convergence. Numerous experiments have shown that $\alpha$ has a clear linear relationship with $\lvert \lambda_2 \rvert$; the smaller $\alpha$ is, the smaller $\lvert \lambda_2 \rvert$ is. Therefore, <b>the smaller $\alpha$ (the greater the noise), the faster the convergence rate</b>. Specifically, when $\alpha \to 0$, according to the conclusion in <a href="#posterior">Section 3</a>, the posterior probability distributions corresponding to different $z$ tend to be consistent. Additionally, according to Theorem 21 in <a href="#non_neg_lambda">[21]</a>, $\lvert \lambda_2 \rvert$ is smaller than the L1 distance between any two posterior probability distributions corresponding to different $z$, so it can be concluded that $\lvert \lambda_2 \rvert \to 0$.

+</br>
 <h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
+
+From the above analysis, it can be seen that, in most cases, the <b>posterior transform</b> is a shrinking mapping, which means the following relationship:
+
 \begin{align}
+d(q(x),\ q_o(x)) < d(q(z),\ q_i(z)) \tag{7.12}
 \end{align}

+Among them, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q(x) = \int q(x|z) q(z) dz$, $q_i(z)$ is any input distribution, and $q_o(x)$ is the transformed output distribution, $q_o(x) = \int q(x|z) q_i(z) dz$.
+
+The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution $q(x)$ will always be <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution $q(z)$. Hence, <b>the posterior transform naturally possesses a certain ability to resist noise</b>. This means that during the process of restoring $q(x)$ (<a href="#backward_process">Section 5</a>), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the output distribution $q(x)$ will be smaller than the error of the input after undergoing a series of transforms.

 Refer specifically to <a href="#demo_3_2">Demo 3.2</a>, where by increasing the value of the <b>noise ratio</b>, noise can be added to the <em>tail distribution</em> $q(z_T)$. Clicking the "apply" button will gradually draw out the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$, and the error size will be computed by the JS divergence. You will see that the error of the restored $q(x)$ is always less than the error of $q(z_T)$.

+From the above discussion, it can be seen that the smaller $\alpha$ is (the larger the noise used in the transform), the greater the shrinking rate of the shrinking mapping, and correspondingly, the stronger the error resistance capability. Specifically, when $\alpha \to 0$, the noise resistance capability becomes infinite, meaning that regardless of the magnitude of the error in the input, the output will always be $q(x)$.
+
+</br>
+<h3 style="font-size:18px"> Markov Chain Monte Carlo Sampling</h3>

+In DPM models, sampling is typically performed using <b>Ancestral Sampling</b>. From the analysis above, it can be inferred that when $\alpha$ is sufficiently small, the posterior transform will converge to $q(x)$. Therefore, sampling can also be conducted using <b>Markov Chain Monte Carlo</b> (MCMC) methods, as depicted in Figure 7.1. In the figure, $\alpha$ represents a posterior transform with relatively large noise, where larger noise makes the steady-state distribution closer to the data distribution $q(x)$. However, as discussed in <a href="#posterior">Section 3</a>, posterior transforms with larger noise are less favorable for fitting. Therefore, a transform with larger noise is split into multiple transforms with smaller noise.
+
+<center> <img src="file/7.1.png" width="1024" style="margin-top:12px"/> </center>
+<center> Figure 7.1: Markov Chain Monte Carlo Sampling</center>
+
 """, latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
 return
403 |
gr.Markdown(
|
404 |
r"""
|
405 |
This section mainly introduces the relationship between <b>KL divergence</b> and <b>conditional KL divergence</b>. Before the formal introduction, we will briefly introduce the definitions of <b>Entropy</b> and <b>Conditional Entropy</b>, as well as the inequality relationship between them, in preparation for the subsequent proof.
|
406 |
+
|
407 |
+
</br>
|
408 |
<h3 style="font-size:20px">Entropy and Conditional Entropy</h3>
|
409 |
For any two random variables $Z, X$, the <b>Entropy</b> is defined as follows<a href="#entropy">[16]</a>:
|
410 |
\begin{align}
|
|
|
419 |
\mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
|
420 |
\end{align}
|
421 |
That is to say, <b>the Conditional Entropy is always less than or equal to the Entropy</b>, and they are equal only when $X$ and $Z$ are independent. The proof of this relationship can be found in <a href="#cond_entropy">[17]</a>.
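As a quick numerical illustration of inequality A.3 (a hypothetical random discrete joint distribution is assumed here), one can draw a joint distribution and compare the two entropies directly:

```python
import numpy as np

rng = np.random.default_rng(0)
q_xz = rng.random((4, 5)); q_xz /= q_xz.sum()         # random joint distribution q(x, z)
q_x = q_xz.sum(axis=1, keepdims=True)                 # marginal q(x)
q_z = q_xz.sum(axis=0)                                # marginal q(z)

H_Z = -np.sum(q_z * np.log(q_z))                      # Entropy H(Z)
H_Z_given_X = -np.sum(q_xz * np.log(q_xz / q_x))      # Conditional Entropy H(Z|X)
print(H_Z_given_X, "<=", H_Z)                         # inequality A.3
```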
|
422 |
+
|
423 |
+
</br>
|
424 |
<h3 style="font-size:20px">KL Divergence and Conditional KL Divergence</h3>
|
425 |
In the same manner as the definition of Conditional Entropy, we introduce a new definition, <b>Conditional KL Divergence</b>, denoted as $KL_{\mathcal{C}}$. Since KL Divergence is non-symmetric, there exist two forms as follows.
|
426 |
\begin{align}
|
|
|
469 |
Another <b>important conclusion</b> can be drawn from equation A.15.
|
470 |
|
471 |
The KL Divergence is often used to fit the distribution of data. In this scenario, the distribution of the data is denoted by $q(z)$ and the parameterized model distribution is denoted by $\textcolor{blue}{p_\theta(z)}$. During the optimization process, since both $q(z|x)$ and $q(x)$ remain constant, the term $\mathbf{H}(Z) - \mathbf{H}(Z|X)$ in Equation A.15 is a constant. Thus, the following relationship is obtained:
|
472 |
+
<span id="en_cond_kl_2">
|
473 |
\mathop{\min}_{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}_{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
|
474 |
</span>
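The equivalence in A.25 rests on the fact that the two objectives differ only by the constant $\mathbf{H}(Z) - \mathbf{H}(Z|X)$. A small numerical sketch with a hypothetical discrete $q(x,z)$ and two arbitrary candidate $\textcolor{blue}{p_\theta(z)}$ makes this concrete:

```python
import numpy as np

rng = np.random.default_rng(0)
q_xz = rng.random((6, 5)); q_xz /= q_xz.sum()         # hypothetical joint q(x, z)
q_x = q_xz.sum(axis=1)                                # q(x)
q_z = q_xz.sum(axis=0)                                # q(z)
q_z_given_x = q_xz / q_x[:, None]                     # q(z|x)

def kl(a, b):
    return float(np.sum(a * np.log(a / b)))

for _ in range(2):                                    # two different candidate p(z)
    p_z = rng.random(5); p_z /= p_z.sum()
    lhs = kl(q_z, p_z)                                # KL(q(z) || p(z))
    rhs = sum(q_x[i] * kl(q_z_given_x[i], p_z) for i in range(6))
    print(f"lhs={lhs:.4f}  rhs={rhs:.4f}  rhs-lhs={rhs - lhs:.4f}")   # difference is the same constant for any p(z)
```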
|
475 |
|
476 |
Comparing the above relationship with <b>Denoising Score Matching</b> <a href="#dsm">[18]</a> (Equation A.26), some similarities can be observed. Both introduce a new variable $X$ and substitute the target distribution $q(z)$ with $q(z|x)$. After the substitution, since $q(z|x)$ is a conditional probability distribution, both consider all conditions and perform a weighted sum, using the probability of each condition occurring, $q(x)$, as the weight coefficient.
|
477 |
+
<span id="en_cond_kl_3">
|
478 |
\mathop{\min}_{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.26}
|
479 |
</span>
|
480 |
|
|
|
486 |
return
|
487 |
|
488 |
|
489 |
+
def md_approx_gauss_en():
|
490 |
global g_latex_del
|
491 |
|
492 |
+
title = "Appendix B When does the Posterior Approximate to Gaussian ?"
|
493 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss"):
|
494 |
gr.Markdown(
|
495 |
r"""
|
496 |
+
From equation 3.4, it can be seen that $q(x|z)$ takes the following form:
|
497 |
+
\begin{align}
|
498 |
+
q(x|z) &= \operatorname{Normalize} \Big(\ \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x)\ \Big)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{B.1} \newline
|
499 |
+
&\propto \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x) \tag{B.2}
|
500 |
+
\end{align}
|
501 |
+
|
502 |
+
Below we will prove that if the following two assumptions are satisfied, $q(x|z)$ approximates a Gaussian distribution.
|
503 |
+
<ul>
|
504 |
+
<li>
|
505 |
+
Assume that within the support of GaussFun, $q(x)$ changes linearly. Expanding $q(x)$ around the mean of GaussFun with a Taylor series, and using the properties of Taylor expansion, this assumption can be satisfied when the standard deviation $\sigma$ of GaussFun is sufficiently small.
|
506 |
+
\begin{align}
|
507 |
+
q(x) &\approx q(\mu) + \nabla_xq(\mu)(x-\mu)& \quad &\text{where}\quad q(\mu)\triangleq q(x)\bigg|_{x=\mu} \quad \nabla_xq(\mu)\triangleq \nabla_xq(x)\bigg|_{x=\mu} \tag{B.3} \newline
|
508 |
+
&= q(\mu)\big(1+ \frac{\nabla_xq(\mu)}{q(\mu)}(x-\mu)\big)& \tag{B.4} \newline
|
509 |
+
&= q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)& \quad &\text{where}\quad \nabla_x\log{q(\mu)}\triangleq \nabla_x\log{q(x)}\bigg|_{x=\mu} \tag{B.5}
|
510 |
+
\end{align}
|
511 |
+
</li>
|
512 |
+
<li>
|
513 |
+
Assume that within the support of GaussFun, $\log\big(1+\nabla_x\log{q(\mu)}(x-\mu)\big)$ can be approximated by $\nabla_x\log{q(\mu)}(x-\mu)$. Expanding $\log(1+y)$ with a Taylor series, and using the properties of Taylor expansion, $\log(1+y)$ can be approximated by $y$ when $\lVert y\rVert_2$ is small. When $\sigma$ is sufficiently small, $\lVert x-\mu\rVert_2$ will be small, and $\nabla_x\log{q(\mu)}(x-\mu)$ will also be small, hence the above assumption can be satisfied. Generally, when $\nabla_x\log{q(\mu)}(x-\mu)<0.1$, the approximation error is small enough to be negligible.
|
514 |
+
\begin{align}
|
515 |
+
\log(1+y) &\approx \log(1+y)\bigg|_{y=0} + \nabla_y\log(1+y)\bigg|_{y=0}(y-0) \tag{B.6} \newline
|
516 |
+
&= y \tag{B.7}
|
517 |
+
\end{align}
|
518 |
+
</li>
|
519 |
+
</ul>
|
520 |
+
Using the above two assumptions, $q(x|z)$ can be transformed into the following form:
|
521 |
+
|
522 |
+
\begin{align}
|
523 |
+
q(x|z) &\propto \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x) \tag{B.8} \newline
|
524 |
+
&\approx \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big) \tag{B.9} \newline
|
525 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\log\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)\right) \tag{B.10} \newline
|
526 |
+
&\approx \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\nabla_x\log{q(\mu)}(x-\mu)\right) \tag{B.11} \newline
|
527 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2-2\sigma^2\nabla_x\log{q(\mu)}(x-\mu)}{2\sigma^2}\right) \tag{B.12} \newline
|
528 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}+\frac{\big(\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \tag{B.13} \newline
|
529 |
+
&= \exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \underbrace{\frac{q(\mu)}{\sqrt{2\pi}\sigma} \exp\left( \frac{1}{2}\big(\sigma\nabla_x\log{q(\mu)}\big)^2\right)}_{\text{const}} \tag{B.14}
|
530 |
+
\end{align}
|
531 |
+
|
532 |
+
Here, Equation B.9 applies the conclusion of Assumption 1, and Equation B.11 applies the conclusion of Assumption 2.
|
533 |
+
|
534 |
+
The <em>const term</em> in Equation B.14 is a constant and does not affect the shape of the function. Additionally, as can be seen from Equation B.1, $q(x|z)$ is self-normalizing. Therefore, $q(x|z)$ is a Gaussian probability density function with mean $\mu + \sigma^2 \nabla_x \log{q(\mu)}$ and variance $\sigma^2$.
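The sketch below checks this conclusion numerically for a smooth toy $q(x)$ (the specific $q(x)$, grid, $z$ and $\alpha$ are illustrative assumptions): it compares the normalized $q(x|z)$ of Equation B.1 with the Gaussian of mean $\mu + \sigma^2 \nabla_x \log{q(\mu)}$ and variance $\sigma^2$.

```python
import numpy as np

x = np.linspace(-3.0, 3.0, 4001)
dx = x[1] - x[0]
q_x = np.exp(-0.5 * (x / 0.8) ** 2) * (1.2 + np.sin(1.5 * x))    # smooth, strictly positive q(x)
q_x /= q_x.sum() * dx

alpha = 0.99                                           # small sigma, so both assumptions hold
sigma = np.sqrt((1 - alpha) / alpha)
z = 0.5
mu = z / np.sqrt(alpha)

post = np.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) * q_x
post /= post.sum() * dx                                # true q(x|z), normalized on the grid

grad_log_q = np.gradient(np.log(q_x), dx)
mu_hat = mu + sigma ** 2 * np.interp(mu, x, grad_log_q)           # mean predicted by Equation B.14
approx = np.exp(-(x - mu_hat) ** 2 / (2 * sigma ** 2))
approx /= approx.sum() * dx

print("max |q(x|z) - Gaussian approximation| =", np.abs(post - approx).max())
```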
|
535 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_approx_gauss_en")
|
536 |
+
|
537 |
+
return
|
538 |
+
|
539 |
+
|
540 |
+
def md_non_expanding_en():
|
541 |
+
global g_latex_del
|
542 |
+
|
543 |
+
title = "Appendix C Posterior Transform is a Non-expanding Mapping"
|
544 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding"):
|
545 |
+
gr.Markdown(
|
546 |
+
r"""
|
547 |
+
<b>Corollary 1</b>
|
548 |
+
|
549 |
+
Using KL Divergence as a metric, the transition transform of a Markov chain is non-expanding<a href="#elem">[23]</a>, which means
|
550 |
+
\begin{align}
|
551 |
+
KL\big(p(x), q(x)\big) &\le KL\big(p(z), q(z)\big) \tag{C.1} \newline
|
552 |
+
\end{align}
|
553 |
+
Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of the Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z)q(z)dz$.
|
554 |
+
|
555 |
+
Proof:
|
556 |
+
|
557 |
+
For the KL divergence of $p(x,z)$ and $q(x,z)$, the following relationship exists:
|
558 |
+
\begin{align}
|
559 |
+
KL\big(p(x,z), q(x,z)\big) &= \iint p(x,z)\log \frac{p(x,z)}{q(x,z)}dxdz \tag{C.2} \newline
|
560 |
+
& = \iint p(x,z)\log \frac{p(z)p(x|z)}{q(z)q(x|z)}dxdz \tag{C.3} \newline
|
561 |
+
&= \iint p(x,z)\log \frac{p(z)}{q(z)}dxdz + \iint p(x,z) \log\frac{p(x|z)}{q(x|z)} dxdz \tag{C.4} \newline
|
562 |
+
&= \int \int p(x,z) dx\ \log \frac{p(z)}{q(z)}dz + \int p(z)\int p(x|z) \log\frac{p(x|z)}{q(x|z)} dx\ dz \tag{C.5} \newline
|
563 |
+
&= KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.6} \newline
|
564 |
+
\end{align}
|
565 |
+
|
566 |
+
Similarly, by swapping the order of $Z$ and $X$, the following relationship can be obtained:
|
567 |
+
\begin{align}
|
568 |
+
KL\big(p(x,z), q(x,z)\big) &= KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.7}
|
569 |
+
\end{align}
|
570 |
+
|
571 |
+
Comparing the two equations, we can obtain:
|
572 |
+
\begin{align}
|
573 |
+
KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx = KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.8}
|
574 |
+
\end{align}
|
575 |
+
|
576 |
+
Since $q(x|z)$ and $p(x|z)$ are both transition probability densities of the Markov chain, equal to $r(x|z)$, the integral $\int p(z) KL\big(p(x|z), q(x|z)\big)dz$ equals 0. Therefore, the above equation simplifies to:
|
577 |
+
\begin{align}
|
578 |
+
KL\big(p(x), q(x)\big) = KL\big(p(z), q(z)\big) - \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.9}
|
579 |
+
\end{align}
|
580 |
+
|
581 |
+
Since KL divergence is always greater than or equal to 0, the weighted sum $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ is also greater than or equal to 0. Therefore, we can conclude:
|
582 |
+
\begin{align}
|
583 |
+
KL\big(p(x), q(x)\big) \le KL\big(p(z), q(z)\big) \tag{C.10}
|
584 |
+
\end{align}
|
585 |
+
|
586 |
+
</br>
|
587 |
+
|
588 |
+
The condition for equality to hold in the above relation is that $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ equals 0, which requires that $p(z|x)$ and $q(z|x)$ be equal for every condition $x$. In most cases, when $p(z)$ and $q(z)$ are different, $p(z|x)$ and $q(z|x)$ are also different. This means that in most cases, we have
|
589 |
+
\begin{align}
|
590 |
+
KL\big(p(x), q(x)\big) < KL\big(p(z), q(z)\big) \tag{C.11}
|
591 |
+
\end{align}
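A small numerical check of Corollary 1 follows (a hypothetical random transition matrix and two random input distributions are assumed):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 8
R = rng.random((n, n)); R /= R.sum(axis=0, keepdims=True)    # column-stochastic r(x|z)
p_z = rng.random(n); p_z /= p_z.sum()
q_z = rng.random(n); q_z /= q_z.sum()
p_x, q_x = R @ p_z, R @ q_z                                  # transformed distributions

kl = lambda a, b: float(np.sum(a * np.log(a / b)))
print(kl(p_x, q_x), "<=", kl(p_z, q_z))                      # inequality C.10
```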
|
592 |
+
|
593 |
+
</br></br>
|
594 |
+
<b>Corollary 2</b>
|
595 |
+
|
596 |
+
Using Total Variation (L1 distance) as a metric, the transition transform of a Markov chain is non-expanding, which means
|
597 |
+
\begin{align}
|
598 |
+
\left\lVert p(x)-q(x) \right\rVert_1\ &\le\ \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.12}
|
599 |
+
\end{align}
|
600 |
+
|
601 |
+
Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of a Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z) q(z) dz$.
|
602 |
+
|
603 |
+
Proof:
|
604 |
+
\begin{align}
|
605 |
+
\left\lVert p(x)-q(x) \right\rVert_1\ &= \int \big\lvert p(x) - q(x) \big\rvert dx \tag{C.13} \newline
|
606 |
+
&= \int \left\lvert \int r(x|z) p(z) dz - \int r(x|z)q(z)dz \right\rvert dx \tag{C.14} \newline
|
607 |
+
&= \int \left\lvert \int r(x|z) \big(p(z)-q(z)\big) dz \right\rvert dx \tag{C.15} \newline
|
608 |
+
&\le \int \int r(x|z) \left\lvert \big(p(z)-q(z)\big) \right\rvert dz dx \tag{C.16} \newline
|
609 |
+
&= \int \int r(x|z)dx \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.17} \newline
|
610 |
+
&= \int \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.18} \newline
|
611 |
+
&= \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.19}
|
612 |
+
\end{align}
|
613 |
+
|
614 |
+
Here, Equation C.16 applies the absolute value inequality, while Equation C.18 uses the fact that $r(x|z)$ is a probability distribution.
|
615 |
+
|
616 |
+
Proof completed.
|
617 |
+
|
618 |
+
</br>
|
619 |
+
|
620 |
+
Figure C.1 shows an example with a one-dimensional random variable, which may help in understanding the derivation above.
|
621 |
|
622 |
+
The condition for equality to hold is that all non-zero terms inside each absolute value bracket have the same sign. As shown in Figure C.1(a), there are five absolute value brackets, each corresponding to a row, with five terms in each bracket. Equality holds if and only if all non-zero terms in each row have the same sign. If different signs occur, this leads to $\lVert p(x)-q(x) \rVert_1\ <\ \lVert p(z) - q(z) \rVert_1$. The number of sign disagreements is related to the number of nonzero elements of the transition probability matrix; in general, the more nonzero elements there are, the more sign disagreements there will be.
|
623 |
+
|
624 |
+
For the posterior transform, generally, when $\alpha$ decreases (more noise), the transition probability density function will have more nonzero elements, as shown in Figure C.2(a); whereas when $\alpha$ increases (less noise), the transition probability density function will have fewer nonzero elements, as shown in Figure C.2(b).
|
|
|
|
|
|
|
|
|
625 |
|
626 |
+
So there is a pattern: <b>as $\alpha$ decreases, $\lVert p(x)-q(x) \rVert_1$ becomes smaller relative to $\lVert p(z) - q(z) \rVert_1$, which means the shrinking rate of the posterior transform becomes larger.</b>
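The sketch below illustrates this pattern (the toy $q(x)$, the grid and the two $\alpha$ values are illustrative assumptions): it builds the discretized posterior matrix for a small and a large $\alpha$ and prints the ratio $\lVert p(x)-q(x)\rVert_1 / \lVert p(z)-q(z)\rVert_1$ for the same pair of random inputs.

```python
import numpy as np

def posterior_matrix(q_x, x, alpha):
    # Discretized q(x|z): column j holds q(x | z = x_j), normalized over x.
    sigma = np.sqrt((1 - alpha) / alpha)
    gauss = np.exp(-(x[:, None] - x[None, :] / np.sqrt(alpha)) ** 2 / (2 * sigma ** 2))
    m = gauss * q_x[:, None]
    return m / m.sum(axis=0, keepdims=True)

x = np.linspace(-2.0, 2.0, 201)
q_data = np.exp(-0.5 * ((x + 1.0) / 0.3) ** 2) + 0.6 * np.exp(-0.5 * ((x - 0.8) / 0.4) ** 2)
q_data /= q_data.sum()                                        # toy data distribution

rng = np.random.default_rng(0)
p_z = rng.random(x.size); p_z /= p_z.sum()
q_z = rng.random(x.size); q_z /= q_z.sum()

for alpha in (0.3, 0.9):
    Q = posterior_matrix(q_data, x, alpha)
    ratio = np.abs(Q @ p_z - Q @ q_z).sum() / np.abs(p_z - q_z).sum()
    print(f"alpha={alpha}:  shrink ratio = {ratio:.3f}")      # smaller ratio for the smaller alpha
```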
|
|
|
627 |
|
628 |
+
<center> <img src="file/C1.png" width="1024" style="margin-top:12px"/> </center>
|
629 |
+
<center> Figure C.1: Non-expanding under L1 norm </center>
|
630 |
+
</br>
|
631 |
+
<center> <img src="file/C2.png" width="568" style="margin-top:12px"/> </center>
|
632 |
+
<center> Figure C.2: More non-zero elements as $\alpha$ gets smaller </center>
|
633 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_non_expanding_en")
|
634 |
+
|
635 |
+
return
|
636 |
+
|
637 |
+
|
638 |
+
def md_stationary_en():
|
639 |
+
global g_latex_del
|
640 |
+
|
641 |
+
title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
|
642 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary"):
|
643 |
+
gr.Markdown(
|
644 |
+
r"""
|
645 |
+
According to the conclusion of Theorem 3 in <a href="#mc_basic_t3">[19]</a>, <b>an aperiodic and irreducible Markov chain will converge to a unique stationary distribution</b>.
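This conclusion is easy to visualize numerically. The sketch below (with a hypothetical strictly positive transition matrix, which is automatically aperiodic and irreducible) iterates the same transform from two very different initial distributions and shows that both reach the same stationary distribution:

```python
import numpy as np

rng = np.random.default_rng(0)
n = 6
P = rng.random((n, n)); P /= P.sum(axis=0, keepdims=True)    # strictly positive, column-stochastic
a = np.eye(n)[0]                                             # a Dirac-like initial distribution
b = np.ones(n) / n                                           # a uniform initial distribution
for _ in range(200):
    a, b = P @ a, P @ b                                      # repeated transition transforms
print(np.allclose(a, b))                                     # True: both converge to the same stationary distribution
```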
|
646 |
+
|
647 |
+
The following will show that under certain conditions, the posterior transform is the transition probability density function of an <b>aperiodic and irreducible Markov chain</b>.
|
648 |
+
|
649 |
+
For convenience, the forward transform of the diffusion model is described below in a more general form.
|
650 |
+
\begin{align}
|
651 |
+
Z = \sqrt{\alpha}X + \sqrt{\beta}\ \epsilon \tag{D.1} \newline
|
652 |
+
\end{align}
|
653 |
+
|
654 |
+
As described in <a href="#transform">Section 1</a>, $\sqrt{\alpha}X$ narrows the probability density function of $X$, so $\alpha$ controls the narrowing intensity, while $\beta$ controls the amount of noise added (smoothing). When $\beta = 1 - \alpha$, the above transform is consistent with Equation 1.1.
|
655 |
|
656 |
+
The form of the posterior probability distribution corresponding to the new transformation is as follows:
|
657 |
+
\begin{align}
|
658 |
+
q(x|z=c) = \operatorname{Normalize} \Big(\ \overbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}^{\text{GaussFun}}\ q(x)\ \Big) \tag{D.2} \newline
|
659 |
+
\text{where}\ \mu=\frac{c}{\sqrt{\alpha}}\qquad \sigma=\sqrt{\frac{\beta}{\alpha}} \qquad \text{$c$ is a fixed value} \notag
|
660 |
+
\end{align}
|
661 |
+
|
662 |
+
When $\beta = 1 - \alpha$, the above transform is consistent with Equation 3.4.
|
663 |
+
|
664 |
+
For convenience, let $g(x)$ represent GaussFun in Equation D.2.
|
665 |
+
|
666 |
+
Since $\sqrt{\alpha}X$ narrows the probability density function $q(x)$ of $X$, this makes the analysis of the aperiodicity and irreducibility of the transition probability density function $q(x|z)$ more complex. Therefore, for the sake of analysis, we first assume $\alpha = 1$ and later analyze the case when $\alpha \neq 1$ and $\beta = 1 - \alpha$.
|
667 |
+
|
668 |
+
<center> <img src="file/D1.png" width="960" style="margin-top:12px"/> </center>
|
669 |
+
<center> Figure D.1: Only one component in support </center>
|
670 |
+
|
671 |
+
<center> <img src="file/D2.png" width="960" style="margin-top:12px"/> </center>
|
672 |
+
<center> Figure D.2: Multiple components which can communicate with each other </center>
|
673 |
+
|
674 |
+
</br>
|
675 |
+
<h3 style="font-size:24px"> $\alpha=1$ </h3>
|
676 |
+
|
677 |
+
When $\alpha=1$, if $q(x)$ and $\beta$ satisfy either of the following two conditions, the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.
|
678 |
+
|
679 |
+
<ol style="list-style-type:decimal">
|
680 |
+
<li>If the support of $q(x)$ contains only one connected component.</li>
|
681 |
+
<li>If the support of $q(x)$ has multiple connected components, but the distance between each connected component is less than $3$ times $\sigma$. In other words, the gaps can be covered by the radius of the effective region of $g(x)$.</li>
|
682 |
</ol>
|
683 |
+
|
684 |
+
Proof:
|
685 |
+
|
686 |
+
<ol style="list-style-type:decimal">
|
687 |
+
<li>
|
688 |
+
For any point $c$ in the support of $q(x)$, when $z=c$ and $x=c$, $q(x=c)>0$; from Equation D.2, we know that the center of $g(x)$ is located at $c$, so $g(x)$ is also greater than 0 at $x=c$. Therefore, according to the multiplicative relationship in Equation D.2, $q(x=c|z=c)>0$. Hence, the Markov chain corresponding to $q(x|z)$ is aperiodic.
|
689 |
+
|
690 |
+
For any point $c$ in the support of $q(x)$, when $z=c$, the center of $g(x)$ is located at $c$, so there exists a hypersphere with $c$ as its center ($\lVert x-c\rVert_2 < \delta$). Within this hypersphere, $q(x|z=c)>0$, which means that state $c$ can access nearby states. Since every state in the support has this property, all states within the entire support form a $\textcolor{red}{\text{Communicate Class}}$ <a href="#mc_basic_d4">[14]</a>. Therefore, the Markov chain corresponding to $q(x|z)$ is irreducible.
|
691 |
|
692 |
+
Therefore, a Markov chain that satisfies condition 1 is aperiodic and irreducible. See the example in Figure D.1, which illustrates a single connected component.
|
|
|
|
|
693 |
</li>
|
694 |
+
|
695 |
+
<li>
|
696 |
+
When the support set of $q(x)$ has multiple connected components, the Markov chain may have multiple communicate classes. However, if the gaps between components are smaller than $3\sigma$ (the standard deviation of $g(x)$), the states of different components can access each other. Thus, the Markov chain corresponding to $q(x|z)$ will have only one communicate class, similar to the case in condition 1. Therefore, a Markov chain that satisfies condition 2 is aperiodic and irreducible.
|
697 |
|
698 |
+
In Figure D.2, an example of multiple connected components is shown.
|
699 |
+
</li>
|
|
|
700 |
</ol>
|
701 |
+
|
702 |
+
<center> <img src="file/D3.png" width="960" style="margin-top:12px"/> </center>
|
703 |
+
<center> Figure D.3: Two components which <b>cannot</b> communicate with each other </center>
|
704 |
+
|
705 |
+
</br>
|
706 |
+
<h3 style="font-size:24px"> $\alpha \neq 1$ </h3>
|
707 |
+
|
708 |
+
When $\alpha \neq 1$, for any point $c$ within the support of $q(x)$, it follows from Equation D.2 that the center of $g(x)$ is no longer $c$ but rather $\frac{c}{\sqrt{\alpha}}$. That is to say, the center of $g(x)$ deviates from $c$, with the deviation distance being $\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})$. It can be observed that the larger $\lVert c\rVert$ is, the greater the deviation. See the examples in Figures D.4(c) and D.4(d) for specifics. In Figure D.4(d), when $z=2.0$, the center of $g(x)$ is noticeably offset from $x=2.0$. This phenomenon is referred to in this article as <b>the Center Deviation Phenomenon</b>.
|
709 |
|
710 |
+
The <b>Center Deviation Phenomenon</b> will affect the properties of some states in the Markov chain.
|
711 |
+
|
712 |
+
When the deviation distance is significantly greater than $3\sigma$, $g(x)$ may be zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ may also be zero, and $q(x|z=c)$ in the vicinity of $x = c$ may also be zero. Therefore, state $c$ may not be able to access nearby states and may be periodic. This is different from the case when $\alpha=1$. Refer to the example in Figure D.5: the $\textcolor{green}{\text{green curve}}$ represents $g(x)$ for $z=6.0$, and the $\textcolor{orange}{\text{orange curve}}$ represents $q(x|z=6.0)$. Because the center of $g(x)$ deviates too much from $x=6.0$, $q(x=6.0|z=6.0)=0$.
|
713 |
|
714 |
+
When the deviation distance is significantly less than $3\sigma$, $g(x)$ is non-zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ will not be zero, and $q(x|z=c)$ in the vicinity of $x = c$ will also not be zero. Therefore, state $c$ can access nearby states and is aperiodic.
|
|
|
715 |
|
716 |
+
Under what conditions for $c$ will the deviation distance of the center of $g(x)$ be less than $3\sigma$?
|
717 |
+
|
718 |
\begin{align}
|
719 |
+
\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})\ <\ 3\frac{\sqrt{\beta}}{\sqrt{\alpha}} \qquad \Rightarrow \qquad \lVert c\rVert \ <\ 3\frac{\sqrt{\beta}}{1-\sqrt{\alpha}} \tag{D.3} \newline
|
720 |
\end{align}
|
721 |
+
|
722 |
+
From the above, it is known that there exists an upper limit such that as long as $\lVert c\rVert$ is less than this upper limit, the deviation amount will be less than $3\sigma$.
|
723 |
+
|
724 |
+
When $\beta=1-\alpha$, the above expression becomes
|
725 |
+
\begin{align}
|
726 |
+
\lVert c\rVert \ <\ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} \tag{D.4} \newline
|
727 |
+
\end{align}
|
728 |
+
|
729 |
+
$3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ is strictly monotonically increasing with respect to $\alpha$.
|
730 |
+
|
731 |
+
When $\alpha \in (0, 1)$,
|
732 |
\begin{align}
|
733 |
+
3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} > 3 \tag{D.5} \newline
|
|
|
|
|
|
|
|
|
|
|
734 |
\end{align}
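The sketch below prints this upper limit for a few values of $\alpha$ (the chosen values are arbitrary); consistent with the discussion above, the limit is always larger than 3 and grows as $\alpha$ increases:

```python
import numpy as np

for alpha in (0.1, 0.5, 0.9, 0.99, 0.999):
    bound = 3 * np.sqrt(1 - alpha) / (1 - np.sqrt(alpha))    # upper limit of ||c|| in Equation D.4
    print(f"alpha={alpha}:  ||c|| must be less than {bound:.1f}")
```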
|
735 |
+
|
736 |
+
Based on the analysis above, the following conclusions can be drawn
|
737 |
+
|
738 |
+
<ol style="list-style-type:decimal">
|
739 |
+
<li>
|
740 |
+
<b>If the support of $q(x)$ contains only one connected component, and the points of the support set are all within a distance less than $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ from the origin, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
|
741 |
+
</li>
|
742 |
+
|
743 |
+
<li>
|
744 |
+
If the support of $q(x)$ contains multiple connected components, accurately determining whether two components can access each other becomes more complex due to the Center Deviation Phenomenon of $g(x)$. We will not delve into further analysis here, but only give a conservative conclusion: <b>If the points of the support are all within a distance less than $1$ from the origin, and the gaps between the connected components are all less than $2\sigma$, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
|
745 |
+
</li>
|
746 |
+
</ol>
|
747 |
+
|
748 |
+
<center> <img src="file/D4.png" width="1280" style="margin-top:12px"/> </center>
|
749 |
+
<center> Figure D.4: Center Deviation of the GaussFun </center>
|
750 |
+
</br>
|
751 |
+
<center> <img src="file/D5.png" width="568" style="margin-top:12px"/> </center>
|
752 |
+
<center> Figure D.5: Deviation is More Than $3\sigma$ </center>
|
753 |
+
|
754 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_stationary_en")
|
755 |
+
|
756 |
return
|
757 |
|
758 |
|
|
|
760 |
global g_latex_del
|
761 |
|
762 |
with gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference"):
|
|
|
763 |
gr.Markdown(
|
764 |
r"""
|
765 |
<a id="dpm" href="https://arxiv.org/abs/1503.03585"> [1] Deep Unsupervised Learning Using Nonequilibrium Thermodynami </a>
|
|
|
777 |
<a id="mc_limit" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Book%3A_Introductory_Probability_(Grinstead_and_Snell)/11%3A_Markov_Chains/11.04%3A_Fundamental_Limit_Theorem_for_Regular_Chains"> [7] Fundamental Limit Theorem for Regular Chains </a>
|
778 |
|
779 |
<a id="mc_basic_p6" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [8] Markov Chain:Basic Theory - Proposition 6 </a>
|
780 |
+
|
781 |
<a id="fp_converse" href="https://arxiv.org/abs/1702.07339"> [9] A Converse to Banach's Fixed Point Theorem and its CLS Completeness </a>
|
782 |
|
783 |
<a id="ce_kl" href="https://en.wikipedia.org/wiki/Cross-entropy#Cross-entropy_minimization"> [10] Cross-entropy minimization </a>
|
784 |
+
|
785 |
<a id="deconv_1" href="https://thewolfsound.com/deconvolution-inverse-convolution/"> [11] Deconvolution Using Frequency-Domain Division </a>
|
786 |
+
|
787 |
<a id="deconv_2" href="https://www.strollswithmydog.com/deconvolution-by-division-in-the-frequency-domain/"> [12] deconvolution-by-division-in-the-frequency-domain </a>
|
788 |
+
|
789 |
<a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
|
790 |
+
|
791 |
<a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
|
792 |
+
|
793 |
<a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
|
794 |
+
|
795 |
+
<a id="entropy" href="https://en.wikipedia.org/wiki/Entropy"> [16] Entropy </a>
|
796 |
+
|
797 |
+
<a id="cond_entropy" href="https://en.wikipedia.org/wiki/Conditional_entropy"> [17] Conditional Entropy </a>
|
798 |
+
|
799 |
+
<a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
|
800 |
+
|
801 |
+
<a id="mc_basic_t3" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [19] Markov Chain:Basic Theory - Theorem 3 </a>
|
802 |
+
|
803 |
+
<a id="mc_mt_lambda" href="https://pages.uoregon.edu/dlevin/MARKOV/markovmixing.pdf"> [20] Markov Chains and Mixing Times, second edition - 12.2 The Relaxation Time </a>
|
804 |
+
|
805 |
+
<a id="non_neg_lambda" href="https://link.springer.com/book/10.1007/0-387-32792-4"> [21] Non-negative Matrices and Markov Chains - Theorem 2.10 </a>
|
806 |
+
|
807 |
+
<a id="prml_mcmc" href="https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf"> [22] Pattern Recognition and Machine Learning - 11.2. Markov Chain Monte Carlo </a>
|
808 |
+
|
809 |
+
<a id="elem" href="https://cs-114.org/wp-content/uploads/2015/01/Elements_of_Information_Theory_Elements.pdf"> [23] Elements_of_Information_Theory_Elements - 2.9 The Second Law of Thermodynamics </a>
|
810 |
+
|
811 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
|
812 |
|
813 |
return
|
|
|
854 |
|
855 |
md_cond_kl_en()
|
856 |
|
857 |
+
md_approx_gauss_en()
|
858 |
+
|
859 |
+
md_non_expanding_en()
|
860 |
+
|
861 |
+
md_stationary_en()
|
862 |
|
863 |
md_reference_en()
|
864 |
|
RenderMarkdownZh.py
CHANGED
@@ -78,31 +78,45 @@ def md_posterior_zh():
|
|
78 |
q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
|
79 |
\end{align}
|
80 |
|
81 |
-
当$z$是取固定值时,$q(z)$是常数,所以$q(x|z)
|
82 |
\begin{align}
|
83 |
-
q(x|z)
|
84 |
\end{align}
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
由式2.1可知,$q(z|x)$为高斯分布,于是有
|
86 |
\begin{align}
|
87 |
-
q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad
|
88 |
-
&=
|
|
|
89 |
\end{align}
|
90 |
|
91 |
-
可以看出,<b>GaussFun</b>部分是关于$x$的高斯函数,均值为$\frac{z}{\sqrt{\alpha}}
|
92 |
|
93 |
根据”乘法“的特点,可以总结$q(x|z)$函数形状具有的特点。
|
94 |
<ul>
|
95 |
-
<li>
|
|
|
96 |
<li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
|
97 |
</ul>
|
98 |
|
|
|
|
|
99 |
具体可看<a href="#demo_2">Demo 2</a>,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
|
100 |
|
101 |
两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
|
102 |
<ul>
|
103 |
-
<li> 当$\alpha \to 0$时,GaussFun
|
104 |
-
<li> 当$\alpha \to 1$时,GaussFun
|
105 |
</ul>
|
|
|
|
|
|
|
|
|
106 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
|
107 |
return
|
108 |
|
@@ -145,18 +159,22 @@ def md_forward_process_zh():
|
|
145 |
q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
|
146 |
\end{align}
|
147 |
|
148 |
-
比较式4.8和式2.1
|
|
|
|
|
|
|
|
|
149 |
|
150 |
在DDPM[\[2\]](#ddpm)论文中,作者使用了1000步(T=1000),将数据分布$q(x)$转换至$q(z_T)$,$q(z_T|x)$的概率分布如下:
|
151 |
\begin{align}
|
152 |
q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
|
153 |
\end{align}
|
154 |
|
155 |
-
|
156 |
\begin{align}
|
157 |
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
|
158 |
\end{align}
|
159 |
-
可以看出,应用两种变换后,变换后的分布$q(z_T|x)$相同,因此,$q(z_T)$也相同。
|
160 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_zh")
|
161 |
return
|
162 |
|
@@ -184,7 +202,7 @@ def md_backward_process_zh():
|
|
184 |
|
185 |
在<a href="#posterior">第3节</a>中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
|
186 |
<ul>
|
187 |
-
<li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)
|
188 |
<li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
|
189 |
</ul>
|
190 |
|
@@ -192,11 +210,13 @@ def md_backward_process_zh():
|
|
192 |
\begin{align}
|
193 |
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
|
194 |
\end{align}
|
195 |
-
由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52
|
196 |
|
197 |
-
<b
|
198 |
|
199 |
读者可亲自做一个类似的实验。在<a href="#demo_3_1">Demo 3.1</a>中,将start_alpha设置0.25,end_alpha也设置为0.25,step设置为7,此时$q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061}\epsilon$,与DDPM的$q(z_T)$基本相似。点击<b>apply</b>执行前向变换($\textcolor{blue}{蓝色曲线}$),为接下来的反向恢复做准备。在<a href="#demo_3_2">Demo 3.2</a>中,noise_ratio设置为1,为末端分布$q(z_7)$引入100%的噪声,切换nose_random_seed的值可改变噪声的分布,取消选择backward_pdf,减少画面的干扰。点击<b>apply</b>将通过后验概率变换恢复$q(x)$,将会看到,不管输入的$q(z_7)$的形状如何,恢复的$q(x)$均与原始的$q(x)$完全相同, JS Divergence为0,恢复的过程使用$\textcolor{red}{红色曲线}$画出。
|
|
|
|
|
200 |
|
201 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
|
202 |
return
|
@@ -287,19 +307,20 @@ def md_posterior_transform_zh():
|
|
287 |
|
288 |
gr.Markdown(
|
289 |
r"""
|
290 |
-
<h3 style="font-size:18px">
|
291 |
\begin{align}
|
292 |
q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
|
293 |
\end{align}
|
294 |
|
295 |
-
|
296 |
\begin{align}
|
297 |
-
|
298 |
\end{align}
|
|
|
299 |
|
300 |
读者可查看<a href="#demo_4_1">Demo 4.1</a>,左侧三个图呈现一个变换的��程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{in}$总是小于$div_{out}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
|
301 |
|
302 |
-
|
303 |
|
304 |
读者可看<a href="#demo_4_2">Demo 4.2</a>,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
|
305 |
|
@@ -311,23 +332,31 @@ def md_posterior_transform_zh():
|
|
311 |
\boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
|
312 |
\end{align}
|
313 |
于是,为了更深入地理解变换的特点,<a href="#demo_4_2">Demo 4.2</a>也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
|
314 |
-
|
315 |
-
在<a href="#proof_ctr">Appendix B</a>中,将会提供一个证明,当$q(x)$和$\alpha$满足一些条件时,后验概率变换是一个严格的压缩映射。
|
316 |
-
|
317 |
-
关于定点分布与输入分布q(x)之间距离的关系,目前尚不能严格证明。
|
318 |
|
319 |
-
|
320 |
-
|
|
|
|
|
|
|
321 |
\begin{align}
|
322 |
-
|
323 |
\end{align}
|
324 |
-
其中,$q(z)$是理想的输入分布,$q(x)
|
325 |
|
326 |
-
上式表明,输出的分布$q_o(x)
|
327 |
|
328 |
具体可看<a href="#demo_3_2">Demo 3.2</a>,通过增加“noise ratio”的值可以向“末尾分布$q(z_T)$”添加噪声,点击“apply”按钮将逐步画出恢复的过程,恢复的分布以$\textcolor{red}{红色曲线}$画出,同时也会通过JS散度标出误差的大小。将会看到,恢复的$q(x)$的误差总是小于$q(z_T)$的误差。
|
329 |
|
330 |
-
由上面的讨论可知,$\alpha$越小(即变换过程中使用的噪声越大)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
|
332 |
return
|
333 |
|
@@ -439,74 +468,272 @@ def md_cond_kl_zh():
|
|
439 |
return
|
440 |
|
441 |
|
442 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
global g_latex_del
|
444 |
|
445 |
-
title = "Appendix
|
446 |
-
with gr.Accordion(label=title, elem_classes="first_md", elem_id="
|
447 |
gr.Markdown(
|
448 |
r"""
|
449 |
-
<
|
450 |
-
<center> Figure 2: Only one component in support </center>
|
451 |
|
452 |
-
|
|
|
|
|
|
|
|
|
453 |
|
454 |
-
|
455 |
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
|
|
|
|
|
|
|
|
460 |
|
461 |
-
|
462 |
-
|
|
|
|
|
463 |
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
471 |
|
472 |
-
|
473 |
-
|
474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
|
476 |
-
|
477 |
-
|
478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
</ol>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
481 |
</li>
|
482 |
|
483 |
-
<li>
|
|
|
484 |
|
485 |
-
|
|
|
486 |
</ol>
|
487 |
|
488 |
-
<center> <img src="file/
|
489 |
-
<center> Figure 3: Two component which
|
|
|
|
|
|
|
|
|
|
|
490 |
|
491 |
-
<
|
492 |
-
<center> Figure 4: Two component which <b>cannot</b> communicate with each other </center>
|
493 |
|
494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
495 |
\begin{align}
|
496 |
-
|
497 |
\end{align}
|
498 |
-
|
|
|
|
|
|
|
499 |
\begin{align}
|
500 |
-
|
501 |
-
&= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)q_{i1}(n) - \sum_{n}Q_{x|z}(m,n)q_{i2}(n)\textcolor{red}{|} \tag{B.3} \newline
|
502 |
-
&= \sum_{m}\textcolor{red}{|}\sum_{n}Q_{x|z}(m,n)(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.4} \newline
|
503 |
-
&\leq \sum_{m}\sum_{n}Q_{x|z}(m,n)\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \qquad \qquad \qquad \text{Absolute value inequality} \tag{B.5} \newline
|
504 |
-
&= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \sum_{m} Q_{x|z}(m,n) \qquad \qquad \qquad \sum_{m} Q_{x|z}(m,n) = 1 \tag{B.6} \newline
|
505 |
-
&= \sum_{n}\textcolor{red}{|}(q_{i1}(n) - q_{i2}(n))\textcolor{red}{|} \tag{B.7}
|
506 |
\end{align}
|
507 |
-
其中,$Q_{x|z}(m,n)$表示矩阵$Q_{x|z}$的第m行第n列的元素,$q_{i1}(n)$表示向量$q_{i1}$的第n个元素。
|
508 |
|
509 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
510 |
return
|
511 |
|
512 |
|
@@ -553,6 +780,16 @@ def md_reference_zh():
|
|
553 |
|
554 |
<a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
|
555 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
556 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")
|
557 |
|
558 |
return
|
@@ -599,7 +836,11 @@ def run_app():
|
|
599 |
|
600 |
md_cond_kl_zh()
|
601 |
|
602 |
-
|
|
|
|
|
|
|
|
|
603 |
|
604 |
md_reference_zh()
|
605 |
|
|
|
78 |
q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
|
79 |
\end{align}
|
80 |
|
81 |
+
当$z$是取固定值时,$q(z)$是常数,所以$q(x|z)$是关于$x$的概率密度函数,并且其形状只与${q(z|x)q(x)}$有关。
|
82 |
\begin{align}
|
83 |
+
q(x|z) &=\propto q(z|x)q(x) \qquad \text{where z is fixed} \tag{3.2}
|
84 |
\end{align}
|
85 |
+
|
86 |
+
实际上,$q(z)=\int q(z|x)q(x)dx$,也就是说,$q(z)$是对函数$q(z|x)q(x)$遍历$x$求和,所以,$q(z|x)q(x)$除以$q(z)$相当于对$q(z|x)q(x)$执行归一化。
|
87 |
+
\begin{align}
|
88 |
+
q(x|z) = \operatorname{Normalize}\big(q(z|x)q(x)\big) \tag{3.3}
|
89 |
+
\end{align}
|
90 |
+
|
91 |
由式2.1可知,$q(z|x)$为高斯分布,于是有
|
92 |
\begin{align}
|
93 |
+
q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &\text{where z is fixed} \notag \newline
|
94 |
+
&= \frac{1}{\sqrt{\alpha}}\frac{1}{\sqrt{2\pi\frac{1-\alpha}{\alpha}}}\exp{\frac{-(\frac{z}{\sqrt{\alpha}}-x)^2}{2\frac{1-\alpha}{\alpha}}}\ q(x)& \notag \newline
|
95 |
+
&= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
|
96 |
\end{align}
|
97 |
|
98 |
+
可以看出,<b>GaussFun</b>部分是关于$x$的高斯函数,均值为$\frac{z}{\sqrt{\alpha}}$,标准差为$\sqrt{\frac{1-\alpha}{\alpha}}$,所以$q(x|z)$的形状由“<b>GaussFun与$q(x)$相乘</b>”决定。
|
99 |
|
100 |
根据”乘法“的特点,可以总结$q(x|z)$函数形状具有的特点。
|
101 |
<ul>
|
102 |
+
<li> $q(x|z)$的支撑集应该包含于GaussFun的支撑集,GaussFun的支撑集是一个超球体,中心位于均值$\mu$,半径约为3倍标准差$\sigma$。</li>
|
103 |
+
<li> 当高斯函数的方差较小(较小噪声),或者$q(x)$线性变化时,$q(x|z)$的形状将近似于高斯函数,函数形式较简单,方便建模学习。</li>
|
104 |
<li> 当高斯函数的方差较大(较大噪声),或者$q(x)$剧烈变化时,$q(x|z)$的形状将较复杂,与高斯函数有较大的差别,难以建模学习。</li>
|
105 |
</ul>
|
106 |
|
107 |
+
<a href="#approx_gauss">Appendix B</a>给出了较严谨的分析,当$\sigma$满足一些条件时,$q(x|z)$的近似于高斯分布。
|
108 |
+
|
109 |
具体可看<a href="#demo_2">Demo 2</a>,左4图给出后验概率分布$q(x|z)$的形态,可以看出,其形状较不规则,像一条弯曲且不均匀的曲线。当$\alpha$较大时(噪声较小),曲线将趋向于均匀且笔直。读者可调整不同的$\alpha$值,观察后验概率分布与噪声大小的关系;左5图,$\textcolor{blue}{蓝色虚线}$给出$q(x)$,$\textcolor{green}{绿色虚线}$给出式3.4中的GaussFun,$\textcolor{orange}{黄色实线}$给出两者相乘并归一化的结果,即固定z条件下后验概率$q(x|z=fixed)$。读者可调整不同z值,观察$q(x)$的波动变化对后验概率$q(x|z)$形态的影响。
|
110 |
|
111 |
两个特殊状态下的后验概率分布$q(x|z)$值得考虑一下。
|
112 |
<ul>
|
113 |
+
<li> 当$\alpha \to 0$时,GaussFun的标准差趋向于<b>无穷大</b>,GaussFun变成一个很大支撑集的近似的均匀分布,$q(x)$与均匀分布<b>相乘</b>结果仍为$q(x)$,所以,不同$z$值对应的$q(x|z)$几乎变成一致,并与$q(x)$几乎相同。读者可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.001,观察具体的结果。</li>
|
114 |
+
<li> 当$\alpha \to 1$时,GaussFun的标准差趋向于<b>无穷小</b>,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数, 偏移量等于$z$。但有一些例外,当$q(x)$存在为零的区域时,其对应的$q(x|z)$将不再为Dirac delta函数,而是零函数。可在<a href="#demo_2">Demo 2</a>中,将$\alpha$设置为0.999,观察具体的结果。</li>
|
115 |
</ul>
|
116 |
+
|
117 |
+
有一点需要注意一下,当$\alpha \to 0$时,较大$z$值对应的GaussFun的均值($\mu=\frac{z}{\sqrt{\alpha}}$)也急剧变大,也就是说,GaussFun位于离原点较远的地方,此时,$q(x)$的支撑集对应的GaussFun部分的“均匀程度”会略微有所下降, 从而会略微降低$q(x|z)$与$q(x)$的相似度,但这种影响会随着$\alpha$减小而进一步降低。读者可在<a href="#demo_2">Demo 2</a>中观察此影响,将$\alpha$设置为0.001,$q(x|z=-2)$与$q(x)$会略微有一点差别,但$q(x|z=0)$与$q(x)$却看不出区别。
|
118 |
+
|
119 |
+
关于高斯函数的"均匀程度",有如下两个特点:标准差越大,均匀程度越大;离均值越远,均匀程度越小。
|
120 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_zh")
|
121 |
return
|
122 |
|
|
|
159 |
q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
|
160 |
\end{align}
|
161 |
|
162 |
+
比较式4.8和式2.1的形式,可发现,两者的形式是完全一致的。
|
163 |
+
|
164 |
+
如果只关注首尾两个变量之间的关系,那么连续t次的小变换可用一次大变换替代,大变换的$\alpha$是各个小变换的$\alpha$累积,因为两种变换对应的联合概率分布相同。
|
165 |
+
|
166 |
+
读者可在<a href="#demo_3_1">Demo 3.1</a>中做一个实验,对同样的输入分布$q(x)$,使用两种不同的变换方式:1)使用三个变换,$\alpha$均为0.95; 2)使用一个变换,$\alpha$设置为0.857375。分别执行变换,然后比较变换后的两个分布,将会看到,两个分布是完全相同的。
|
167 |
|
168 |
在DDPM[\[2\]](#ddpm)论文中,作者使用了1000步(T=1000),将数据分布$q(x)$转换至$q(z_T)$,$q(z_T|x)$的概率分布如下:
|
169 |
\begin{align}
|
170 |
q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
|
171 |
\end{align}
|
172 |
|
173 |
+
如果只考虑$X,Z_T$的联合分布$q(x,z_T)$,也可使用一次变换代替,变换如下:
|
174 |
\begin{align}
|
175 |
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
|
176 |
\end{align}
|
177 |
+
可以看出,应用两种变换后,变换后的分布$q(z_T|x)$相同,因此,$q(x, z_T)$也相同。
|
178 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_zh")
|
179 |
return
|
180 |
|
|
|
202 |
|
203 |
在<a href="#posterior">第3节</a>中,我们考虑了两个特殊的后验概率分布。接下来,分析其对应的”后验概率变换“。
|
204 |
<ul>
|
205 |
+
<li> 当$\alpha \to 0$时,不同$z$值的$q(x|z)$均与$q(x)$几乎相同,也就是说,线性加权和的基函数几乎相同。此状态下,<b>不管输入如何变化,变换的输出总为$q(x)$</b>。</li>
|
206 |
<li> 当$\alpha \to 1$时,不同$z$值的$q(x|z)$收缩成一系列不同偏移量的Dirac delta函数及零函数。此状态下,只要输入分布的支撑集(support set)包含于$q(x)$的支撑集,变换的输出与输入将保持一致。</li>
|
207 |
</ul>
|
208 |
|
|
|
210 |
\begin{align}
|
211 |
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
|
212 |
\end{align}
|
213 |
+
由于$\alpha=0.0000403$非常小,其对应的GaussFun(式3.4)的标准差达到157.52。如果把$q(x)$的支撑集限制在单位超球范围内($\lVert x \rVert_2 < 1$),那当$z_T \in [-2, +2]$时,对应的各个$q(x|z_T)$均与$q(x)$非常相似。在这种状态下,对于$q(x|z_T)$相应的后验概率变换,不管输入分布的形状的如何,只要支撑集在$[-2,+2]$范围内,其输出分布都将是$q(x)$。
|
214 |
|
215 |
+
<b>所以,可以总结,在DPM模型中,如果$q(x)$的支撑集是有限的,并且最终变量$Z_T$的信噪比足够大,那恢复$q(x)$的过程可以使用任意的分布,不必一定需要使用标准正态分布。</b>
|
216 |
|
217 |
读者可亲自做一个类似的实验。在<a href="#demo_3_1">Demo 3.1</a>中,将start_alpha设置0.25,end_alpha也设置为0.25,step设置为7,此时$q(z_7)=\sqrt{0.000061}X + \sqrt{1-0.000061}\epsilon$,与DDPM的$q(z_T)$基本相似。点击<b>apply</b>执行前向变换($\textcolor{blue}{蓝色曲线}$),为接下来的反向恢复做准备。在<a href="#demo_3_2">Demo 3.2</a>中,noise_ratio设置为1,为末端分布$q(z_7)$引入100%的噪声,切换nose_random_seed的值可改变噪声的分布,取消选择backward_pdf,减少画面的干扰。点击<b>apply</b>将通过后验概率变换恢复$q(x)$,将会看到,不管输入的$q(z_7)$的形状如何,恢复的$q(x)$均与原始的$q(x)$完全相同, JS Divergence为0,恢复的过程使用$\textcolor{red}{红色曲线}$画出。
|
218 |
+
|
219 |
+
另外有一点值得注意一下,在深度学习任务中,常将输入样本的各个维度缩放在[-1,1]范围内,也是说在一个超立方体内(hypercube)。超立方体内任意两点的最大欧氏距离会随着维度的增多而变大,比如,对于一维,最大距离为$2$,对于二维,最大距离为$2\sqrt{2}$,对于三维,最大距离为$2\sqrt{3}$,对于n维,最大距离为$2\sqrt{n}$。所以,对于维度较高的数据,需要$Z_T$变量有更高的信噪比,才能让恢复过程的起始分布接受任意的分布。
|
220 |
|
221 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_zh")
|
222 |
return
|
|
|
307 |
|
308 |
gr.Markdown(
|
309 |
r"""
|
310 |
+
<h3 style="font-size:18px"> Non-expanding mapping and Stationary Distribution </h3>
|
311 |
\begin{align}
|
312 |
q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
|
313 |
\end{align}
|
314 |
|
315 |
+
根据<a href="#non_expanping">Appendix B</a>的Corollary 1和Corollary 2可知,后验概率变换是一个non-expanding mapping。也是说,对任意的两个概率分布$q_{i1}(z)和q_{i2}(z)$,经过后验概率变换后得到$q_{o1}(x)$和$q_{o2}(x)$,$q_{o1}(z)$和$q_{o2}(z)$的距离<b>总是小于或等于</b>$q_{i1}(x)$和$q_{i2}(x)$的距离。这里的距离可使用KL Divergence或Total Variance或度量。
|
316 |
\begin{align}
|
317 |
+
d(q_{o1}(x),\ q_{o2}(x)) \le d(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
|
318 |
\end{align}
|
319 |
+
根据<a href="#non_expanping">Appendix B</a>的分析可知,在大多数情况,上述的等号并不会成立。并且,<b>当$\alpha$越小时(噪声越多),$d(q_{o1},q_{o2})$会越小于$d(q_{i1},q_{i2})$</b>。
|
320 |
|
321 |
读者可查看<a href="#demo_4_1">Demo 4.1</a>,左侧三个图呈现一个变换的过程,左1图是任意的数据分布$q(x)$,左3图是变换后的概率分布,左2图是后验概率分布。可更改随机种子生成新的数据分布,调整$\alpha$值引入不同程度的噪声。左侧最后两个图展示变换的“压缩性质”,左4图展示随机生成的两个输入分布,同时给出其距离度量值$div_{in}$;左5图展示经过变换后的两个输出分布,输出分布之间的距离标识为$div_{out}$。读者可改变输入的随机种子,切换不同的输入。可在图中看到,对于任意的输入,$div_{out}$总是小于$div_{in}$。另外,也可改变$\alpha$的值,将会看到,$\alpha$越小(噪声越大),$\frac{div_{out}}{div_{in}}$的比值也越小,即收缩率越大。
|
322 |
|
323 |
+
根据<a href="#stationary">Appendix C</a>的分析可知:后验概率变换可视为markov chain的一步跳转,并且,<b>当$q(x)$和$\alpha$满足一些条件时,此markov chain会收敛于惟一的稳态分布</b>。另外,通过大量实验发现,<b>稳态分布与数据分布$q(x)$非常相似,当$\alpha$越小时,稳态分布与$q(x)$越相似</b>。特别地,根据<a href="#backward_process">第5节</a>的结论,<b>当$\alpha \to 0$时,经过一步变换后,输出分布即是$q(x)$,所以稳态分布必定是$q(x)$</b>。
|
324 |
|
325 |
读者可看<a href="#demo_4_2">Demo 4.2</a>,此部分展示迭代收敛的例子。选择合适的迭代次数,点中“apply iteration transform”,将逐步画出迭代的过程,每个子图均会展示各自变换后的输出分布($\textcolor{green}{绿色曲线}$),收敛的参考点分布$q(x)$以$\textcolor{blue}{蓝色曲线}$画出,同时给出输出分布与$q(x)$之间的距离$dist$。可以看出,随着迭代的次数增加,输出分布与$q(x)$越来越相似,并最终会稳定在$q(x)$附近。对于较复杂的分布,可能需要较多迭代的次数或者较大的噪声。迭代次数可以设置为上万步,但会花费较长时间。
|
326 |
|
|
|
332 |
\boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iteration} \tag{7.5} \newline
|
333 |
\end{align}
|
334 |
于是,为了更深入地理解变换的特点,<a href="#demo_4_2">Demo 4.2</a>也画出矩阵$(Q_{x|z})^n$的结果。从图里可以看到,当迭代趋向收敛时,矩阵$(Q_{x|z})^n$的行向量将变成一个常数向量,即向量的各分量都相等。在二维密度图里将表现为一条横线。
|
|
|
|
|
|
|
|
|
335 |
|
336 |
+
对于一维离散的markov chain,收敛速度与转移概率矩阵的第二大特征值的绝对值($\lvert \lambda_2 \rvert$)反相关,$\lvert \lambda_2 \rvert$越小,收敛速度越快。经过大量的实验发现,$\alpha$与$\lvert \lambda_2 \rvert$有着明确的线性关系,$\alpha$越小,$\lvert \lambda_2 \rvert$也越小。所以,<b>$\alpha$越小(噪声越大),收敛速度越快</b>。 特别地,当$\alpha \to 0$时,由<a href="#posterior">第3节</a>的结论可知,各个$z$对应的后验概率分布趋向一致,而由文献<a href="#non_neg_lambda">[21]</a>的Theorem 2.10可知,$\lvert \lambda_2 \rvert$小于任意两个$z$对应的后验概率分布之间L1距离的最大值,所以,可知$\lvert \lambda_2 \rvert \to 0$。
|
337 |
+
|
338 |
+
</br>
|
339 |
+
<h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution </h3>
|
340 |
+
由上面的分析可知,在大多数情况下,"后验概率变换"是一个收缩映射,所以存在如下的关系:
|
341 |
\begin{align}
|
342 |
+
d(q(x),\ q_o(x)) < d(q(z),\ q_i(z)) \tag{7.12}
|
343 |
\end{align}
|
344 |
+
其中,$q(z)$是理想的输入分布,$q(x)$是理想的输出分布,$q(x)=\int q(x|z)q(z)dz$,$q_i(z)$是任意的输入分布,$q_o(x)$是变换后的输出分布,$q_o(x)=\int q(x|z)q_i(z)dz$。
|
345 |
|
346 |
+
上式表明,输出的分布$q_o(x)$与理想输出分布$q(x)$之间的距离总会<b>小于</b>输入分布$q_i(z)$与理想输入分布$q(x)$的距离。所以,<b>”后验概率变换“天然具备一定的抵抗噪声能力</b>。这意味着,在恢复$q(x)$的过程中(<a href="#backward_process">第5节</a>),哪怕输入的“末尾分布$q(z_T)”$存在一定的误差,经过一系列变换后,输出的“数据分布$q(x)$“的误差也会比输入的误差更小。
|
347 |
|
348 |
具体可看<a href="#demo_3_2">Demo 3.2</a>,通过增加“noise ratio”的值可以向“末尾分布$q(z_T)$”添加噪声,点击“apply”按钮将逐步画出恢复的过程,恢复的分布以$\textcolor{red}{红色曲线}$画出,同时也会通过JS散度标出误差的大小。将会看到,恢复的$q(x)$的误差总是小于$q(z_T)$的误差。
|
349 |
|
350 |
+
由上面的讨论可知,$\alpha$越小(即变换过程中使用的噪声越大),收缩映射的收缩率越大,相应地,抗噪声的能力也越强。特别地,当$\alpha \to 0$时,抗噪声能力无限大,不论多大噪声的输入,输出都为$q(x)$。
|
351 |
+
|
352 |
+
</br>
|
353 |
+
<h3 style="font-size:18px"> Markov Chain Monte Carlo Sampling</h3>
|
354 |
+
|
355 |
+
在DPM模型中,通常是通过Ancestral Sampling的方式进行采样。由上面的分析可知,当$\alpha$足够小时,后验概率变换会收敛于$q(x)$,所以,可通过Markov Chain Monte Carlo的方式进行采样。如图7.1所示。图中$\alpha$代表一个较大的噪声的后验概率变换,较大的噪声使稳态分布更接近于数据分布$q(x)$,但由<a href="#posterior">第3节</a>可知,较大噪声的后验变换不利于拟合,所以把较大噪声的后验概率变换分成多个小噪声的后验概率变换。
|
356 |
+
|
357 |
+
<center> <img src="file/7.1.png" width="1024" style="margin-top:12px"/> </center>
|
358 |
+
<center> Figure 7.1: Markov Chain Monte Carlo Sampling</center>
|
359 |
+
|
360 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_zh")
|
361 |
return
|
362 |
|
|
|
468 |
return
|
469 |
|
470 |
|
471 |
+
def md_approx_gauss_zh():
|
472 |
+
global g_latex_del
|
473 |
+
|
474 |
+
title = "Appendix B When does the Posterior Approximate to Gaussian ?"
|
475 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss"):
|
476 |
+
gr.Markdown(
|
477 |
+
r"""
|
478 |
+
由式3.4可知,$q(x|z)$有如下的形式
|
479 |
+
\begin{align}
|
480 |
+
q(x|z) &= \operatorname{Normalize} \Big(\ \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x)\ \Big)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{B.1} \newline
|
481 |
+
&\propto \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x) \tag{B.2}
|
482 |
+
\end{align}
|
483 |
+
|
484 |
+
下面证明,如果满足如下两个假设,$q(x|z)$近似于高斯分布。
|
485 |
+
<ul>
|
486 |
+
<li>
|
487 |
+
假设在GaussFun的支撑集内,$q(x)$是线性变化的。以GaussFun的均值为中心,对$q(x)$进行泰勒展开。由泰勒展开的性质可知,当GaussFun的标准差$\sigma$足够小时,上述假设可以满足。
|
488 |
+
\begin{align}
|
489 |
+
q(x) &\approx q(\mu) + \nabla_xq(\mu)(x-\mu)& \quad &\text{where}\quad q(\mu)\triangleq q(x)\bigg|_{x=\mu} \quad \nabla_xq(\mu)\triangleq \nabla_xq(x)\bigg|_{x=\mu} \tag{B.3} \newline
|
490 |
+
&= q(\mu)\big(1+ \frac{\nabla_xq(\mu)}{q(\mu)}(x-\mu)\big)& \tag{B.4} \newline
|
491 |
+
&= q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)& \quad &\text{where}\quad \nabla_x\log{q(\mu)}\triangleq \nabla_x\log{q(x)}\bigg|_{x=\mu} \tag{B.5}
|
492 |
+
\end{align}
|
493 |
+
</li>
|
494 |
+
<li>
|
495 |
+
假设在GaussFun的支撑集内,$\log\big(1+\nabla_x\log{q(\mu)}(x-\mu)\big)$可近似为 $\nabla_x\log{q(\mu)}(x-\mu)$。对$\log(1+y)$进行泰勒展开,由泰勒展开的性质可知,当$\lVert y\rVert_2$较小时,$\log(1+y)$可近似为$y$。当$\sigma$足够小时,$\lVert x-u\rVert_2$将较小,$\nabla_x\log{q(\mu)}(x-\mu)$也将较小,所以上述假设可以满足。一般情况下,当$\nabla_x\log{q(\mu)}(x-\mu)<0.1$时,近似的误差较小,可忽略。
|
496 |
+
\begin{align}
|
497 |
+
\log(1+y) &\approx \log(1+y)\bigg|_{y=0} + \nabla_y\log(1+y)\bigg|_{y=0}(y-0) \tag{B.6} \newline
|
498 |
+
&= y \tag{B.7}
|
499 |
+
\end{align}
|
500 |
+
</li>
|
501 |
+
</ul>
|
502 |
+
利用上面的两个假设,可对$q(x|z)$进行如下的推导:
|
503 |
+
|
504 |
+
\begin{align}
|
505 |
+
q(x|z) &\propto \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x) \tag{B.8} \newline
|
506 |
+
&\approx \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big) \tag{B.9} \newline
|
507 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\log\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)\right) \tag{B.10} \newline
|
508 |
+
&\approx \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\nabla_x\log{q(\mu)}(x-\mu)\right) \tag{B.11} \newline
|
509 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2-2\sigma^2\nabla_x\log{q(\mu)}(x-\mu)}{2\sigma^2}\right) \tag{B.12} \newline
|
510 |
+
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}+\frac{\big(\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \tag{B.13} \newline
|
511 |
+
&= \exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \underbrace{\frac{q(\mu)}{\sqrt{2\pi}\sigma} \exp\left( \frac{1}{2}\big(\sigma\nabla_x\log{q(\mu)}\big)^2\right)}_{\text{const}} \tag{B.14}
|
512 |
+
\end{align}
|
513 |
+
|
514 |
+
其中,式B.9应用了假设1的结论,式B.11应用了假设2的结论。
|
515 |
+
|
516 |
+
式B.14中的const项是常数项,不会影响函数的形状。另外,由上面可知,$q(x|z)$具有自归一化的功能,所以,$q(x|z)$是一个高斯概率密度函数,均值为$\mu+\sigma^2\nabla_x\log{q(\mu)}$,方差为$\sigma^2$。
|
517 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_approx_gauss_zh")
|
518 |
+
|
519 |
+
return
|
520 |
+
|
521 |
+
|
522 |
+
def md_non_expanding_zh():
|
523 |
global g_latex_del
|
524 |
|
525 |
+
title = "Appendix C Posterior Transform is a Non-expanding Mapping"
|
526 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding"):
|
527 |
gr.Markdown(
|
528 |
r"""
|
529 |
+
<b>Corollary 1</b>
|
|
|
530 |
|
531 |
+
以KL Divergence为度量,markov chain的转移变换是non-expanding的<a href="#elem">[23]</a>,即
|
532 |
+
\begin{align}
|
533 |
+
KL\big(p(x), q(x)\big) &\le KL\big(p(z), q(z)\big) \tag{C.1} \newline
|
534 |
+
\end{align}
|
535 |
+
其中,$p(z)$和$q(z)$是任意的概率密度函数,$r(x|z)$是markov chain的转移概率密度函数,$p(x) = \int r(x|z)p(z)dz$,$q(x) = \int r(x|z) q(z) dz$。
|
536 |
|
537 |
+
证明:
|
538 |
|
539 |
+
对于$p(x,z)$和$q(x,z)$的KL divergence,存在如下的关系:
|
540 |
+
\begin{align}
|
541 |
+
KL\big(p(x,z), q(x,z)\big) &= \iint p(x,z)\log \frac{p(x,z)}{q(x,z)}dxdz \tag{C.2} \newline
|
542 |
+
& = \iint p(x,z)\log \frac{p(z)p(x|z)}{q(z)q(x|z)}dxdz \tag{C.3} \newline
|
543 |
+
&= \iint p(x,z)\log \frac{p(z)}{q(z)}dxdz + \iint p(x,z) \log\frac{p(x|z)}{q(x|z)} dxdz \tag{C.4} \newline
|
544 |
+
&= \int \int p(x,z) dx\ \log \frac{p(z)}{q(z)}dz + \int p(z)\int p(x|z) \log\frac{p(x|z)}{q(x|z)} dx\ dz \tag{C.5} \newline
|
545 |
+
&= KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.6} \newline
|
546 |
+
\end{align}
|
547 |
|
548 |
+
类似地,调换$Z$和$X$的顺序,可得到下面的关系:
|
549 |
+
\begin{align}
|
550 |
+
KL\big(p(x,z), q(x,z)\big) &= KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.7}
|
551 |
+
\end{align}
|
552 |
|
553 |
+
比较两个关系式,可得:
|
554 |
+
\begin{align}
|
555 |
+
KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx = KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.8}
|
556 |
+
\end{align}
|
557 |
+
|
558 |
+
由于$q(x|z)$和$p(x|z)$都是markov chain的转移概率密度,均等于$r(x|z)$,所以$\int p(z) KL\big(p(x|z), q(x|z)\big)dz$等于0。于是,上式简化为:
|
559 |
+
\begin{align}
|
560 |
+
KL\big(p(x), q(x)\big) = KL\big(p(z), q(z)\big) - \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.9}
|
561 |
+
\end{align}
|
562 |
+
|
563 |
+
由于KL divergence总是大于或者等于0,所以,加权和$\int p(x) KL\big(p(z|x), q(z|x)\big)dx$也是大于等于0。于是,可得:
|
564 |
+
\begin{align}
|
565 |
+
KL\big(p(x), q(x)\big) \le KL\big(p(z), q(z)\big) \tag{C.10}
|
566 |
+
\end{align}
|
567 |
+
|
568 |
+
</br>
|
569 |
|
570 |
+
上式等号成立的条件是$\int p(x) KL\big(p(z|x), q(z|x)\big)dx$等于0,这要求对不同的条件$x$,$p(z|x)$与$q(z|x)$均要相等。在大多数情况下,当$p(z)$和$q(z)$不同时,$p(z|x)$也和$q(z|x)$不同。这意味着,在大多数情况下,有
|
571 |
+
\begin{align}
|
572 |
+
KL\big(p(x), q(x)\big) < KL\big(p(z), q(z)\big) \tag{C.11}
|
573 |
+
\end{align}
|
574 |
+
|
575 |
+
</br></br>
|
576 |
+
<b>Corollary 2</b>
|
577 |
+
|
578 |
+
以Total Variation(L1 distance)为度量,markov chain的转移变换是non-expanding,即
|
579 |
+
\begin{align}
|
580 |
+
\left\lVert p(x)-q(x) \right\rVert_1\ &\le\ \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.12}
|
581 |
+
\end{align}
|
582 |
|
583 |
+
其中,$p(z)$和$q(z)$是任意的概率密度函数,$r(x|z)$是markov chain的转移概率密度函数,$p(x) = \int r(x|z)p(z)dz$,$q(x) = \int r(x|z) q(z) dz$。
|
584 |
+
|
585 |
+
证明:
|
586 |
+
\begin{align}
|
587 |
+
\left\lVert p(x)-q(x) \right\rVert_1\ &= \int \big\lvert p(x) - q(x) \big\rvert dx \tag{C.13} \newline
|
588 |
+
&= \int \left\lvert \int r(x|z) p(z) dz - \int r(x|z)q(z)dz \right\rvert dx \tag{C.14} \newline
|
589 |
+
&= \int \left\lvert \int r(x|z) \big(p(z)-q(z)\big) dz \right\rvert dx \tag{C.15} \newline
|
590 |
+
&\le \int \int r(x|z) \left\lvert \big(p(z)-q(z)\big) \right\rvert dz dx \tag{C.16} \newline
|
591 |
+
&= \int \int r(x|z)dx \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.17} \newline
|
592 |
+
&= \int \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.18} \newline
|
593 |
+
&= \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.19}
|
594 |
+
\end{align}
|
595 |
+
|
596 |
+
其中,式C.16应用了绝对值不等式,式C.18利用了$r(x|z)$是概率分布的性质。
|
597 |
+
|
598 |
+
证明完毕。
|
599 |
+
|
600 |
+
</br>
|
601 |
+
|
602 |
+
图C.1展示了一个一维随机变量的例子,可以更直观地理解上述推导的过程。
|
603 |
+
|
604 |
+
上述等式的成立的条件是:各个绝对值括号内的非零项均是同样的符号。如图C.1(a),包含5个绝对值括号,每个对应一行,每个括号内有5项,当且仅当每行各个非零项同号时,上述的等式才成立。如果出现不同号的情况,则会导致$\lVert p(x)-q(x) \rVert_1\ <\ \lVert p(z) - q(z) \rVert_1$。不同号出现的数量与转移概率矩阵的非零元素有关,一般情况下,非零元素越多,不同号出现的数量会越多。
|
605 |
+
|
606 |
+
在后验概率变换中,一般情况下,当$\alpha$越小(噪声越多)时,转移概率密度函数会有越多的非零元素,如图C.2(a)所示;当$\alpha$越大(噪声越小)时,转移概率密度函数会有越少的非零元素,如图C.2(b)所示。
|
607 |
+
|
608 |
+
所以,有这么一个规律:<b>当$\alpha$越小时,则会导致$\lVert p(x)-q(x) \rVert_1$越小于$\lVert p(z) - q(z) \rVert_1$,也就是说,这个变换的压缩率越大</b>。
|
609 |
+
|
610 |
+
<center> <img src="file/C1.png" width="1024" style="margin-top:12px"/> </center>
|
611 |
+
<center> Figure C.1: Non-expanding under L1 norm </center>
|
612 |
+
</br>
|
613 |
+
<center> <img src="file/C2.png" width="568" style="margin-top:12px"/> </center>
|
614 |
+
<center> Figure C.2: More non-zero elements as $\alpha$ gets smaller </center>
|
615 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_non_expanding_zh")
|
616 |
|
617 |
+
return
|
618 |
+
|
619 |
+
|
620 |
+
def md_stationary_zh():
|
621 |
+
global g_latex_del
|
622 |
+
|
623 |
+
title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
|
624 |
+
with gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary"):
|
625 |
+
gr.Markdown(
|
626 |
+
r"""
|
627 |
+
根据文献<a href="#mc_basic_t3">[19]</a>Theorem 3的结论,<b>非周期(aperiodic)不可约(irreducible)的markov chain会收敛于惟一的稳态分布</b>。
|
628 |
+
|
629 |
+
下面将表明,当满足一定的条件时,后验概率变换是一个非周期不可约的markov chain的转移概率密度函数。
|
630 |
+
|
631 |
+
为了表述方便,下面以一个更通用的形式来描述扩散模型的前向变换。
|
632 |
+
\begin{align}
|
633 |
+
Z = \sqrt{\alpha}X + \sqrt{\beta}\ \epsilon \tag{D.1} \newline
|
634 |
+
\end{align}
|
635 |
+
|
636 |
+
由<a href="#transform">第1节</a>可知,$\sqrt{\alpha}X$会对$X$的概率密度函数执行缩放,所以$\alpha$控制着缩放的强度,$\beta$控制着添加噪声的大小。当$\beta = 1-\alpha$时,上述的变换与式1.1一致。
|
637 |
+
|
638 |
+
新变换对应的后验概率分布的形式如下:
|
639 |
+
\begin{align}
|
640 |
+
q(x|z=c) = \operatorname{Normalize} \Big(\ \overbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}^{\text{GaussFun}}\ q(x)\ \Big) \tag{D.2} \newline
|
641 |
+
\text{where}\ \mu=\frac{c}{\sqrt{\alpha}}\qquad \sigma=\sqrt{\frac{\beta}{\alpha}} \qquad \text{$c$ is a fixed value} \notag
|
642 |
+
\end{align}
|
643 |
+
|
644 |
+
When $\beta = 1-\alpha$, the expression above coincides with Equation 3.4.
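For completeness, here is a short sketch (added in this note) of where Equation D.2 comes from. By Equation D.1, $q(z|x)$ is Gaussian with mean $\sqrt{\alpha}x$ and variance $\beta$; applying Bayes' rule and using $(z-\sqrt{\alpha}x)^2 = \alpha(x-\frac{z}{\sqrt{\alpha}})^2$ gives

\begin{align}
q(x|z)\ \propto\ q(z|x)\,q(x)\ =\ \frac{1}{\sqrt{2\pi\beta}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2\beta}}\ q(x)\ \propto\ \exp{\frac{-(x-\frac{z}{\sqrt{\alpha}})^2}{2\beta/\alpha}}\ q(x) \notag
\end{align}

Normalizing over $x$ yields exactly the GaussFun form in Equation D.2 with $\mu=\frac{z}{\sqrt{\alpha}}$ and $\sigma=\sqrt{\frac{\beta}{\alpha}}$.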
|
645 |
+
|
646 |
+
For brevity, $g(x)$ will denote the GaussFun term in Equation D.2.
|
647 |
+
|
648 |
+
Since $\sqrt{\alpha}X$ rescales the probability density function $q(x)$ of $X$, analyzing the aperiodicity and irreducibility of the transition density $q(x|z)$ becomes more involved. For convenience, we first assume $\alpha=1$, and then analyze the case $\alpha \neq 1$ with $\beta = 1-\alpha$.
|
649 |
+
|
650 |
+
<center> <img src="file/D1.png" width="960" style="margin-top:12px"/> </center>
|
651 |
+
<center> Figure D.1: Only one component in support </center>
|
652 |
+
|
653 |
+
<center> <img src="file/D2.png" width="960" style="margin-top:12px"/> </center>
|
654 |
+
<center> Figure D.2: Multiple components which can communicate with each other </center>
|
655 |
+
|
656 |
+
</br>
|
657 |
+
<h3 style="font-size:24px"> $\alpha=1$ </h3>
|
658 |
+
|
659 |
+
When $\alpha=1$, if $q(x)$ and $\beta$ satisfy either of the two conditions below, the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.
|
660 |
+
|
661 |
+
<ol style="list-style-type:decimal">
|
662 |
+
<li>The support of $q(x)$ contains only one connected component.</li>
|
663 |
+
<li>The support of $q(x)$ contains multiple connected components, but the distance between adjacent components is less than $3\sigma$; that is, every gap can be covered by the radius of the effective region of $g(x)$ (see the sketch after this list).</li>
|
664 |
</ol>
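The sketch below, written for this note (the toy density, grid, and threshold are arbitrary choices), shows one way to check these conditions numerically for a discretized $q(x)$ with $\alpha=1$: find the connected components of the support and compare every gap against $3\sigma$.

```python
import numpy as np

def support_components(q_x, x, eps=1e-8):
    """Return [(left, right)] intervals where the discretized q(x) exceeds eps."""
    comps, start = [], None
    for i, positive in enumerate(q_x > eps):
        if positive and start is None:
            start = i
        elif not positive and start is not None:
            comps.append((x[start], x[i - 1])); start = None
    if start is not None:
        comps.append((x[start], x[-1]))
    return comps

def condition_2_holds(comps, sigma):
    """Condition 2: every gap between adjacent components is smaller than 3*sigma."""
    gaps = [comps[i + 1][0] - comps[i][1] for i in range(len(comps) - 1)]
    return all(g < 3 * sigma for g in gaps)

# toy density with two separated bumps (values chosen only for illustration)
x = np.linspace(-3, 3, 1201)
q_x = np.exp(-0.5 * ((x + 1.2) / 0.2) ** 2) + np.exp(-0.5 * ((x - 1.0) / 0.3) ** 2)
q_x[np.abs(x) < 0.5] = 0.0           # carve out an explicit gap
q_x /= q_x.sum()

beta = 0.25                          # alpha = 1, so sigma = sqrt(beta) = 0.5
comps = support_components(q_x, x)
print(comps, condition_2_holds(comps, np.sqrt(beta)))
```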
|
665 |
+
|
666 |
+
Proof:
|
667 |
+
|
668 |
+
<ol style="list-style-type:decimal">
|
669 |
+
<li>
|
670 |
+
For any point $c$ in the support of $q(x)$, with $z=c$ and $x=c$ we have $q(x=c)>0$; by Equation D.2, $g(x)$ is centered at $c$, so $g(x)$ is also positive at $x=c$. Hence, by the product in Equation D.2, $q(x=c|z=c)>0$. Therefore, the Markov chain corresponding to $q(x|z)$ is aperiodic.
|
671 |
+
|
672 |
+
For any point $c$ in the support of $q(x)$, when $z=c$, $g(x)$ is centered at $c$, so there exists a hyperball centered at $c$ ($\lVert x-c\rVert_2 < \delta$) within which $q(x|z=c)>0$; in other words, state $c$ can access its nearby states. Since every state in the support has this property, all states in the support form a single $\textcolor{red}{\text{communicating class}}$<a href="#mc_basic_d4">[14]</a>. Therefore, the Markov chain corresponding to $q(x|z)$ is irreducible.
|
673 |
+
|
674 |
+
Hence, a Markov chain satisfying condition 1 is aperiodic and irreducible. See Figure D.1 for an example with a single connected component.
|
675 |
</li>
|
676 |
|
677 |
+
<li>
|
678 |
+
When the support of $q(x)$ contains multiple connected components, the Markov chain may have multiple communicating classes. However, when every gap is smaller than 3 standard deviations of $g(x)$, states in different components can access each other, so the Markov chain corresponding to $q(x|z)$ has only one communicating class, just as in condition 1. Hence, a Markov chain satisfying condition 2 is also aperiodic and irreducible.
|
679 |
|
680 |
+
See Figure D.2 for an example with multiple connected components.
|
681 |
+
</li>
|
682 |
</ol>
|
683 |
|
684 |
+
<center> <img src="file/D3.png" width="960" style="margin-top:12px"/> </center>
|
685 |
+
<center> Figure D.3: Two components which <b>cannot</b> communicate with each other </center>
|
686 |
+
|
687 |
+
</br>
|
688 |
+
<h3 style="font-size:24px"> $\alpha \neq 1$ </h3>
|
689 |
+
|
690 |
+
When $\alpha \neq 1$, for any point $c$ in the support of $q(x)$, Equation D.2 shows that the center of $g(x)$ is no longer $c$ but $\frac{c}{\sqrt{\alpha}}$. In other words, the center of $g(x)$ deviates from $c$, and the deviation distance (the gap between $\frac{c}{\sqrt{\alpha}}$ and $c$) is $\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})$. The larger $\lVert c\rVert$ is, the larger the deviation. See the examples in Figures D.4(c) and D.4(d); in Figure D.4(d), when $z=2.0$, the center of $g(x)$ clearly deviates from $x=2.0$. This article refers to this as the <b>center deviation phenomenon</b>.
|
691 |
|
692 |
+
The <b>center deviation phenomenon</b> affects the properties of some states of the Markov chain.
|
|
|
693 |
|
694 |
+
When the deviation is noticeably larger than $3\sigma$, $g(x)$ <b>may be zero</b> at $x=c$ and in its neighborhood; as a result, $q(x=c|z=c)$ <b>may equal 0</b>, and $q(x|z=c)$ <b>may also equal 0</b> near $x=c$. So state $c$ is not guaranteed to access its nearby states, unlike in the $\alpha=1$ case. See the example in Figure D.5: the $\textcolor{green}{\text{green curve}}$ is $g(x)$ for $z=6.0$, and the $\textcolor{orange}{\text{orange curve}}$ is $q(x|z=6.0)$; because the center of $g(x)$ deviates too far from $x=6.0$, we get $q(x=6.0|z=6.0)=0$.
|
695 |
+
|
696 |
+
When the deviation is noticeably smaller than $3\sigma$, $g(x)$ <b>is non-zero</b> at $x=c$ and in its neighborhood; as a result, $q(x=c|z=c)$ <b>is non-zero</b>, and $q(x|z=c)$ <b>is also non-zero</b> near $x=c$. So state $c$ can access its nearby states and is aperiodic.
|
697 |
+
|
698 |
+
What condition on $c$ keeps the deviation of the center of $g(x)$ below $3\sigma$?
|
699 |
+
\begin{align}
|
700 |
+
\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})\ <\ 3\frac{\sqrt{\beta}}{\sqrt{\alpha}} \qquad \Rightarrow \qquad \lVert c\rVert \ <\ 3\frac{\sqrt{\beta}}{1-\sqrt{\alpha}} \tag{D.3} \newline
|
701 |
+
\end{align}
|
702 |
+
|
703 |
+
Hence, there exists an upper bound: as long as $\lVert c\rVert$ stays below it, the deviation is guaranteed to be less than $3\sigma$.
|
704 |
+
|
705 |
+
When $\beta=1-\alpha$, the expression becomes
|
706 |
\begin{align}
|
707 |
+
\lVert c\rVert \ <\ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} \tag{D.4} \newline
|
708 |
\end{align}
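For a concrete sense of scale (the value of $\alpha$ here is only an illustration): with $\alpha=0.95$, the bound is $3\frac{\sqrt{0.05}}{1-\sqrt{0.95}} \approx \frac{0.671}{0.0253} \approx 26.5$, so any support point with $\lVert c\rVert < 26.5$ keeps the deviation below $3\sigma$.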
|
709 |
+
|
710 |
+
$3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ is strictly monotonically increasing in $\alpha$; equivalently, the bound shrinks as the noise level $1-\alpha$ grows.
|
711 |
+
|
712 |
+
When $\alpha \in (0, 1)$,
|
713 |
\begin{align}
|
714 |
+
3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} > 3 \tag{D.5} \newline
|
|
|
|
|
|
|
|
|
|
|
715 |
\end{align}
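A short justification of D.5 (added here for completeness): since $1-\alpha = (1-\sqrt{\alpha})(1+\sqrt{\alpha})$,

\begin{align}
3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} = 3\frac{\sqrt{(1-\sqrt{\alpha})(1+\sqrt{\alpha})}}{1-\sqrt{\alpha}} = 3\sqrt{\frac{1+\sqrt{\alpha}}{1-\sqrt{\alpha}}} \ >\ 3 \qquad \text{for } \alpha \in (0,1) \notag
\end{align}

because $1+\sqrt{\alpha} > 1-\sqrt{\alpha}$ whenever $\sqrt{\alpha}>0$; the same form also shows that the bound grows as $\alpha$ increases.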
|
|
|
716 |
|
717 |
+
Based on the analysis above, the following conclusions can be drawn:
|
718 |
+
|
719 |
+
<ol style="list-style-type:decimal">
|
720 |
+
<li>
|
721 |
+
<b>If the support of $q(x)$ contains only one connected component, and every point in the support lies within a distance of $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ from the origin, then the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.</b>
|
722 |
+
</li>
|
723 |
+
|
724 |
+
<li>
|
725 |
+
If the support of $q(x)$ contains multiple connected components, then because of the center deviation effect of $g(x)$, determining exactly whether two components can access each other becomes more complicated and is not analyzed in detail here. A conservative conclusion is given instead: <b>if every point in the support lies within a distance of $1$ from the origin, and every gap between connected components is smaller than $2\sigma$, then the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.</b>
|
726 |
+
</li>
|
727 |
+
</ol>
|
728 |
+
|
729 |
+
<center> <img src="file/D4.png" width="1280" style="margin-top:12px"/> </center>
|
730 |
+
<center> Figure D.4: Center Deviation of the GaussFun </center>
|
731 |
+
</br>
|
732 |
+
<center> <img src="file/D5.png" width="568" style="margin-top:12px"/> </center>
|
733 |
+
<center> Figure D.5: Deviation is More Than $3\sigma$ </center>
|
734 |
+
|
735 |
+
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_stationary_zh")
|
736 |
+
|
737 |
return
|
738 |
|
739 |
|
|
|
780 |
|
781 |
<a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
|
782 |
|
783 |
+
<a id="mc_basic_t3" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [19] Markov Chain:Basic Theory - Theorem 3 </a>
|
784 |
+
|
785 |
+
<a id="mc_mt_lambda" href="https://pages.uoregon.edu/dlevin/MARKOV/markovmixing.pdf"> [20] Markov Chains and Mixing Times, second edition - 12.2 The Relaxation Time </a>
|
786 |
+
|
787 |
+
<a id="non_neg_lambda" href="https://link.springer.com/book/10.1007/0-387-32792-4"> [21] Non-negative Matrices and Markov Chains - Theorem 2.10 </a>
|
788 |
+
|
789 |
+
<a id="prml_mcmc" href="https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf"> [22] Pattern Recognition and Machine Learning - 11.2. Markov Chain Monte Carlo </a>
|
790 |
+
|
791 |
+
<a id="elem" href="https://cs-114.org/wp-content/uploads/2015/01/Elements_of_Information_Theory_Elements.pdf"> [23] Elements_of_Information_Theory_Elements - 2.9 The Second Law of Thermodynamics </a>
|
792 |
+
|
793 |
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_zh")
|
794 |
|
795 |
return
|
|
|
836 |
|
837 |
md_cond_kl_zh()
|
838 |
|
839 |
+
md_approx_gauss_zh()
|
840 |
+
|
841 |
+
md_non_expanding_zh()
|
842 |
+
|
843 |
+
md_stationary_zh()
|
844 |
|
845 |
md_reference_zh()
|
846 |
|
data.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
fig2.png
DELETED
Binary file (107 kB)
|
|
fig3.png
DELETED
Binary file (123 kB)
|
|
fig4.png
DELETED
Binary file (122 kB)
|
|