import gradio as gr
from Misc import g_css, js_head, g_latex_del
js_head += """ <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-0fdwu/T/EQMsQlrHCCHoH10pkPLlKA1jL5dFyUOvB3lfeT2540/2g6YgSi2BL14p" crossorigin="anonymous"></script> """
def md_introduction_en():
global g_latex_del
with gr.Accordion(label="0. Introduction", elem_classes="first_md", elem_id="introduction"):
gr.Markdown(
r"""
The Diffusion Probability Model[\[1\]](#dpm)[\[2\]](#ddpm) is currently the main method used in image and video generation, but its theory is rather abstruse, and many engineers find it hard to understand well. This article provides a very easy-to-understand way to help readers grasp the principles of the Diffusion Model. Specifically, it illustrates the Diffusion Model with examples of one-dimensional random variables in an interactive way, explaining several interesting properties of the Diffusion Model in an intuitive manner.
The diffusion model is a probabilistic model. Probabilistic models mainly offer two functions: calculating the probability that a given sample appears, and generating new samples. The diffusion model focuses on the latter, producing new samples and thus accomplishing the task of **generation**.
The diffusion model differs from general probability models (such as the GMM), which model the probability distribution of the random variable directly. The diffusion model adopts an indirect approach: it uses a **random variable transform** (shown in Figure 1a) to gradually convert the data distribution (the probability distribution to be modeled) into a **standard normal distribution**, and meanwhile models the posterior probability distribution corresponding to each transform (Figure 1b-c). Once the final standard normal distribution and the posterior probability distributions are obtained, samples of each random variable $Z_T \ldots Z_2,Z_1,X$ can be generated in reverse order through <b>Ancestral Sampling</b>, and the initial data distribution $q(x)$ can be determined by employing Bayes' theorem and the law of total probability.
One might wonder: the indirect method requires modeling and learning T posterior probability distributions, while the direct method only needs to model one probability distribution. Why choose the indirect approach? Here is the reasoning: the initial data distribution may be quite complex and hard to represent directly with a probability model, whereas each posterior probability distribution in the indirect method is significantly simpler and can be approximated by a simple probability model. As we will see later, under certain conditions the posterior probability distributions closely resemble Gaussian distributions, so a simple conditional Gaussian model can be used to model them.
<center> <img id="en_fig1" src="file/fig1.png" width="820" style="margin-top:12px"/> </center>
<center> Figure 1: Diffusion probability model schematic </center>
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_introduction_en")
return
def md_transform_en():
global g_latex_del
with gr.Accordion(label="1. How To Transform", elem_classes="first_md", elem_id="transform"):
gr.Markdown(
r"""
To transform the initial data distribution into a simple standard normal distribution, the diffusion model uses the following transformation method:
\begin{align}
Z = \sqrt{\alpha} X + \sqrt{1-\alpha}\epsilon \qquad where \quad \alpha < 1, \quad \epsilon \sim \mathcal{N}(0, I) \tag{1.1}
\end{align}
where $X\sim q(x)$ is any random variable, and $Z\sim q(z)$ is the transformed random variable.
This transform can be divided into two sub-transforms.
The first sub-transform performs a linear transform ($\sqrt{\alpha}X$) on the random variable $X$. According to the conclusion of the literature[\[3\]](#linear_transform), the linear transform makes the probability distribution of $X$ **narrower and taller**: the width shrinks by a factor of $\sqrt{\alpha}$ and the height grows by $1/\sqrt{\alpha}$, so the smaller $\alpha$ is, the stronger the effect.
This can be seen specifically in <a href="#demo_1">Demo 1</a>, where the first figure depicts a randomly generated one-dimensional probability distribution and the second figure shows the probability distribution after the linear transform. It can be observed that the curve in the second figure has become **narrower and taller** compared to the first. Readers can experiment with different $\alpha$ values to gain a more intuitive understanding.
The second sub-transform is **adding independent random noise** ($\sqrt{1-\alpha}\epsilon$). According to the conclusion of the literature[\[4\]](#sum_conv), adding two independent random variables is equivalent to convolving their probability distributions. Since the probability distribution of the random noise is Gaussian, this is equivalent to performing a **Gaussian Blur** operation. After blurring, the original probability distribution becomes smoother and more similar to the standard normal distribution. The degree of blurring is directly proportional to the noise level ($\sqrt{1-\alpha}$).
For specifics, see <a href="#demo_1">Demo 1</a>, where the first figure is a randomly generated one-dimensional probability distribution and the third figure is the result after adding noise. It can be seen that the transformed probability distribution curve is smoother and has fewer corners. Readers can test different $\alpha$ values to see how the noise level affects the shape of the probability distribution. The last figure is the result after applying both sub-transforms.
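The two sub-transforms are easy to reproduce numerically. Below is a minimal sketch (assuming NumPy and a toy bimodal $q(x)$ chosen purely for illustration, not one of the demo distributions) that draws samples of $X$, applies equation 1.1, and prints summary statistics before and after each sub-transform.

```python
import numpy as np

# A minimal sketch of the transform Z = sqrt(alpha)*X + sqrt(1-alpha)*eps (equation 1.1).
# The bimodal q(x) below is an arbitrary toy distribution used only for illustration.
rng = np.random.default_rng(0)
alpha = 0.7
n = 100_000

# sample X from a toy mixture of two Gaussians
comp = rng.random(n) < 0.4
x = np.where(comp, rng.normal(-0.6, 0.15, n), rng.normal(0.5, 0.2, n))

# sub-transform 1: linear scaling, which makes the density narrower and taller
x_scaled = np.sqrt(alpha) * x
# sub-transform 2: add independent Gaussian noise, i.e. blur (convolve) the density
z = x_scaled + np.sqrt(1.0 - alpha) * rng.normal(size=n)

for name, s in [('X', x), ('sqrt(a)*X', x_scaled), ('Z', z)]:
    print(f'{name:10s} mean={s.mean():+.3f}  std={s.std():.3f}')
```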
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_transform_en")
return
def md_likelihood_en():
global g_latex_del
with gr.Accordion(label="2. Likelihood of The Transform", elem_classes="first_md", elem_id="likelihood"):
gr.Markdown(
r"""
From the transform (equation 1.1), it can be seen that the forward conditional probability $q(z|x)$ is a Gaussian distribution that depends only on the value of $\alpha$, regardless of the form of $q(x)$.
\begin{align}
q(z|x) &= \mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha) \tag{2.1}
\end{align}
This can be understood through the concrete example in <a href="#demo_2">Demo 2</a>. The third figure depicts the shape of $q(z|x)$: a uniform slanting line can be observed, which implies that the mean of $q(z|x)$ is linearly related to $x$ and the variance is fixed. The magnitude of $\alpha$ determines the width and incline of the slanting line.
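As a quick numerical check (a sketch with assumed toy values, not part of the demos), one can fix $x$, apply equation 1.1 repeatedly, and verify that the samples of $Z$ match $\mathcal{N}(\sqrt{\alpha}x,\ 1-\alpha)$:

```python
import numpy as np

# Empirically verify q(z|x) = N(sqrt(alpha)*x, 1-alpha) for one fixed x (equation 2.1).
rng = np.random.default_rng(1)
alpha, x_fixed = 0.7, 0.8

z = np.sqrt(alpha) * x_fixed + np.sqrt(1 - alpha) * rng.normal(size=200_000)
print('empirical mean/variance :', z.mean().round(4), z.var().round(4))
print('predicted mean/variance :', round(np.sqrt(alpha) * x_fixed, 4), round(1 - alpha, 4))
```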
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_likelihood_en")
return
def md_posterior_en():
global g_latex_del
with gr.Accordion(label="3. Posterior of The Transform", elem_classes="first_md", elem_id="posterior"):
gr.Markdown(
r"""
The posterior probability distribution does not have a closed form, but its shape can be inferred approximately using a few techniques.
According to Bayes formula, we have
\begin{align}
q(x|z) = \frac{q(z|x)q(x)}{q(z)} \tag{3.1}
\end{align}
When $z$ is fixed, $q(z)$ is a constant, so $q(x|z)$ is a probability density function with respect to $x$, and its shape depends only on $q(z|x)q(x)$.
\begin{align}
q(x|z) \propto q(z|x)q(x) \qquad where\ z\ is\ fixed \tag{3.2}
\end{align}
In fact, $q(z)=\int q(z|x)q(x)dx$, which means that $q(z)$ is the integral over $x$ of the function $q(z|x)q(x)$. Therefore, dividing $q(z|x)q(x)$ by $q(z)$ is equivalent to normalizing $q(z|x)q(x)$.
\begin{align}
q(x|z) = \operatorname{Normalize}\big(q(z|x)q(x)\big) \tag{3.3}
\end{align}
From Equation 2.1, we can see that $q(z|x)$ is a Gaussian distribution, so we have
\begin{align}
q(x|z) &\propto \frac{1}{\sqrt{2\pi(1-\alpha)}}\exp{\frac{-(z-\sqrt{\alpha}x)^2}{2(1-\alpha)}}\ q(x)& \qquad &\text{where z is fixed} \notag \newline
&= \frac{1}{\sqrt{\alpha}}\frac{1}{\sqrt{2\pi\frac{1-\alpha}{\alpha}}}\exp{\frac{-(\frac{z}{\sqrt{\alpha}}-x)^2}{2\frac{1-\alpha}{\alpha}}}\ q(x)& \notag \newline
&= \frac{1}{\sqrt{\alpha}} \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{3.4}
\end{align}
It can be observed that the <b>GaussFun</b> part is a Gaussian function of $x$, with a mean of $\frac{z}{\sqrt{\alpha}}$ and a standard deviation of $\sqrt{\frac{1-\alpha}{\alpha}}$, so the shape of $q(x|z)$ is determined by **the product of GaussFun and q(x)**.
According to the characteristics of <em>multiplication</em>, the following characteristics of the shape of $q(x|z)$ can be summarized.
<ul>
<li>The support set of $q(x|z)$ should be contained within the support set of GaussFun. The support set of GaussFun is a hypersphere, centered at the mean $\mu$ with a radius of approximately 3 times the standard deviation $\sigma$. </li>
<li>When the variance of the Gaussian function is small (small noise), or when $q(x)$ changes linearly, the shape of $q(x|z)$ will approximate to the Gaussian function, and have a simpler function form, which is convenient for modeling and learning.</li>
<li>When the variance of the Gaussian function is large (large noise), or when $q(x)$ changes drastically, the shape of $q(x|z)$ will be more complex, and greatly differ from a Gaussian function, which makes it difficult to model and learn.</li>
</ul>
<a href="#approx_gauss">Appendix B</a> provides a more rigorous analysis. When $\sigma$ satisfies certain conditions, $q(x|z)$ approximates to Gaussiani distribution.
The specifics can be seen in <a href="#demo_2">Demo 2</a>. The fourth figure presents the shape of the posterior $q(x|z)$, which is irregular and resembles a curved and uneven line. As $\alpha$ increases (noise decreases), the curve tends to become uniform and straight. Readers can adjust $\alpha$ and observe the relationship between the shape of the posterior and the level of noise. In the last figure, the $\textcolor{blue}{\text{blue dash line}}$ represents $q(x)$, the $\textcolor{green}{\text{green dash line}}$ represents <b>GaussFun</b> in equation 3.4, and the $\textcolor{orange}{\text{orange curve}}$ represents the result of multiplying the two functions and normalizing, which is the posterior probability $q(x|z=fixed)$ under a fixed $z$ condition. Readers can adjust the value of $z$ to observe how the fluctuation of $q(x)$ affects the shape of the posterior probability $q(x|z)$.
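Equation 3.4 translates directly into a few lines of code: evaluate GaussFun for a fixed $z$, multiply it with $q(x)$, and normalize the product. The sketch below assumes a discretized toy $q(x)$ on a grid (not the exact distribution used in Demo 2).

```python
import numpy as np

# Posterior q(x|z) proportional to GaussFun(x; mu, sigma) * q(x), discretized on a grid (equation 3.4).
grid = np.linspace(-2, 2, 1001)
dx = grid[1] - grid[0]

# toy data density q(x): mixture of two Gaussians, normalized on the grid
qx = np.exp(-0.5 * ((grid + 0.6) / 0.15) ** 2) + 1.5 * np.exp(-0.5 * ((grid - 0.5) / 0.2) ** 2)
qx /= qx.sum() * dx

alpha, z = 0.7, 0.3
mu = z / np.sqrt(alpha)                      # mean of GaussFun
sigma = np.sqrt((1 - alpha) / alpha)         # standard deviation of GaussFun
gauss_fun = np.exp(-0.5 * ((grid - mu) / sigma) ** 2)

posterior = gauss_fun * qx
posterior /= posterior.sum() * dx            # normalize, giving q(x|z=0.3)
print('q(x|z=0.3) integrates to', round(posterior.sum() * dx, 6))
```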
The posterior $q(x|z)$ under two special states are worth considering.
<ul>
<li>As $\alpha \to 0$, the variance of <b>GaussFun</b> tends to <b>$\infty$</b>, so GaussFun almost becomes a uniform distribution over a very large support set. Multiplying $q(x)$ by such a uniform distribution still yields $q(x)$; therefore, the $q(x|z)$ for different $z$ become almost identical to each other, and almost the same as $q(x)$. Readers can set $\alpha$ to 0.001 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
<li>As $\alpha \to 1$, the variance of <b>GaussFun</b> tends to <b>$0$</b>, and the $q(x|z)$ for different $z$ values contract into a series of <em>Dirac delta functions</em> located at the corresponding $z$. However, there are exceptions: in regions where $q(x)$ is zero, the corresponding $q(x|z)$ is no longer a Dirac <em>delta function</em>, but a zero function. Readers can set $\alpha$ to 0.999 in <a href="#demo_2">Demo 2</a> to observe the specific results.</li>
</ul>
There is one point to note: when $\alpha \to 0$, the mean of the GaussFun corresponding to a larger $z$ value ($\mu = \frac{z}{\sqrt{\alpha}}$) also increases sharply, which means that GaussFun is located farther from the support of $q(x)$. In this case, the "uniformity" of the part of GaussFun overlapping the support of $q(x)$ decreases slightly, thereby slightly reducing the similarity between $q(x|z)$ and $q(x)$. However, this effect diminishes further as $\alpha$ decreases. Readers can observe this effect in <a href="#demo_2">Demo 2</a>: set $\alpha$ to 0.001, and you will see a slight difference between $q(x|z=-2)$ and $q(x)$, but no noticeable difference between $q(x|z=0)$ and $q(x)$.
Regarding the "uniformity" of the Gaussian function, there are two characteristics: the larger the standard deviation, the greater the uniformity; the farther away from the mean, the smaller the uniformity.
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_en")
return
def md_forward_process_en():
global g_latex_del
title = "4. Transform Data Distribution To Normal Distribution"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="forward_process"):
gr.Markdown(
r"""
For an arbitrary data distribution $q(x)$, the transform (equation 1.1) in Section 1 can be applied repeatedly (equations 4.1~4.4). As the number of transforms increases, the output probability distribution becomes increasingly close to the standard normal distribution. For more complex data distributions, more iterations or larger noise are needed.
Specific details can be observed in <a href="#demo_3_1">Demo 3.1</a>. The first figure illustrates a randomly generated one-dimensional probability distribution. After seven transforms, this distribution looks very similar to the standard normal distribution. The degree of similarity increases with the number of iterations and the level of the noise. Given the same degree of similarity, fewer transforms are needed if the noise added at each step is larger (smaller $\alpha$ value). Readers can try different $\alpha$ values and numbers of transforms to see how similar the final probability distribution is.
The complexity of the initial probability distribution tends to be high, but as the number of transforms increases, the complexity of the probability distribution $q(z_t)$ decreases. As concluded in <a href="#posterior">Section 3</a>, a more complex probability distribution corresponds to a more complex posterior probability distribution. Therefore, in order to keep the posterior probability distribution closer to a conditional Gaussian (easier to learn), a larger value of $\alpha$ (smaller noise) should be used in the initial phase, and a smaller value of $\alpha$ (larger noise) can be used in the later phase to accelerate the transition to the standard normal distribution.
In the example of <a href="#demo_3_1">Demo 3.1</a>, it can be seen that as the number of transforms increases, the corners of $q(z_t)$ become fewer and fewer. Meanwhile, the slanting lines in the plot of the posterior probability distribution $q(z_{t-1}|z_t)$ become increasingly straight and uniform, resembling more and more the conditional Gaussian distribution.
\begin{align}
Z_1 &= \sqrt{\alpha_1} X + \sqrt{1-\alpha_1}\epsilon_1 \tag{4.1} \newline
Z_2 &= \sqrt{\alpha_2} Z_1 + \sqrt{1-\alpha_2}\epsilon_2 \tag{4.2} \newline
&\dots \notag \newline
Z_{t} &= \sqrt{\alpha_t}Z_{t-1} + \sqrt{1-\alpha_t}\epsilon_{t} \tag{4.3} \newline
&\dots \notag \newline
Z_{T} &= \sqrt{\alpha_T}Z_{T-1} + \sqrt{1-\alpha_T}\epsilon_{T} \tag{4.4} \newline
&where \quad \alpha_t < 1 \qquad t\in {1,2,\dots,T} \notag
\end{align}
By substituting Equation 4.1 into Equation 4.2, and utilizing the properties of Gaussian distribution, we can derive the form of $q(z_2|x)$
\begin{align}
z_2 &= \sqrt{\alpha_2}(\sqrt{\alpha_1}x + \sqrt{1-\alpha_1}\epsilon_1) + \sqrt{1-\alpha_2}\epsilon_2 \tag{4.5} \newline
&= \sqrt{\alpha_2\alpha_1}x + \sqrt{\alpha_2-\alpha_2\alpha_1}\epsilon_1 + \sqrt{1-\alpha_2}\epsilon_2 \tag{4.6} \newline
&= \mathcal{N}(\sqrt{\alpha_1\alpha_2}x,\ 1-\alpha_1\alpha_2) \tag{4.7}
\end{align}
In the same way, it can be deduced recursively that
\begin{align}
q(z_t|x) &= \mathcal{N}(\sqrt{\alpha_1\alpha_2\cdots\alpha_t}x,\ 1-\alpha_1\alpha_2\cdots\alpha_t) = \mathcal{N}(\sqrt{\bar{\alpha_t}}x,\ 1-\bar{\alpha_t}) \qquad where\ \bar{\alpha_t} \triangleq \prod_{j=1}^t\alpha_j \tag{4.8}
\end{align}
Comparing the forms of Equation 4.8 and Equation 2.1, it can be found that their forms are completely consistent.
If we only focus on the relationship between the initial and final random variables, then a sequence of $t$ small transforms can be replaced by one large transform, whose $\alpha$ is the product of the $\alpha$ values of the small transforms, because the joint probability distributions corresponding to both are the same.
Readers can perform an experiment in <a href="#demo_3_1">Demo 3.1</a> using the same input distribution $q(x)$ but with two different transform methods: 1) using three transformations, each with $\alpha$ equal to 0.95; 2) using a single transform with $\alpha$ set to 0.857375. Perform the transformations separately and then compare the two resulting distributions. You will see that the two distributions are identical.
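This equivalence can also be checked numerically: composing three transforms with $\alpha=0.95$ yields samples whose joint statistics with $X$ match those of a single transform with $\bar{\alpha}=0.95^3=0.857375$. A minimal sketch (toy Gaussian $q(x)$ assumed for brevity):

```python
import numpy as np

# Three transforms with alpha=0.95 versus one transform with alpha_bar=0.95**3 (equation 4.8).
rng = np.random.default_rng(2)
n = 300_000
x = rng.normal(0.3, 0.5, n)                  # any toy q(x) works here

z = x.copy()
for _ in range(3):                           # three successive small transforms
    z = np.sqrt(0.95) * z + np.sqrt(1 - 0.95) * rng.normal(size=n)

alpha_bar = 0.95 ** 3                        # single equivalent transform
z_one = np.sqrt(alpha_bar) * x + np.sqrt(1 - alpha_bar) * rng.normal(size=n)

for name, s in [('3 steps', z), ('1 step ', z_one)]:
    print(name, ': var=%.4f cov(x,z)=%.4f' % (s.var(), np.cov(x, s)[0, 1]))
```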
In the DDPM[\[2\]](#ddpm) paper, the authors used 1000 steps (T=1000) to transform the data distribution $q(x)$ to $q(z_T)$. The probability distribution of $q(z_T|x)$ is as follows:
\begin{align}
q(z_T|x) &= \mathcal{N}(0.00635\ x,\ 0.99998) \tag{4.9}
\end{align}
If only considering the joint distribution $q(x, z_T)$, a single transformation can also be used as a substitute, which is as follows:
\begin{align}
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{4.10}
\end{align}
It can be seen that both ways of transforming lead to the same $q(z_T|x)$, and thus the same $q(x,z_T)$.
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_forward_process_en")
return
def md_backward_process_en():
global g_latex_del
title = "5. Restore Data Distribution From Normal Distribution"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="backward_process"):
gr.Markdown(
r"""
If the final probability distribution $q(z_T)$ and the posterior probabilities of each transform $q(x|z),q(z_{t-1}|z_t)$ are known, the data distribution $q(x)$ can be recovered through the Bayes Theorem and the Law of Total Probability, as shown in equations 5.1~5.4. When the final probability distribution $q(z_T)$ is very similar to the standard normal distribution, the standard normal distribution can be used as a substitute.
Specifics can be seen in <a href="#demo_3_2">Demo 3.2</a>. In the example, $\mathcal{N}(0,1)$ is used as a substitute for $q(z_T)$, and the magnitude of the resulting error is given through the JS Divergence. The restored probability distributions $q(z_t)$ and $q(x)$ are drawn as the $\textcolor{green}{\text{green curve}}$, and the original probability distributions as the $\textcolor{blue}{\text{blue curve}}$. It can be observed that the data distribution $q(x)$ can be restored well, and the error (JS Divergence) is smaller than the error caused by replacing $q(z_T)$ with the standard normal distribution.
\begin{align}
q(z_{T-1}) &= \int q(z_{T-1},z_T)dz_T = \int q(z_{T-1}|z_T)q(z_T)dz_T \tag{5.1} \newline
& \dots \notag \newline
q(z_{t-1}) &= \int q(z_{t-1},z_t)dz_t = \int q(z_{t-1}|z_t)q(z_t)dz_t \tag{5.2} \newline
& \dots \notag \newline
q(z_1) &= \int q(z_1,z_2) dz_2 = \int q(z_1|z_2)q(z_2)dz_2 \tag{5.3} \newline
q(x) &= \int q(x,z_1) dz_1 = \int q(x|z_1)q(z_1)dz_1 \tag{5.4} \newline
\end{align}
In this article, the aforementioned transform is referred to as the <b>Posterior Transform</b>. For example, in equation 5.4, the input of the transform is the probability distribution function $q(z_1)$, and the output is the probability distribution function $q(x)$. The entire transform is determined by the posterior $q(x|z_1)$. This transform can also be considered as a linear weighted sum of a set of basis functions, where the basis functions are $q(x|z_1)$ for different $z_1$, and the weight of each basis function is $q(z_1)$. Some interesting properties of this transform will be introduced in <a href="#posterior_transform">Section 7</a>.
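In discretized form, each of equations 5.1 to 5.4 is a matrix-vector product: the basis functions $q(x|z_1)$ become the columns of a matrix and the weights $q(z_1)$ become a vector. A minimal sketch of one such step (toy grid and toy $q(x)$, assumed only for illustration):

```python
import numpy as np

# One posterior-transform step q(x) = integral of q(x|z1) q(z1) dz1, discretized (equation 5.4).
grid = np.linspace(-2, 2, 401)
dx = grid[1] - grid[0]
alpha = 0.7

# toy data density q(x), normalized on the grid
qx = np.exp(-0.5 * ((grid + 0.6) / 0.15) ** 2) + 1.5 * np.exp(-0.5 * ((grid - 0.5) / 0.2) ** 2)
qx /= qx.sum() * dx

# forward conditional q(z|x): rows index z, columns index x (equation 2.1)
qz_x = np.exp(-0.5 * (grid[:, None] - np.sqrt(alpha) * grid[None, :]) ** 2 / (1 - alpha))
qz_x /= qz_x.sum(axis=0, keepdims=True) * dx

# marginal q(z1) and posterior matrix Q_{x|z}: rows index x, columns index z (equation 3.1)
qz = qz_x @ qx * dx
Q_x_z = (qz_x * qx[None, :]).T / qz[None, :]

# posterior transform: feeding q(z1) through Q_{x|z} recovers q(x)
qx_restored = Q_x_z @ qz * dx
print('max abs error of the restored q(x):', np.abs(qx_restored - qx).max())
```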
In <a href="#posterior">Section 3</a>, we have considered two special posterior probability distributions. Next, we analyze their corresponding <em>posterior transforms</em>.
<ul>
<li> When $\alpha \to 0$, the $q(x|z)$ for different $z$ are almost the same as $q(x)$. In other words, the basis functions of the weighted sum are almost identical. In this state, no matter how the input changes, the output of the transform is always $q(x)$.</li>
<li> When $\alpha \to 1$, the $q(x|z)$ for different $z$ values become a series of Dirac delta functions and zero functions. In this state, as long as the <em>support</em> of the input distribution is contained in the <em>support</em> of $q(x)$, the output of the transform is identical to the input.</li>
</ul>
In <a href="#forward_process">Section 4</a>, it is mentioned that the 1000 transformations used in the DDPM[\[2\]](#ddpm) can be represented using a single transformation
\begin{align}
Z_T = \sqrt{0.0000403}\ X + \sqrt{1-0.0000403}\ \epsilon = 0.00635\ X + 0.99998\ \epsilon \tag{5.5}
\end{align}
Since $\alpha=0.0000403$ is very small, the corresponding standard deviation of GaussFun (Equation 3.4) reaches 157.52. If we constrain the support of $q(x)$ within the unit hypersphere ($\lVert x \rVert_2 < 1$), then for $z_T$ in the range $[-2, +2]$, each corresponding $q(x|z_T)$ is very similar to $q(x)$. In this state, for the posterior transform of $q(x|z_T)$, regardless of the shape of the input distribution, as long as the support set is within the range $[-2,+2]$, the output distribution will be $q(x)$.
<b>Furthermore, we can conclude that in the DPM model, if the support of $q(x)$ is finite and the signal-to-noise ratio of the final variable $Z_T$ is sufficiently high, the process of restoring $q(x)$ can use any distribution; it doesn't necessarily have to use the standard normal distribution.</b>
Readers can conduct a similar experiment themselves. In <a href="#demo_3_1">Demo 3.1</a>, set <em>start_alpha</em> to 0.25, <em>end_alpha</em> to 0.25, and <em>step</em> to 7. At this point, $Z_7=\sqrt{0.000061}X + \sqrt{1-0.000061} \epsilon$, which is roughly equivalent to DDPM's $Z_T$. Click on <b>apply</b> to perform the forward transform (plotted using $\textcolor{blue}{\text{blue curves}}$), which prepares for the subsequent restoring process. In <a href="#demo_3_2">Demo 3.2</a>, set the <em>noise_ratio</em> to 1, introducing 100% noise into the <em>tail distribution</em> $q(z_7)$. Changing the value of <em>noise_random_seed</em> will change the distribution of the noise. Deselect <em>backward_pdf</em> to reduce screen clutter. Click on <b>apply</b> to restore $q(x)$ through the posterior transform. You will see that, no matter what shape the input $q(z_7)$ takes, the restored $q(x)$ is always exactly the same as the original $q(x)$, and the JS Divergence is zero. The restoration process is plotted using a $\textcolor{red}{\text{red curve}}$.
There is another point worth noting. In deep learning tasks, it is common to scale each dimension of the input within the range [-1, 1], which means within a unit hypercube. The maximum Euclidean distance between any two points in a unit hypercube increases with the dimensionality. For example, in one dimension, the maximum distance is $2$, two dimensions is $2\sqrt{2}$, three dimensions is $2\sqrt{3}$, and n dimensions is $2\sqrt{n}$. Therefore, for data with higher dimensions, the variable $Z_T$ needs a higher signal-to-noise ratio to allow the starting distribution of the recovery process to accept any distribution.
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_backward_process_en")
return
def md_fit_posterior_en():
global g_latex_del
title = "6. Fitting Posterior With Conditional Gaussian Model"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="fit_posterior"):
gr.Markdown(
r"""
From the front part of <a href="#posterior">Section 3</a>, it is known that the posterior probability distributions are unknown and related to $q(x)$. Therefore, in order to recover the data distribution or sample from it, it is necessary to learn and estimate each posterior probability distribution.
From the latter part of <a href="#posterior">Section 3</a>, it can be understood that when certain conditions are met, each posterior probability distribution $q(x|z), q(z_{t-1}|z_t)$ approximates the Gaussian probability distribution. Therefore, by constructing a set of conditional Gaussian probability models $p(x|z), p(z_{t-1}|z_t)$, we can learn to fit the corresponding $q(x|z), q(z_{t-1}|z_t)$.
Due to the limitations of the model's representational and learning capabilities, there will be certain errors in the fitting process, which further impact the accuracy of the restored $q(x)$. The size of the fitting error is related to the complexity of the posterior probability distribution. As can be seen from <a href="#posterior">Section 3</a>, when $q(x)$ is more complex or the added noise is larger, the posterior probability distribution is more complex and differs greatly from a Gaussian distribution, thus leading to fitting errors and further affecting the restoration of $q(x)$.
Refer to <a href="#demo_3_3">Demo 3.3</a> for the specifics. Readers can test different $q(x)$ and $\alpha$ values, and observe the fitting quality of the posterior probability distribution $q(z_{t-1}|z_t)$ as well as the accuracy of the restored $q(x)$. The restored probability distribution is plotted in $\textcolor{orange}{\text{orange}}$, and the error is also measured by the JS divergence.
Regarding the objective function for fitting, as with other probability models, the cross-entropy loss can be optimized to make $p(z_{t-1}|z_t)$ approach $q(z_{t-1}|z_t)$. Since $q(z_{t-1}|z_t)$ is a conditional probability distribution, all conditions need to be taken into account. This can be achieved by averaging the cross-entropy of each condition, weighted by the probability of that condition occurring. The final form of the loss function is as follows.
\begin{align}
loss &= -\int q(z_t) \overbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}^{\text{Cross Entropy}}\ dz_t \tag{6.1} \newline
&= -\iint q(z_{t-1},z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}dz_t \tag{6.2}
\end{align}
KL divergence can also be optimized as the objective function. KL divergence and cross-entropy are equivalent[\\[10\\]](#ce_kl)
<span id="en_fit_0">
loss &= \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \tag{6.3} \newline
&= \int q(z_t) \int q(z_{t-1}|z_t) \log \frac{q(z_{t-1}|z_t)}{\textcolor{blue}{p(z_{t-1}|z_t)}} dz_{t-1} dz_t \tag{6.4} \newline
&= -\int q(z_t)\ \underbrace{\int q(z_{t-1}|z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}}_{\text{Cross Entropy}}\ dz_t + \underbrace{\iint q(z_t)\ q(z_{t-1}|z_t) \log q(z_{t-1}|z_t)\ dz_{t-1}dz_t}_{\text{Is Constant}} \tag{6.5}
</span>
The integral in equation 6.2 does not have a closed form and cannot be optimized directly. Monte Carlo integration can be used to approximate it. The new objective function is as follows:
\begin{align}
loss &= -\iint q(z_{t-1},z_t) \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}dz_t \tag{6.6} \newline
&\approx -\sum_{i=0}^N \log \textcolor{blue}{p(Z_{t-1}^i|Z_t^i)} \qquad where \quad (Z_{t-1}^i,Z_t^i) \sim q(z_{t-1},z_t) \tag{6.7}
\end{align}
The aforementioned samples $(Z_{t-1}^i,Z_t^i)$ follow the joint probability distribution $q(z_{t-1},z_t)$, which can be sampled via <b>Ancestral Sampling</b>. The specific method is as follows: sample $X,Z_1,Z_2 \dots Z_{t-1},Z_t$ step by step through the forward transforms (equations 4.1~4.4), and then keep $(Z_{t-1},Z_t)$ as a sample. This sampling process is relatively slow. To speed it up, we can take advantage of the known form of the probability distribution $q(z_{t-1}|x)$ (equation 4.8): first sample $X$ from $q(x)$, then sample $Z_{t-1}$ from $q(z_{t-1}|x)$, and finally sample $Z_t$ from $q(z_t|z_{t-1})$. Thus, a sample $(Z_{t-1},Z_t)$ is obtained.
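A sketch of the fast pair-sampling procedure and the Monte Carlo objective of equation 6.7. The conditional Gaussian below, whose mean is a linear function of $z_t$, is only a hand-written stand-in for the learned model $p(z_{t-1}|z_t)$; its parameters and the toy $q(x)$ are assumptions made for the example.

```python
import numpy as np

# Monte Carlo estimate of the loss in equation 6.7 with a toy stand-in for p(z_{t-1}|z_t).
rng = np.random.default_rng(3)
alphas = np.full(5, 0.9)                     # alpha_1 ... alpha_5
t = 3                                        # fit the step z_2 -> z_3
alpha_bar_prev = np.prod(alphas[:t - 1])     # accumulated alpha up to step t-1
alpha_t = alphas[t - 1]

# fast pair sampling: X ~ q(x), Z_{t-1} ~ q(z_{t-1}|x), Z_t ~ q(z_t|z_{t-1})
n = 100_000
x = rng.normal(0.3, 0.4, n)                  # toy q(x)
z_prev = np.sqrt(alpha_bar_prev) * x + np.sqrt(1 - alpha_bar_prev) * rng.normal(size=n)
z_t = np.sqrt(alpha_t) * z_prev + np.sqrt(1 - alpha_t) * rng.normal(size=n)

def loss(a, b, s):
    # negative log-likelihood of p(z_{t-1}|z_t) = N(a*z_t + b, s^2), averaged over the samples
    mu = a * z_t + b
    return np.mean(0.5 * ((z_prev - mu) / s) ** 2 + np.log(s) + 0.5 * np.log(2 * np.pi))

# closed-form least-squares fit of the conditional mean (the best linear-Gaussian model)
a_hat = np.cov(z_prev, z_t)[0, 1] / z_t.var()
b_hat = z_prev.mean() - a_hat * z_t.mean()
s_hat = np.sqrt(np.mean((z_prev - (a_hat * z_t + b_hat)) ** 2))

print('loss with a rough guess :', loss(1.0, 0.0, 1.0))
print('loss with fitted params :', loss(a_hat, b_hat, s_hat))
```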
Some people may question that the objective function in Equation 6.3 seems different from those in the DPM[\\[1\\]](#dpm) and DDPM[\\[2\\]](#ddpm) papers. In fact, these two objective functions are equivalent, and the proof is given below.
For <b>Consistent Terms</b>, the proof is as follows:
\begin{align}
loss &= -\iint q(z_{t-1},z_t)\ \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}dz_t \tag{6.8} \newline
&= -\iint \int q(x)q(z_{t-1}, z_t|x)dx\ \log \textcolor{blue}{p(z_{t-1}|z_t)}dz_{t-1}dz_t \tag{6.9} \newline
&= \overbrace{\iint \int q(x)q(z_{t-1}, z_t|x) \log q(z_{t-1}|z_t,x)dxdz_{t-1}dz_t}^{\text{This Term Is Constant And Is Denoted As}\ \textcolor{orange}{C_1}} \tag{6.10} \newline
&\quad - \iint \int q(x)q(z_{t-1}, z_t|x) \log \textcolor{blue}{p(z_{t-1}|z_t)}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.11} \newline
&= \iint \int q(x)q(z_{t-1},z_t|x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dxdz_{t-1}dz_t - \textcolor{orange}{C_1} \tag{6.12} \newline
&= \iint q(x)q(z_t|x)\int q(z_{t-1}|z_t,x) \log \frac{q(z_{t-1}|z_t,x)}{\textcolor{blue}{p(z_{t-1}|z_t)}}dz_{t-1}\ dz_t dx - \textcolor{orange}{C_1} \tag{6.13} \newline
&= \iint \ q(x)q(z_t|x) KL[q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)}] dz_t dx - \textcolor{orange}{C_1} \tag{6.14} \newline
&\propto \iint \ q(x)q(z_t|x) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)}) dz_t dx \tag{6.15} \newline
\end{align}
In the above formula, the term $C_1$ is a fixed value that does not contain the parameters to be optimized: $q(x)$ is a fixed probability distribution, and $q(z_{t-1}|z_t,x)$ is also a fixed probability distribution whose specific form is determined by $q(x)$ and the coefficients $\alpha$.
For the <b>Reconstruction Term</b>, it can be proven in a similar way.
\begin{align}
loss &= -\int q(z_1)\overbrace{\int q(x|z_1)\log \textcolor{blue}{p(x|z_1)}dx}^{\text{Cross Entropy}}\ dz_1 \tag{6.16} \newline
&= -\iint q(z_1,x)\log \textcolor{blue}{p(x|z_1)}dxdz_1 \tag{6.17} \newline
&= -\int q(x)\int q(z_1|x)\log \textcolor{blue}{p(x|z_1)}dz_1\ dx \tag{6.18}
\end{align}
Therefore, the objective function in equation 6.1 is equivalent to the original DPM objective function.
Based on the conclusion of the Consistent Terms proof and the relationship between cross entropy and KL divergence, an interesting conclusion can be drawn:
<span id="en_fit_1">
\mathop{\min}_{\textcolor{blue}{p}} \int q(z_t) KL(q(z_{t-1}|z_t) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dz_t \iff \mathop{\min}_{\textcolor{blue}{p}} \iint \ q(z_t)q(x|z_t) KL(q(z_{t-1}|z_t,x) \Vert \textcolor{blue}{p(z_{t-1}|z_t)})dxdz_t \tag{6.19}
</span>
By comparing the expressions on the left and right, it can be observed that the objective function on the right side includes an additional variable $X$ compared to the left side. At the same time, there is an additional integral with respect to $X$, with the occurrence probability of $X$, denoted as $q(x|z_t)$, serving as the weighting coefficient for the integral.
Following a similar proof method, a more general relationship can be derived:
<span id="en_fit_2">
\mathop{\min}_{\textcolor{blue}{p}} KL(q(z) \Vert \textcolor{blue}{p(z)}) \iff \mathop{\min}_{\textcolor{blue}{p}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{6.20}
</span>
A detailed derivation of this conclusion can be found in <a href="#cond_kl">Appendix A</a>.
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_fit_posterior_en")
return
def md_posterior_transform_en():
global g_latex_del
with gr.Accordion(label="7. Posterior Transform", elem_classes="first_md", elem_id="posterior_transform"):
gr.Markdown(
r"""
</br>
<h3 style="font-size:18px"> Non-expanding Mapping and Stationary Distribution </h3>
\begin{align}
q(x) &= \int q(x,z) dz = \int q(x|z)q(z)dz \tag{7.1}
\end{align}
According to Corollary 1 and Corollary 2 in <a href="#non_expanding">Appendix C</a>, the posterior transform is a <b>non-expanding mapping</b>. This means that, for any two input probability distributions $q_{i1}(z)$ and $q_{i2}(z)$, the distance between the corresponding output distributions $q_{o1}(x)$ and $q_{o2}(x)$ is <b>always less than or equal to</b> the distance between $q_{i1}(z)$ and $q_{i2}(z)$. The distance here can be measured using the KL Divergence or the Total Variation.
\begin{align}
d(q_{o1}(x),\ q_{o2}(x)) \le d(q_{i1}(z),\ q_{i2}(z)) \tag{7.2}
\end{align}
According to the analysis in <a href="#non_expanding">Appendix C</a>, the equality above does not hold in most cases, and the posterior transform becomes a <b>shrinking mapping</b>. Furthermore, <b>the smaller $\alpha$ is (the more noise), the smaller $d(q_{o1},q_{o2})$ will be compared to $d(q_{i1},q_{i2})$</b>.
Readers can refer to <a href="#demo_4_1">Demo 4.1</a>, where the first three figure present a transform process. The first figure is an arbitrary data distribution $q(x)$, the third figure is the transformed probability distribution, and second figure is the posterior probability distribution $q(x|z)$. You can change the random seed to generate a new data distribution$q(x)$, and adjust the value of $\alpha$ to introduce different degrees of noise.
The last two figures show contraction of the transform. The fourth figure displays two randomly generated input distributions and their distance, $div_{in}$. The fifth figure displays the two output distributions after transform, with the distance denoted as $div_{out}$.
Readers can change the input random seed to toggle different inputs. It can be observed from the figures that $div_{in}$ is always smaller than $div_{out}$ for any input. Additionally, if you change the value of $\alpha$, you will see that the smaller the $\alpha$(larger noise), the smaller the ratio of $div_{out}/div_{in}$,indicating a larger rate of contraction.
According to the analysis in <a href="#stationary">Appendix C</a>: the posterior transform can be seen as a one-step jump of a Markov chain, and <b>when $q(x)$ and $\alpha$ meet certain conditions, this Markov chain will converge to a unique stationary distribution</b>. Additionally, numerous experiments have shown that <b>the stationary distribution is very similar to the data distribution $q(x)$, and the smaller $\alpha$ is, the more similar the stationary distribution is to $q(x)$</b>. Specifically, according to the conclusion in <a href="#backward_process">Section 5</a>, <b>when $\alpha \to 0$, after one step of transform, the output distribution will be $q(x)$, so the stationary distribution must be $q(x)$</b>.
Readers can refer to <a href="#demo_4_2">Demo 4.2</a>, which illustrates an example of applying posterior transform iteratively. Choose an appropriate number of iterations, and click on the button of <em>Apply</em>, and the iteration process will be draw step by step. Each subplot shows the transformed output distribution($\textcolor{green}{\text{green curve}}$) from each transform, with the reference distribution $q(x)$ expressed as a $\textcolor{blue}{\text{blue curve}}$, as well as the distance $div$ between the output distribution and $q(x)$. It can be seen that as the number of iterations increases, the output distribution becomes more and more similar to $q(x)$, and will eventually stabilize near $q(x)$. For more complicated distributions, more iterations or greater noise may be required. The maximum number of iterations can be set to tens of thousands, but it'll take longer.
For the one-dimensional discrete case, $q(x|z)$ is discretized into a matrix (denoted as $Q_{x|z}$), $q(z)$ is discretized into a vector (denoted as $\boldsymbol{q_i}$). The integration operation $\int q(x|z)q(z)dz$ is discretized into a **matrix-vector** multiplication operation, thus the posterior transform can be written as
\begin{align}
\boldsymbol{q_o} &= Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{1 iteration} \tag{7.3} \newline
\boldsymbol{q_o} &= Q_{x|z}\ Q_{x|z}\ \boldsymbol{q_i} & \quad\quad &\text{2 iterations} \tag{7.4} \newline
& \dots & \notag \newline
\boldsymbol{q_o} &= (Q_{x|z})^n\ \boldsymbol{q_i} & \quad\quad &\text{n iterations} \tag{7.5} \newline
\end{align}
In order to better understand the properties of the transform, the matrix $(Q_{x|z})^n$ is also plotted in <a href="#demo_4_2">Demo 4.2</a>. From the demo we can see that, as the iterations converge, the row vectors of the matrix $(Q_{x|z})^n$ become constant vectors, that is, all components of each row are the same, which appears as a horizontal line in the density plot.
For a one-dimensional discrete Markov chain, the convergence rate is inversely related to the absolute value of the second largest eigenvalue of the transition probability matrix ($\lvert \lambda_2 \rvert$). The smaller $\lvert \lambda_2 \rvert$ is, the faster the convergence. Numerous experiments have shown that $\alpha$ has a clear linear relationship with $\lvert \lambda_2 \rvert$; the smaller $\alpha$ is, the smaller $\lvert \lambda_2 \rvert$ is. Therefore, <b>the smaller $\alpha$ (the greater the noise), the faster the convergence rate</b>. Specifically, when $\alpha \to 0$, according to the conclusion in <a href="#posterior">Section 3</a>, the posterior probability distributions corresponding to different $z$ tend to be consistent. Additionally, according to Theorem 21 in <a href="#non_neg_lambda">[21]</a>, $\lvert \lambda_2 \rvert$ is smaller than the L1 distance between any two posterior probability distributions corresponding to different $z$, so it can be concluded that $\lvert \lambda_2 \rvert \to 0$.
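The statements above are easy to verify in the discrete setting: build $Q_{x|z}$ on a grid, inspect its second-largest eigenvalue, and iterate the transform. A sketch (toy $q(x)$ and grid assumed, reusing the same construction as the earlier sketches):

```python
import numpy as np

# Discretized posterior transform as a Markov matrix: inspect |lambda_2| and iterate (equations 7.3~7.5).
grid = np.linspace(-2, 2, 201)
dx = grid[1] - grid[0]
alpha = 0.5

qx = np.exp(-0.5 * ((grid + 0.6) / 0.15) ** 2) + 1.5 * np.exp(-0.5 * ((grid - 0.5) / 0.2) ** 2)
qx /= qx.sum() * dx
qz_x = np.exp(-0.5 * (grid[:, None] - np.sqrt(alpha) * grid[None, :]) ** 2 / (1 - alpha))
qz_x /= qz_x.sum(axis=0, keepdims=True) * dx
qz = qz_x @ qx * dx
Q = (qz_x * qx[None, :]).T / qz[None, :]          # Q_{x|z}: column i is q(x|z_i)

# |lambda_2| of the column-stochastic matrix Q*dx controls the convergence rate
lams = np.sort(np.abs(np.linalg.eigvals(Q * dx)))[::-1]
print('|lambda_1| =', lams[0].round(4), '  |lambda_2| =', lams[1].round(4))

# iterate the transform from a uniform start; it settles near (though not exactly at) q(x)
p = np.ones_like(grid) / (grid[-1] - grid[0])
for _ in range(100):
    p = Q @ p * dx
print('L1 distance to q(x) after 100 iterations:', (np.abs(p - qx).sum() * dx).round(4))
```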
</br>
<h3 style="font-size:18px"> Anti-noise Capacity In Restoring Data Distribution</h3>
From the above analysis, it can be seen that, in most cases, the <b>posterior transform</b> is a shrinking mapping, which means the following relationship
\begin{align}
d(q(x),\ q_o(x)) < d(q(z),\ q_i(z)) \tag{7.12}
\end{align}
Among them, $q(z)$ is the ideal input distribution, $q(x)$ is the ideal output distribution, $q(x) = \int q(x|z) q(z) dz$, $q_i(z)$ is any input distribution, and $q_o(x)$ is the transformed output distribution, $q_o(x) = \int q(x|z) q_i(z) dz$.
The above equation indicates that the distance between the output distribution $q_o(x)$ and the ideal output distribution $q(x)$ is always <b>less than</b> the distance between the input distribution $q_i(z)$ and the ideal input distribution $q(z)$. Hence, <b>the posterior transform naturally possesses a certain ability to resist noise</b>. This means that during the process of restoring $q(x)$ (<a href="#backward_process">Section 5</a>), even if the <em>tail distribution</em> $q(z_T)$ contains some error, the error of the output distribution $q(x)$ will be smaller than the error of the input after undergoing a series of transforms.
Refer specifically to <a href="#demo_3_2">Demo 3.2</a>, where noise can be added to the <em>tail distribution</em> $q(z_T)$ by increasing the value of the <b>noise ratio</b>. Clicking the "apply" button will gradually draw the restoring process, with the restored distribution represented by a $\textcolor{red}{\text{red curve}}$ and the error measured by the JS divergence. You will see that the error of the restored $q(x)$ is always less than the error of $q(z_T)$.
From the above discussion, it can be seen that the smaller $\alpha$ is (the larger the noise used in the transform), the greater the shrinking rate of the shrink mapping, and correspondingly, the stronger the error resistance capability. Specifically, when $\alpha \to 0$, the noise resistance capability becomes infinite, meaning that regardless of the magnitude of the error in the input, the output will always be $q(x)$.
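The shrinking relationship of equation 7.12 can also be checked directly: push two arbitrary input distributions through the same discretized posterior transform and compare the distances before and after. A self-contained sketch (same toy construction as above, with the Total Variation as the distance):

```python
import numpy as np

# Check the contraction of equation 7.12: output distance <= input distance.
rng = np.random.default_rng(4)
grid = np.linspace(-2, 2, 201)
dx = grid[1] - grid[0]
alpha = 0.5

qx = np.exp(-0.5 * ((grid + 0.6) / 0.15) ** 2) + 1.5 * np.exp(-0.5 * ((grid - 0.5) / 0.2) ** 2)
qx /= qx.sum() * dx
qz_x = np.exp(-0.5 * (grid[:, None] - np.sqrt(alpha) * grid[None, :]) ** 2 / (1 - alpha))
qz_x /= qz_x.sum(axis=0, keepdims=True) * dx
qz = qz_x @ qx * dx
Q = (qz_x * qx[None, :]).T / qz[None, :]

def random_density():
    p = rng.random(grid.size) + 1e-3
    return p / (p.sum() * dx)

def tv(p, q):
    # total-variation distance between two discretized densities
    return 0.5 * np.abs(p - q).sum() * dx

p1, p2 = random_density(), random_density()
o1, o2 = Q @ p1 * dx, Q @ p2 * dx
print('input  distance:', round(tv(p1, p2), 4))
print('output distance:', round(tv(o1, o2), 4))   # never larger, usually much smaller
```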
</br>
<h3 style="font-size:18px"> Markov Chain Monte Carlo Sampling</h3>
In DPM models, sampling is typically performed using <b>Ancestral Sampling</b>. From the analysis above, it can be inferred that when $\alpha$ is sufficiently small, the posterior transform converges to $q(x)$. Therefore, sampling can also be conducted using <b>Markov Chain Monte Carlo</b> (MCMC) methods, as depicted in Figure 7.1. In the figure, $\alpha$ represents a posterior transform with relatively large noise; larger noise makes the stationary distribution closer to the data distribution $q(x)$. However, as discussed in <a href="#posterior">Section 3</a>, a posterior transform with larger noise is less favorable for fitting. Therefore, a transform with larger noise is split into multiple transforms with smaller noise.
<center> <img src="file/7.1.png" width="1024" style="margin-top:12px"/> </center>
<center> Figure 7.1: Markov Chain Monte Carlo Sampling</center>
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_posterior_transform_en")
return
def md_deconvolution_en():
global g_latex_del
title = "8. Can the data distribution be restored by deconvolution?"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="deconvolution"):
gr.Markdown(
r"""
As mentioned in <a href="#transform">Section 1</a>, the transform of equation 1.1 can be divided into two sub-transforms, the first being a linear transform and the second being the addition of independent Gaussian noise. The linear transform is equivalent to a scaling of the probability distribution, so it has an inverse transform. Adding independent Gaussian noise is equivalent to convolving the probability distribution with a Gaussian kernel, which can in principle be undone through <b>deconvolution</b>. Therefore, theoretically, the data distribution $q(x)$ can be recovered from the final probability distribution $q(z_T)$ through an <b>inverse linear transform</b> and <b>deconvolution</b>.
However, in practice, some problems exist. Deconvolution is extremely sensitive to errors in its input: even a small amount of input noise can lead to significant changes in the output[\[11\]](#deconv_1)[\[12\]](#deconv_2). Meanwhile, in the diffusion model, the standard normal distribution is used as an approximate replacement for $q(z_T)$, so noise is introduced at the very start of the restoring process. Although this noise is relatively small, because of the sensitivity of deconvolution it is gradually amplified and affects the restoration.
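The sensitivity can be demonstrated with a tiny linear-algebra sketch: blur a toy density with a discrete Gaussian kernel and then invert the blur, once with the exact blurred density and once with a perturbation of size 1e-6 added. The grid, blur width, and noise level are assumptions made for the example.

```python
import numpy as np

# Deconvolution is extremely sensitive to input error: solve the discrete blur system
# with and without a tiny perturbation of the blurred density.
rng = np.random.default_rng(5)
grid = np.linspace(-2, 2, 201)
dx = grid[1] - grid[0]

qx = np.exp(-0.5 * ((grid + 0.6) / 0.15) ** 2) + 1.5 * np.exp(-0.5 * ((grid - 0.5) / 0.2) ** 2)
qx /= qx.sum() * dx

sigma = 0.04                                      # width of the toy blur (noise) kernel
C = np.exp(-0.5 * ((grid[:, None] - grid[None, :]) / sigma) ** 2)
C /= C.sum(axis=1, keepdims=True)                 # each row holds discrete convolution weights

blurred = C @ qx
noisy = blurred + rng.normal(0, 1e-6, grid.size)  # tiny error, e.g. from approximating q(z_T)

print('condition number of the blur   :', '%.1e' % np.linalg.cond(C))
print('recovery error, exact input    :', '%.1e' % np.abs(np.linalg.solve(C, blurred) - qx).max())
print('recovery error, perturbed input:', '%.1e' % np.abs(np.linalg.solve(C, noisy) - qx).max())
```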
In addition, the infeasibility of <b>deconvolution restoring</b> can be understood from another perspective. Since the forward transform process (equations 4.1 to 4.4) is fixed, the convolution kernel is fixed, and therefore the corresponding deconvolution transform is also fixed. Since the initial data distribution $q(x)$ is arbitrary, any probability distribution can be transformed into an approximation of $\mathcal{N}(0,I)$ through a series of fixed linear transforms and convolutions. If <b>deconvolution restoring</b> were feasible, it would mean that a fixed deconvolution could be used to restore any data distribution $q(x)$ from $\mathcal{N}(0,I)$, which is clearly <b>paradoxical</b>: the same input passed through the same transform cannot yield multiple different outputs.
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_deconvolution_en")
return
def md_cond_kl_en():
global g_latex_del
title = "Appendix A Conditional KL Divergence"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="cond_kl"):
gr.Markdown(
r"""
This section mainly introduces the relationship between <b>KL divergence</b> and <b>conditional KL divergence</b>. Before the formal introduction, we will briefly introduce the definitions of <b>Entropy</b> and <b>Conditional Entropy</b>, as well as the inequality relationship between them, in preparation for the subsequent proof.
</br>
<h3 style="font-size:20px">Entropy and Conditional Entropy</h3>
For any two random variables $Z, X$, the <b>Entropy</b> is defined as follows<a href="#entropy">[16]</a>:
\begin{align}
\mathbf{H}(Z) = \int -p(z)\log{p(z)}dz \tag{A.1}
\end{align}
The <b>Conditional Entropy</b> is defined as follows <a href="#cond_entropy">[17]</a>:
\begin{align}
\mathbf{H}(Z|X) = \int p(x) \overbrace{\int -p(z|x)\log{p(z|x)}dz}^{\text{Entropy}}\ dx \tag{A.2}
\end{align}
The following inequality relationship exists between the two:
\begin{align}
\mathbf{H}(Z|X) \le \mathbf{H}(Z) \tag{A.3}
\end{align}
That is to say, <b>the Conditional Entropy is always less than or equal to the Entropy</b>, and they are equal only when $X$ and $Z$ are independent. The proof of this relationship can be found in the literature <a href="#cond_entropy">[17]</a>.
</br>
<h3 style="font-size:20px">KL Divergence and Conditional KL Divergence</h3>
In the same manner as the definition of Conditional Entropy, we introduce a new definition, <b>Conditional KL Divergence</b>, denoted as $KL_{\mathcal{C}}$. Since KL Divergence is non-symmetric, there exist two forms as follows.
\begin{align}
KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) = \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.4} \newline
KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) = \int \ \textcolor{blue}{p(x)} KL(q(z) \Vert \textcolor{blue}{p(z|x)})dx \tag{A.5}
\end{align}
Similar to Conditional Entropy, there also exists a similar inequality relationship for <b>both forms of Conditional KL Divergence</b>:
\begin{align}
KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.6} \newline
KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.7}
\end{align}
That is to say, <b>the Conditional KL Divergence is always greater than or equal to the KL Divergence</b>, and they are equal only when $X$ and $Z$ are independent.
The following provides proofs for equation A.6 and equation A.7, respectively.
For equation A.6, the proof is as follows:
\begin{align}
KL_{\mathcal{C}}(q(z|x) \Vert \textcolor{blue}{p(z)}) &= \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p(z)})dx \tag{A.8} \newline
&= \iint q(x) q(z|x) \log \frac{q(z|x)}{\textcolor{blue}{p(z)}}dzdx \tag{A.9} \newline
&= -\overbrace{\iint - q(x)q(z|x) \log q(z|x) dzdx}^{\text{Conditional Entropy }\mathbf{H}_q(Z|X)} - \iint q(x) q(z|x) \log \textcolor{blue}{p(z)} dzdx \tag{A.10} \newline
&= -\mathbf{H}_q(Z|X) - \int \left\lbrace \int q(x) q(z|x)dx \right\rbrace \log \textcolor{blue}{p(z)}dz \tag{A.11} \newline
&= -\mathbf{H}_q(Z|X) + \overbrace{\int - q(z) \log p(z)dz}^{\text{Cross Entropy}} \tag{A.12} \newline
&= -\mathbf{H}_q(Z|X) + \int q(z)\left\lbrace \log\frac{q(z)}{\textcolor{blue}{p(z)}} -\log q(z)\right\rbrace dz \tag{A.13} \newline
&= -\mathbf{H}_q(Z|X) + \int q(z)\log\frac{q(z)}{\textcolor{blue}{p(z)}}dz + \overbrace{\int - q(z)\log q(z)dz}^{\text{Entropy } \mathbf{H}_q(Z)} \tag{A.14} \newline
&= KL(q(z) \Vert \textcolor{blue}{p(z)}) + \overbrace{\mathbf{H}_q(Z) - \mathbf{H}_q(Z|X)}^{\ge 0} \tag{A.15} \newline
&\ge KL(q(z) \Vert \textcolor{blue}{p(z)}) \tag{A.16}
\end{align}
In this context, equation A.15 applies the conclusion that <b>Conditional Entropy is always less than or equal to Entropy</b>. Thus, the relationship in equation A.6 is derived.
For equation A.7, the proof is as follows:
\begin{align}
KL(\textcolor{blue}{q(z)} \Vert p(z)) &= \int \textcolor{blue}{q(z)}\log\frac{\textcolor{blue}{q(z)}}{p(z)}dz \tag{A.15} \newline
&= \int q(z)\log\frac{q(z)}{\int p(z|x)p(x)dx}dz \tag{A.16} \newline
&= \textcolor{orange}{\int p(x)dx}\int q(z)\log q(z)dz - \int q(z)\textcolor{red}{\log\int p(z|x)p(x)dx}dz \qquad \ \textcolor{orange}{\int p(x)dx=1} \tag{A.17} \newline
&\le \iint p(x) q(z)\log q(z)dzdx - \int q(z)\textcolor{red}{\int p(x)\log p(z|x)dx}dz \ \qquad \textcolor{red}{\text{jensen\ inequality}} \tag{A.18} \newline
&= \iint p(x)q(z)\log q(z)dzdx - \iint p(x)q(z)\log p(z|x)dzdx \tag{A.19} \newline
&= \iint p(x)q(z)(\log q(z) - \log p(z|x))dzdx \tag{A.20} \newline
&= \iint p(x)q(z)\log \frac{q(z)}{p(z|x)}dzdx \tag{A.21} \newline
&= \int p(x)\left\lbrace \int q(z)\log \frac{q(z)}{p(z|x)}dz\right\rbrace dx \tag{A.22} \newline
&= \int p(x)KL(\textcolor{blue}{q(z)} \Vert p(z|x))dx \tag{A.23} \newline
&= KL_{\mathcal{C}}(q(z) \Vert \textcolor{blue}{p(z|x)}) \tag{A.24}
\end{align}
Thus, the relationship in equation A.7 is obtained.
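Both inequalities are easy to verify numerically with small discrete distributions. A sketch (random joint distributions $q(x,z)$ and $p(x,z)$, assumed only for the check):

```python
import numpy as np

# Numerically check inequalities A.6 and A.7 with random discrete distributions.
rng = np.random.default_rng(6)
nx, nz = 5, 7

q_xz = rng.random((nx, nz))
q_xz /= q_xz.sum()                                # joint q(x, z)
q_x, q_z = q_xz.sum(axis=1), q_xz.sum(axis=0)
q_z_x = q_xz / q_x[:, None]                       # q(z|x)

p_xz = rng.random((nx, nz))
p_xz /= p_xz.sum()                                # joint p(x, z)
p_x, p_z = p_xz.sum(axis=1), p_xz.sum(axis=0)
p_z_x = p_xz / p_x[:, None]                       # p(z|x)

def kl(a, b):
    return float(np.sum(a * np.log(a / b)))

kl_marginal = kl(q_z, p_z)
kl_c_a6 = float(np.sum(q_x * np.array([kl(q_z_x[i], p_z) for i in range(nx)])))
kl_c_a7 = float(np.sum(p_x * np.array([kl(q_z, p_z_x[i]) for i in range(nx)])))
print('KL(q(z)||p(z))     =', round(kl_marginal, 4))
print('KL_C(q(z|x)||p(z)) =', round(kl_c_a6, 4), ' >= KL:', kl_c_a6 >= kl_marginal)
print('KL_C(q(z)||p(z|x)) =', round(kl_c_a7, 4), ' >= KL:', kl_c_a7 >= kl_marginal)
```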
Another <b>important conclusion</b> can be drawn from equation A.15.
The KL Divergence is often used to fit the distribution of data. In this scenario, the distribution of the data is denoted by $q(z)$ and the parameterized model distribution is denoted by $\textcolor{blue}{p_\theta(z)}$. During the optimization process, since both $q(z|x)$ and $q(x)$ remain constant, the term $\mathbf{H}(Z) - \mathbf{H}(Z|X)$ in Equation A.15 is a constant. Thus, the following relationship is obtained:
<span id="en_cond_kl_2">
\mathop{\min}_{\textcolor{blue}{p_\theta}} KL(q(z) \Vert \textcolor{blue}{p_\theta(z)}) \iff \mathop{\min}_{\textcolor{blue}{p_\theta}} \int \ q(x) KL(q(z|x) \Vert \textcolor{blue}{p_\theta(z)})dx \tag{A.25}
</span>
Comparing the above relationship with <b>Denoised Score Matching</b> <a href="#dsm">[18]</a>(equation A.26), some similarities can be observed. Both introduce a new variable $X$, and substitute the targeted fitting distribution q(z) with q(z|x). After the substitution, since q(z|x) is a conditional probability distribution, both consider all conditions and perform a weighted sum using the probability of the conditions occurring, $q(x)$, as the weight coefficient.
<span id="en_cond_kl_3">
\mathop{\min}_{\textcolor{blue}{\psi_\theta}} \frac{1}{2} \int q(z) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial \log q(z)}{\partial z} \right\rVert^2 dz \iff \mathop{\min}_{\textcolor{blue}{\psi_\theta}} \int q(x)\ \overbrace{\frac{1}{2} \int q(z|x) \left\lVert \textcolor{blue}{\psi_\theta(z)} - \frac{\partial \log q(z|x)}{\partial z} \right\rVert^2 dz}^{\text{Score Matching of }q(z|x)}\ dx \tag{A.26}
</span>
The operation of the above weighted sum is somewhat similar to <em>eliminating a variable via the Total Probability Formula</em>.
\begin{align}
q(z) = \int q(z,x) dx = \int q(x) q(z|x) dx \tag{A.27}
\end{align}
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_cond_kl_en")
return
def md_approx_gauss_en():
global g_latex_del
title = "Appendix B When does the Posterior Approximate to Gaussian ?"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="approx_gauss"):
gr.Markdown(
r"""
From equation 3.4, it can be seen that $q(x|z)$ takes the following form:
\begin{align}
q(x|z) &= \operatorname{Normalize} \Big(\ \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x)\ \Big)& \qquad &\text{where}\ \mu=\frac{z}{\sqrt{\alpha}}\quad \sigma=\sqrt{\frac{1-\alpha}{\alpha}} \tag{B.1} \newline
&\propto \underbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}_{\text{GaussFun}}\ q(x) \tag{B.2}
\end{align}
Below we will prove that if the following two assumptions are satisfied, $q(x|z)$ approximates a Gaussian distribution.
<ul>
<li>
Assume that within the support of GaussFun, $q(x)$ changes approximately linearly, so that it can be replaced by its first-order Taylor expansion around the mean of GaussFun. By the properties of the Taylor expansion, this assumption is satisfied when the standard deviation $\sigma$ of GaussFun is sufficiently small.
\begin{align}
q(x) &\approx q(\mu) + \nabla_xq(\mu)(x-\mu)& \quad &\text{where}\quad q(\mu)\triangleq q(x)\bigg|_{x=\mu} \quad \nabla_xq(\mu)\triangleq \nabla_xq(x)\bigg|_{x=\mu} \tag{B.3} \newline
&= q(\mu)\big(1+ \frac{\nabla_xq(\mu)}{q(\mu)}(x-\mu)\big)& \tag{B.4} \newline
&= q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)& \quad &\text{where}\quad \nabla_x\log{q(\mu)}\triangleq \nabla_x\log{q(x)}\bigg|_{x=\mu} \tag{B.5}
\end{align}
</li>
<li>
Assume that within the support of GaussFun, $\log\big(1+\nabla_x\log{q(\mu)}(x-\mu)\big)$ can be approximated by $\nabla_x\log{q(\mu)}(x-\mu)$. Expanding $\log(1+y)$ as a Taylor series shows that when $\lVert y\rVert_2$ is small, $\log(1+y)$ can be approximated by $y$. When $\sigma$ is sufficiently small, $\lVert x-\mu\rVert_2$ is small, so $\nabla_x\log{q(\mu)}(x-\mu)$ is also small, and hence the above assumption is satisfied. Generally, when $\nabla_x\log{q(\mu)}(x-\mu)<0.1$, the approximation error is small enough to be negligible.
\begin{align}
\log(1+y) &\approx \log(1+y)\bigg|_{y=0} + \nabla_y\log(1+y)\bigg|_{y=0}(y-0) \tag{B.6} \newline
&= y \tag{B.7}
\end{align}
</li>
</ul>
Using the above two assumptions, $q(x|z)$ can be transformed into the following form:
\begin{align}
q(x|z) &\propto \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(x) \tag{B.8} \newline
&\approx \frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}\ q(\mu)\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big) \tag{B.9} \newline
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\log\big(1+ \nabla_x\log{q(\mu)}(x-\mu)\big)\right) \tag{B.10} \newline
&\approx \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(\frac{-(x-\mu)^2}{2\sigma^2}+\nabla_x\log{q(\mu)}(x-\mu)\right) \tag{B.11} \newline
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^2-2\sigma^2\nabla_x\log{q(\mu)}(x-\mu)}{2\sigma^2}\right) \tag{B.12} \newline
&= \frac{q(\mu)}{\sqrt{2\pi}\sigma}\exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}+\frac{\big(\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \tag{B.13} \newline
&= \exp\left(-\frac{\big(x-\mu-\sigma^2\nabla_x\log{q(\mu)}\big)^2}{2\sigma^2}\right) \underbrace{\frac{q(\mu)}{\sqrt{2\pi}\sigma} \exp\left( \frac{1}{2}\big(\sigma\nabla_x\log{q(\mu)}\big)^2\right)}_{\text{const}} \tag{B.14}
\end{align}
Here, Equation B.9 applies the conclusion of Assumption 1, and Equation B.11 applies the conclusion of Assumption 2.
The <em>const term</em> in Equation B.14 does not depend on $x$ and therefore does not affect the shape of the function; since $q(x|z)$ is itself normalized, this constant is fixed by normalization. Therefore, $q(x|z)$ is approximately a Gaussian probability density function with mean $\mu + \sigma^2 \nabla_x \log{q(\mu)}$ and variance $\sigma^2$.
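As a sanity check of this conclusion, the following sketch compares the exact posterior of Equation B.2 with the Gaussian derived above. The specific $q(x)$ (a two-component Gaussian mixture), the value of $\alpha$, and the grid are assumptions made purely for illustration.
```python
import numpy as np
from scipy.stats import norm

x = np.linspace(-5, 5, 2001)
dx = x[1] - x[0]
# assumed toy data density q(x): a mixture of two Gaussians
q_x = 0.3 * norm.pdf(x, -1.0, 1.2) + 0.7 * norm.pdf(x, 1.5, 0.8)

alpha = 0.95                                   # alpha close to 1 => small sigma
z = 1.0
mu, sigma = z / np.sqrt(alpha), np.sqrt((1 - alpha) / alpha)

# exact posterior q(x|z) ~ GaussFun(x; mu, sigma) * q(x), normalized on the grid
post = norm.pdf(x, mu, sigma) * q_x
post /= post.sum() * dx

# score of q(x) at x = mu, estimated by finite differences of log q(x)
score_mu = np.interp(mu, x, np.gradient(np.log(q_x), dx))
approx = norm.pdf(x, mu + sigma**2 * score_mu, sigma)   # Gaussian predicted by B.14

print("max abs difference:", np.max(np.abs(post - approx)))   # small when sigma is small
```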
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_approx_gauss_en")
return
def md_non_expanding_en():
global g_latex_del
title = "Appendix C Posterior Transform is a Non-expanding Mapping"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="non_expanding"):
gr.Markdown(
r"""
<b>Corollary 1</b>
Using KL Divergence as a metric, the transition transform of a Markov chain is non-expanding<a href="#elem">[23]</a>, which means
\begin{align}
KL\big(p(x), q(x)\big) &\le KL\big(p(z), q(z)\big) \tag{C.1} \newline
\end{align}
Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of the Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z)q(z)dz$.
Proof:
For the KL divergence of $p(x,z)$ and $q(x,z)$, the following relationship exists:
\begin{align}
KL\big(p(x,z), q(x,z)\big) &= \iint p(x,z)\log \frac{p(x,z)}{q(x,z)}dxdz \tag{C.2} \newline
& = \iint p(x,z)\log \frac{p(x)p(z|x)}{q(x)q(z|x)}dxdz \tag{C.3} \newline
&= \iint p(x,z)\log \frac{p(x)}{q(x)}dxdz + \iint p(x,z) \log\frac{p(z|x)}{q(z|x)} dxdz \tag{C.4} \newline
&= \int \int p(x,z) dz\ \log \frac{p(x)}{q(x)}dx + \int p(x)\int p(z|x) \log\frac{p(z|x)}{q(z|x)} dz\ dx \tag{C.5} \newline
&= KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.6} \newline
\end{align}
Similarly, by swapping the roles of $Z$ and $X$, the following relationship can be obtained:
\begin{align}
KL\big(p(x,z), q(x,z)\big) &= KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.7}
\end{align}
Comparing the two equations, we can obtain:
\begin{align}
KL\big(p(x), q(x)\big) + \int p(x) KL\big(p(z|x), q(z|x)\big)dx = KL\big(p(z), q(z)\big) + \int p(z) KL\big(p(x|z), q(x|z)\big)dz \tag{C.8}
\end{align}
Since $p(x|z)$ and $q(x|z)$ are both the transition probability density of the Markov chain, i.e. both equal $r(x|z)$, the integral $\int p(z) KL\big(p(x|z), q(x|z)\big)dz$ equals 0. Therefore, the above equation simplifies to:
\begin{align}
KL\big(p(x), q(x)\big) = KL\big(p(z), q(z)\big) - \int p(x) KL\big(p(z|x), q(z|x)\big)dx \tag{C.9}
\end{align}
Since KL divergence is always greater than or equal to 0, the weighted sum $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ is also greater than or equal to 0. Therefore, we can conclude:
\begin{align}
KL\big(p(x), q(x)\big) \le KL\big(p(z), q(z)\big) \tag{C.10}
\end{align}
</br>
Equality holds only if $\int p(x) KL\big(p(z|x), q(z|x)\big)dx$ equals 0, which requires that $p(z|x)$ and $q(z|x)$ be equal for every condition $x$. In most cases, when $p(z)$ and $q(z)$ are different, $p(z|x)$ and $q(z|x)$ are also different. This means that in most cases we have
\begin{align}
KL\big(p(x), q(x)\big) < KL\big(p(z), q(z)\big) \tag{C.11}
\end{align}
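Corollary 1 can also be checked numerically. The sketch below pushes two distributions through the same transition kernel $r(x|z)$, represented as a column-stochastic matrix, and verifies that their KL Divergence does not increase. The matrix size and random seed are arbitrary illustrative assumptions.
```python
import numpy as np

rng = np.random.default_rng(1)
n = 6
R = rng.random((n, n)); R /= R.sum(axis=0, keepdims=True)   # r(x|z): each column sums to 1

def kl(p, q):
    return np.sum(p * np.log(p / q))

p_z = rng.random(n); p_z /= p_z.sum()                       # two arbitrary distributions over z
q_z = rng.random(n); q_z /= q_z.sum()
p_x, q_x = R @ p_z, R @ q_z                                 # push both through r(x|z)

print(kl(p_x, q_x) <= kl(p_z, q_z))                         # True (Equation C.1)
```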
</br></br>
<b>Corollary 2</b>
Using Total Variation (L1 distance) as a metric, the transition transform of a Markov chain is non-expanding, which means
\begin{align}
\left\lVert p(x)-q(x) \right\rVert_1\ &\le\ \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.12}
\end{align}
Here, $p(z)$ and $q(z)$ are arbitrary probability density functions, and $r(x|z)$ is the transition probability density function of a Markov chain. We have $p(x) = \int r(x|z)p(z)dz$ and $q(x) = \int r(x|z) q(z) dz$.
Proof:
\begin{align}
\left\lVert p(x)-q(x) \right\rVert_1\ &= \int \big\lvert p(x) - q(x) \big\rvert dx \tag{C.13} \newline
&= \int \left\lvert \int r(x|z) p(z) dz - \int r(x|z)q(z)dz \right\rvert dx \tag{C.14} \newline
&= \int \left\lvert \int r(x|z) \big(p(z)-q(z)\big) dz \right\rvert dx \tag{C.15} \newline
&\le \int \int r(x|z) \left\lvert \big(p(z)-q(z)\big) \right\rvert dz dx \tag{C.16} \newline
&= \int \int r(x|z)dx \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.17} \newline
&= \int \left\lvert \big(p(z)-q(z)\big) \right\rvert dz \tag{C.18} \newline
&= \left\lVert p(z) - q(z) \right\rVert_1 \tag{C.19}
\end{align}
Here, Equation C.16 applies the Absolute Value Inequality, while Equation C.18 uses the fact that $r(x|z)$ is a probability density function of $x$ and integrates to 1.
Proof completed.
</br>
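Corollary 2 admits the same kind of numerical check. The sketch below (restated in full so that it is self-contained, with the same illustrative assumptions as before) verifies that the L1 distance does not increase under the transition transform.
```python
import numpy as np

rng = np.random.default_rng(2)
n = 6
R = rng.random((n, n)); R /= R.sum(axis=0, keepdims=True)   # r(x|z): each column sums to 1

p_z = rng.random(n); p_z /= p_z.sum()
q_z = rng.random(n); q_z /= q_z.sum()
p_x, q_x = R @ p_z, R @ q_z                                 # push both through r(x|z)

print(np.abs(p_x - q_x).sum() <= np.abs(p_z - q_z).sum())   # True (Equation C.12)
```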
Figure C.1 shows an example of a one-dimensional random variable, which can help better understand the derivation process described above.
The condition for equality to hold is that all non-zero terms inside each absolute value bracket have the same sign. As shown in Figure C.1(a), there are five absolute value brackets, each corresponding to a row, with five terms in each bracket. Equality holds if and only if all non-zero terms in each row have the same sign. If terms with different signs occur, then $\lVert p(x)-q(x) \rVert_1\ <\ \lVert p(z) - q(z) \rVert_1$. The number of sign differences is related to the number of nonzero elements of the transition probability matrix: in general, the more nonzero elements there are, the more sign differences will occur.
For the posterior transform, when $\alpha$ decreases (more noise), the transition probability density function generally has more nonzero elements, as shown in Figure C.2(a); whereas when $\alpha$ increases (less noise), it has fewer nonzero elements, as shown in Figure C.2(b).
This leads to a notable feature: <b>when $\alpha$ decreases, $\lVert p(x)-q(x) \rVert_1$ tends to become smaller relative to $\lVert p(z) - q(z) \rVert_1$, which means the shrinking rate of the posterior transform is larger.</b>
<center> <img src="file/C1.png" width="1024" style="margin-top:12px"/> </center>
<center> Figure C.1: Non-expanding under L1 norm </center>
</br>
<center> <img src="file/C2.png" width="568" style="margin-top:12px"/> </center>
<center> Figure C.2: More non-zero elements as $\alpha$ gets smaller </center>
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_non_expanding_en")
return
def md_stationary_en():
global g_latex_del
title = "Appendix D Posterior Transform Converges to the Unique Stationary Distribution"
with gr.Accordion(label=title, elem_classes="first_md", elem_id="stationary"):
gr.Markdown(
r"""
According to the conclusion of Theorem 3 in <a href="#mc_basic_t3">[19]</a>, <b>an aperiodic and irreducible Markov chain will converge to a unique stationary distribution</b>.
The following will show that under certain conditions, the posterior transform is the transition probability density function of an <b>aperiodic and irreducible Markov chain</b>.
For convenience, the forward transform of the diffusion model is described below in a more general form.
\begin{align}
Z = \sqrt{\alpha}X + \sqrt{\beta}\ \epsilon \tag{D.1} \newline
\end{align}
As described in <a href="#transform">Section 1</a>, $\sqrt{\alpha}X$ narrows the probability density function of $X$, so $\alpha$ controls the narrowing intensity, while $\beta$ controls the amount of noise added(smoothing). When $\beta = 1 - \alpha$, the above transform is consistent with the equation 1.1.
The form of the posterior probability distribution corresponding to the new transformation is as follows:
\begin{align}
q(x|z=c) = \operatorname{Normalize} \Big(\ \overbrace{\frac{1}{\sqrt{2\pi}\sigma}\exp{\frac{-(x-\mu)^2}{2\sigma^2}}}^{\text{GaussFun}}\ q(x)\ \Big) \tag{D.2} \newline
\text{where}\ \mu=\frac{c}{\sqrt{\alpha}}\qquad \sigma=\sqrt{\frac{\beta}{\alpha}} \qquad \text{$c$ is a fixed value} \notag
\end{align}
When $\beta = 1 - \alpha$, the above distribution is consistent with Equation 3.4.
For convenience, let $g(x)$ represent GaussFun in Equation D.2.
Since $\sqrt{\alpha}X$ narrows the probability density function $q(x)$ of $X$, this makes the analysis of the aperiodicity and irreducibility of the transition probability density function $q(x|z)$ more complex. Therefore, for the sake of analysis, we first assume $\alpha = 1$ and later analyze the case when $\alpha \neq 1$ and $\beta = 1 - \alpha$.
<center> <img src="file/D1.png" width="960" style="margin-top:12px"/> </center>
<center> Figure D.1: Only one component in support </center>
<center> <img src="file/D2.png" width="960" style="margin-top:12px"/> </center>
<center> Figure D.2: Multiple components which can communicate with each other </center>
</br>
<h3 style="font-size:24px"> $\alpha=1$ </h3>
When $\alpha=1$, if $q(x)$ and $\beta$ satisfy either of the following two conditions, the Markov chain corresponding to $q(x|z)$ is aperiodic and irreducible.
<ol style="list-style-type:decimal">
<li>If the support of $q(x)$ contains only one connected component.</li>
<li>If the support of $q(x)$ has multiple connected components, but the distance between each connected component is less than $3$ times $\sigma$. In other words, the gaps can be covered by the radius of the effective region of $g(x)$.</li>
</ol>
Proof:
<ol style="list-style-type:decimal">
<li>
For any point $c$ in the support of $q(x)$, we have $q(x=c)>0$; from Equation D.2 (with $\alpha=1$), the center of $g(x)$ is located at $c$, so $g(x)$ is also greater than 0 at $x=c$. Therefore, by the multiplication in Equation D.2, $q(x=c|z=c)>0$. Hence, the Markov chain corresponding to $q(x|z)$ is aperiodic.
For any point $c$ in the support of $q(x)$, when $z=c$, the center of $g(x)$ is located at $c$, so there exists a hypersphere with $c$ as its center ($\lVert x-c\rVert_2 < \delta$). Within this hypersphere, $q(x|z=c)>0$, which means that state $c$ can access nearby states. Since every state in the support has this property, all states within the entire support form a $\textcolor{red}{\text{Communicate Class}}$ <a href="#mc_basic_d4">[14]</a>. Therefore, the Markov chain corresponding to $q(x|z)$ is irreducible.
Therefore, a Markov chain that satisfies condition 1 is aperiodic and irreducible. See the example in Figure D.1, which illustrates a single connected component; a numerical sketch of this case is also given after this list.
</li>
<li>
When the support of $q(x)$ has multiple connected components, the Markov chain may have multiple communicate classes. However, if the gaps between components are smaller than $3\sigma$ (the standard deviation of $g(x)$), the states of different components can access each other. Thus, the Markov chain corresponding to $q(x|z)$ has only one communicate class, similar to the case in condition 1. Therefore, a Markov chain that satisfies condition 2 is aperiodic and irreducible.
In Figure D.2, an example of multiple connected components is shown.
</li>
</ol>
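To complement the argument for the $\alpha=1$ case, here is a minimal numerical sketch; the grid, the toy $q(x)$, and the value of $\beta$ are assumptions made purely for illustration. It discretizes the posterior transform of Equation D.2 on a support consisting of a single connected component, checks that every state can stay in place in one step (aperiodicity), and iterates the transform from two different initial distributions to show that both converge to the same stationary distribution.
```python
import numpy as np
from scipy.stats import norm

x = np.linspace(-2, 2, 201)
dx = x[1] - x[0]
q_x = norm.pdf(x, 0.3, 0.7); q_x /= q_x.sum() * dx       # assumed q(x): one connected component

alpha, beta = 1.0, 0.1
sigma = np.sqrt(beta / alpha)
# T[i, j] = q(x_i | z = x_j): Gaussian centered at x_j / sqrt(alpha), times q(x), normalized over x
T = norm.pdf(x[:, None], x[None, :] / np.sqrt(alpha), sigma) * q_x[:, None]
T /= T.sum(axis=0, keepdims=True) * dx

print("aperiodic, q(x=c|z=c) > 0 for all c:", np.all(np.diag(T) > 0))

p1 = np.ones_like(x); p1 /= p1.sum() * dx                # two different initial distributions
p2 = norm.pdf(x, -1.5, 0.2); p2 /= p2.sum() * dx
for _ in range(200):                                     # iterate the posterior transform
    p1, p2 = T @ p1 * dx, T @ p2 * dx
print("same stationary distribution:", np.allclose(p1, p2, atol=1e-6))
```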
<center> <img src="file/D3.png" width="960" style="margin-top:12px"/> </center>
<center> Figure D.3: Two components which <b>cannot</b> communicate with each other </center>
</br>
<h3 style="font-size:24px"> $\alpha \neq 1$ </h3>
When $\alpha \neq 1$, for any point $c$ within the support of $q(x)$, it follows from Equation D.2 that the center of $g(x)$ is no longer $c$ but rather $\frac{c}{\sqrt{\alpha}}$. That is to say, the center of $g(x)$ deviates from $c$, with the deviation distance being $\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})$. It can be observed that the larger $\lVert c\rVert$ is, the greater the deviation. See the examples in Figures D.4(c) and D.4(d) for specifics. In Figure D.4(d), when $z=2.0$, the center of $g(x)$ is noticeably offset from $x=2.0$. This phenomenon is referred to in this article as <b>the Center Deviation Phenomenon</b>.
The <b>Center Deviation Phenomenon</b> will affect the properties of some states in the Markov chain.
When the deviation distance is significantly greater than $3\sigma$, $g(x)$ may be zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ may also be zero, and $q(x|z=c)$ in the vicinity of $x = c$ may also be zero. Therefore, state $c$ may not be able to access nearby states and may be periodic. This is different from the case when $\alpha=1$. Refer to the example in Figure D.5: the $\textcolor{green}{\text{green curve}}$ represents $g(x)$ for $z=6.0$, and the $\textcolor{orange}{\text{orange curve}}$ represents $q(x|z=6.0)$. Because the center of $g(x)$ deviates too much from $x=6.0$, $q(x=6.0|z=6.0)=0$.
When the deviation distance is significantly less than $3\sigma$, $g(x)$ is non-zero at $x = c$ and its vicinity. Consequently, $q(x=c|z=c)$ will not be zero, and $q(x|z=c)$ in the vicinity of $x = c$ will also not be zero. Therefore, state $c$ can access nearby states and is aperiodic.
Under what conditions for $c$ will the deviation distance of the center of $g(x)$ be less than $3\sigma$?
\begin{align}
\lVert c\rVert(\frac{1-\sqrt{\alpha}}{\sqrt{\alpha}})\ <\ 3\frac{\sqrt{\beta}}{\sqrt{\alpha}} \qquad \Rightarrow \qquad \lVert c\rVert \ <\ 3\frac{\sqrt{\beta}}{1-\sqrt{\alpha}} \tag{D.3} \newline
\end{align}
From the above, it is known that there exists an upper limit such that as long as $\lVert c\rVert$ is less than this upper limit, the deviation amount will be less than $3\sigma$.
When $\beta=1-\alpha$, the above expression becomes
\begin{align}
\lVert c\rVert \ <\ 3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} \tag{D.4} \newline
\end{align}
$3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} = 3\sqrt{\frac{1+\sqrt{\alpha}}{1-\sqrt{\alpha}}}$ is strictly monotonically increasing in $\alpha$, and it approaches 3 as $\alpha \to 0$.
When $\alpha \in (0, 1)$,
\begin{align}
3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}} > 3 \tag{D.5} \newline
\end{align}
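For a sense of scale, the bound in Equation D.4 can be evaluated directly. The sketch below prints the radius for a few values of $\alpha$; it always exceeds 3 and grows as $\alpha$ increases.
```python
import numpy as np

# radius bound of Equation D.4: points closer to the origin than this deviate by less than 3*sigma
for alpha in [0.1, 0.5, 0.9, 0.99]:
    radius = 3 * np.sqrt(1 - alpha) / (1 - np.sqrt(alpha))
    print(f"alpha={alpha:.2f}  radius={radius:.2f}")
# alpha=0.10  radius=4.16
# alpha=0.50  radius=7.24
# alpha=0.90  radius=18.49
# alpha=0.99  radius=59.85
```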
Based on the analysis above, the following conclusion can be drawn
<ol style="list-style-type:decimal">
<li>
<b>If the support of $q(x)$ contains only one connected component, and the points of the support set are all within a distance less than $3\frac{\sqrt{1-\alpha}}{1-\sqrt{\alpha}}$ from the origin, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
</li>
<li>
If the support of $q(x)$ contains multiple connected components, accurately determining whether two components can access each other becomes more complex due to the Center Deviation Phenomenon of $g(x)$. We do not pursue a detailed analysis here and only give a conservative conclusion: <b>if the points of the support are all within a distance less than $1$ from the origin, and the gaps between connected components are all less than $2\sigma$, then the Markov chain corresponding to $q(x|z)$ will be aperiodic and irreducible.</b>
</li>
</ol>
<center> <img src="file/D4.png" width="1280" style="margin-top:12px"/> </center>
<center> Figure D.4: Center Deviation of the GaussFun </center>
</br>
<center> <img src="file/D5.png" width="568" style="margin-top:12px"/> </center>
<center> Figure D.5: Deviation is More Than $3\sigma$ </center>
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_stationary_en")
return
def md_reference_en():
global g_latex_del
with gr.Accordion(label="Reference", elem_classes="first_md", elem_id="reference"):
gr.Markdown(
r"""
<a id="dpm" href="https://arxiv.org/abs/1503.03585"> [1] Deep Unsupervised Learning Using Nonequilibrium Thermodynami </a>
<a id="ddpm" href="https://arxiv.org/abs/1503.03585"> [2] Denoising Diffusion Probabilistic Models </a>
<a id="linear_transform" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Probability_Mathematical_Statistics_and_Stochastic_Processes_(Siegrist)/03%3A_Distributions/3.07%3A_Transformations_of_Random_Variables"> [3] Linear Transformations of Random Variable </a>
<a id="sum_conv" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Probability_Mathematical_Statistics_and_Stochastic_Processes_(Siegrist)/03%3A_Distributions/3.07%3A_Transformations_of_Random_Variables"> [4] Sums and Convolution </a>
<a id="fixed_point" href="https://en.wikipedia.org/wiki/Banach_fixed-point_theorem"> [5] Banach fixed-point theorem </a>
<a id="ctr" href="https://en.wikipedia.org/wiki/Contraction_mapping"> [6] Contraction mapping </a>
<a id="mc_limit" href="https://stats.libretexts.org/Bookshelves/Probability_Theory/Book%3A_Introductory_Probability_(Grinstead_and_Snell)/11%3A_Markov_Chains/11.04%3A_Fundamental_Limit_Theorem_for_Regular_Chains"> [7] Fundamental Limit Theorem for Regular Chains </a>
<a id="mc_basic_p6" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [8] Markov Chain:Basic Theory - Proposition 6 </a>
<a id="fp_converse" href="https://arxiv.org/abs/1702.07339"> [9] A Converse to Banach's Fixed Point Theorem and its CLS Completeness </a>
<a id="ce_kl" href="https://en.wikipedia.org/wiki/Cross-entropy#Cross-entropy_minimization"> [10] Cross-entropy minimization </a>
<a id="deconv_1" href="https://thewolfsound.com/deconvolution-inverse-convolution/"> [11] Deconvolution Using Frequency-Domain Division </a>
<a id="deconv_2" href="https://www.strollswithmydog.com/deconvolution-by-division-in-the-frequency-domain/"> [12] deconvolution-by-division-in-the-frequency-domain </a>
<a id="mc_basic_t7" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [13] Markov Chain:Basic Theory - Theorem 7 </a>
<a id="mc_basic_d4" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [14] Markov Chain:Basic Theory - Definition 4 </a>
<a id="vdm" href="https://arxiv.org/pdf/2107.00630"> [15] Variational Diffusion Models </a>
<a id="entropy" href="https://en.wikipedia.org/wiki/Entropy"> [16] Entropy </a>
<a id="cond_entropy" href="https://en.wikipedia.org/wiki/Conditional_entropy"> [17] Conditional Entropy </a>
<a id="dsm" href="https://www.iro.umontreal.ca/~vincentp/Publications/smdae_techreport_1358_v1.pdf"> [18] A Connection Between Score Matching and Denoising autoencoders </a>
<a id="mc_basic_t3" href="http://galton.uchicago.edu/~lalley/Courses/312/MarkovChains.pdf"> [19] Markov Chain:Basic Theory - Theorem 3 </a>
<a id="mc_mt_lambda" href="https://pages.uoregon.edu/dlevin/MARKOV/markovmixing.pdf"> [20] Markov Chains and Mixing Times, second edition - 12.2 The Relaxation Time </a>
<a id="non_neg_lambda" href="https://link.springer.com/book/10.1007/0-387-32792-4"> [21] Non-negative Matrices and Markov Chains - Theorem 2.10 </a>
<a id="prml_mcmc" href="https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf"> [22] Pattern Recognition and Machine Learning - 11.2. Markov Chain Monte Carlo </a>
<a id="elem" href="https://cs-114.org/wp-content/uploads/2015/01/Elements_of_Information_Theory_Elements.pdf"> [23] Elements_of_Information_Theory_Elements - 2.9 The Second Law of Thermodynamics </a>
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_reference_en")
return
def md_about_en():
global g_latex_del
with gr.Accordion(label="About", elem_classes="first_md", elem_id="about"):
gr.Markdown(
r"""
<b>APP</b>: This Web APP is developed using Gradio and deployed on HuggingFace. Due to limited resources (2 cores, 16G memory), the response may be slow. For a better experience, it is recommended to clone the source code from <a href="https://github.com/blairstar/The_Art_of_DPM">github</a> and run it locally. This program only relies on Gradio, SciPy, and Matplotlib.
<b>Author</b>: Zhenxin Zheng, a senior computer vision engineer with ten years of algorithm development experience, formerly employed by Tencent and JD.com, currently focusing on image and video generation.
<b>Email</b>: [email protected].
""", latex_delimiters=g_latex_del, elem_classes="normal mds", elem_id="md_about_en")
return
def run_app():
# with gr.Blocks(css=g_css, js="() => { insert_special_formula(); }", head=js_head) as demo:
with gr.Blocks(css=g_css, js="() => {insert_special_formula(); write_markdown();}", head=js_head) as demo:
md_introduction_en()
md_transform_en()
md_likelihood_en()
md_posterior_en()
md_forward_process_en()
md_backward_process_en()
md_fit_posterior_en()
md_posterior_transform_en()
md_deconvolution_en()
md_cond_kl_en()
md_approx_gauss_en()
md_non_expanding_en()
md_stationary_en()
md_reference_en()
md_about_en()
demo.launch(allowed_paths=["/"])
return
if __name__ == "__main__":
run_app() |