Bayesian Quadrature: Variance Derivation
We want to compute:
\[\mathbb{V}[Z] \approx \int\int \bar{\ell}(x)C_{\log \ell}(x, x^\prime)\bar{\ell}(x^\prime)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\]
We expand this to obtain:
\[\begin{split}\begin{align*}
E[\bar{\ell} C_{\log\ell}\bar{\ell}] &= \int \int \left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)K(\mathbf{x}_c, \mathbf{x}_c)^{-1}\bar{\ell}(\mathbf{x}_c)\right)\left(K_{\log\ell}(x, x^\prime)-K_{\log\ell}(x, \mathbf{x}_c)K_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)\right)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)K(\mathbf{x}_c, \mathbf{x}_c)^{-1}\bar{\ell}(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= \int \int \left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)\left(K_{\log\ell}(x, x^\prime)-K_{\log\ell}(x, \mathbf{x}_c)K_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)\right)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= \int \int \left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)K_{\log\ell}(x, x^\prime)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&\ \ \ \ - \int \int \left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right) K_{\log\ell}(x, \mathbf{x}_c)K_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime,
\end{align*}\end{split}\]
where
\(\alpha(\mathbf{x}_c)=K_{\exp(\log\ell)}(\mathbf{x}_c, \mathbf{x}_c)^{-1}\bar{\ell}(\mathbf{x}_c)\).
The first part of this is:
\[\begin{split}\begin{align*}
\int \int &\left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)K_{\log\ell}(x, x^\prime)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= \alpha(\mathbf{x}_c)^\top \left(\int\int K_{\exp(\log\ell)}(\mathbf{x}_c, x)K_{\log\ell}(x, x^\prime)K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\right)\alpha(\mathbf{x}_c)
\end{align*}\end{split}\]
And the second part is:
\[\begin{split}\begin{align*}
- \int \int &\left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right) K_{\log\ell}(x, \mathbf{x}_c)K_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= - \int \int \left(K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right) K_{\log\ell}(x, \mathbf{x}_c)\left(L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}\right)^\top L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)\left(K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= - \int \int \left(L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x) K_{\exp(\log\ell)}(x, \mathbf{x}_c)\alpha(\mathbf{x}_c) \right)^\top \left(L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}K_{\log\ell}(\mathbf{x}_c, x^\prime)K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)\alpha(\mathbf{x}_c)\right)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= - \left[L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}\left(\int K_{\log\ell}(\mathbf{x}_c, x) K_{\exp(\log\ell)}(x, \mathbf{x}_c)p(x)\ \mathrm{d}x \right)\alpha(\mathbf{x}_c)\right]^\top\left[L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}\left(\int K_{\log\ell}(\mathbf{x}_c, x) K_{\exp(\log\ell)}(x, \mathbf{x}_c)p(x)\ \mathrm{d}x \right)\alpha(\mathbf{x}_c)\right]\\\\
&= -\beta(\mathbf{x}_c)^\top\beta(\mathbf{x}_c),
\end{align*}\end{split}\]
where
\(\beta(\mathbf{x}_c)=L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)^{-1}\left(\int K_{\log\ell}(\mathbf{x}_c, x) K_{\exp(\log\ell)}(x, \mathbf{x}_c)p(x)\ \mathrm{d}x \right)\alpha(\mathbf{x}_c)\)
and \(L_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)\) is the Cholesky
decomposition of \(K_{\log\ell}(\mathbf{x}_c, \mathbf{x}_c)\).
Putting these together, we obtain:
\[E[\bar{\ell} C_{\log\ell}\bar{\ell}]=\alpha(\mathbf{x}_c)^\top \left(\int\int K_{\exp(\log\ell)}(\mathbf{x}_c, x)K_{\log\ell}(x, x^\prime)K_{\exp(\log\ell)}(x^\prime, \mathbf{x}_c)p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\right)\alpha(\mathbf{x}_c)-\beta(\mathbf{x}_c)^\top\beta(\mathbf{x}_c)\]
First integral
Assuming the kernels are Gaussian kernels, and \(p(x)\) is also
Gaussian with mean \(\mu\) and covariance \(\Sigma\), we can
derive the integrals analytically. Then:
\[\begin{split}\begin{align*}
\int\int & K_{\exp(\log\ell)}(x_{c,i}, x)K_{\log\ell}(x, x^\prime)K_{\exp(\log\ell)}(x^\prime, x_{c,j})p(x)p(x^\prime)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
&= h_1^4 h_2^2 \int\int \mathcal{N}\left(x_{c,i}\ \big\vert\ x, W_\ell\right)\mathcal{N}\left(x\ \big\vert\ x^\prime, W_{\log\ell}\right)\mathcal{N}\left(x^\prime\ \big\vert\ x_{c,j}, W_\ell\right)\mathcal{N}\left(x\ \big\vert\ \mu, \Sigma\right)\mathcal{N}\left(x^\prime\ \big\vert\ \mu, \Sigma\right)\ \mathrm{d}x\ \mathrm{d}x^\prime\\\\
\end{align*}\end{split}\]
From [O13]_, we have:
\[\mathcal{N}\left(x_{c,i}\ \big\vert\ x, W_\ell\right)\mathcal{N}\left(x\ \big\vert\ \mu, \Sigma\right) = \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x\ \big\vert\ \mu + \Gamma(x_{c,i}-\mu), \Sigma -\Gamma\Sigma\right),\]
where \(\Gamma = \Sigma(W_\ell + \Sigma)^{-1}\).
Using this identity for both \(x_{c,i}\) and \(x_{c,j}\), we
obtain:
\[= h_1^4 h_2^2 \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x_{c,j}\ \big\vert\ \mu, W_\ell+\Sigma\right)\int \mathcal{N}\left(x\ \big\vert\ \mu + \Gamma(x_{c,i}-\mu), \Sigma -\Gamma\Sigma\right)\int \mathcal{N}\left(x-x^\prime\ \big\vert\ 0, W_{\log\ell}\right)\mathcal{N}\left(x^\prime\ \big\vert\ \mu + \Gamma(x_{c,j}-\mu), \Sigma -\Gamma\Sigma\right)\ \mathrm{d}x\ \mathrm{d}x^\prime\]
The innermost integral is a convolution, giving us:
\[= h_1^4 h_2^2 \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x_{c,j}\ \big\vert\ \mu, W_\ell+\Sigma\right)\int \mathcal{N}\left(x\ \big\vert\ \mu + \Gamma(x_{c,i}-\mu), \Sigma -\Gamma\Sigma\right) \mathcal{N}\left(x\ \big\vert\ \mu + \Gamma(x_{c,j}-\mu), W_{\log\ell} + \Sigma -\Gamma\Sigma\right)\ \mathrm{d}x\]
This can also be rewritten in convolution form:
\[\begin{split}\begin{align*}
&= h_1^4 h_2^2 \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x_{c,j}\ \big\vert\ \mu, W_\ell+\Sigma\right)\int \mathcal{N}\left(x_{c,i}-x\ \big\vert\ x_{c,i} - \mu - \Gamma(x_{c,i}-\mu), \Sigma -\Gamma W_\ell\right) \mathcal{N}\left(x\ \big\vert\ \mu + \Gamma(x_{c,j}-\mu), W_{\log\ell} + \Sigma -\Gamma W_\ell\right)\ \mathrm{d}x\\\\
&= h_1^4 h_2^2 \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x_{c,j}\ \big\vert\ \mu, W_\ell+\Sigma\right) \mathcal{N}\left(x_{c,i}\ \big\vert\ x_{c,i} + \Gamma(x_{c,j}-x_{c,i}), W_{\log\ell} + 2\Sigma -2\Gamma\Sigma\right)\\\\
&= h_1^4 h_2^2 \left\vert \Gamma\right\vert^{-1} \mathcal{N}\left(x_{c,i}\ \big\vert\ \mu, W_\ell + \Sigma\right)\mathcal{N}\left(x_{c,j}\ \big\vert\ \mu, W_\ell+\Sigma\right) \mathcal{N}\left(x_{c,i}\ \big\vert\ x_{c,j}, \Gamma^{-1}(W_{\log\ell} + 2\Sigma -2\Gamma\Sigma)\Gamma^{-1}\right)
\end{align*}\end{split}\]
Second integral
The integral in \(\beta(x)\) can be expressed analytically as
follows, from [O13]_:
\[\begin{split}\begin{align*}
\int K_{\log\ell}(\mathbf{x}_c, x) K_{\exp(\log\ell)}(x, \mathbf{x}_c)p(x)\ \mathrm{d}x&=\int h_{\log\ell}^2 h_\ell^2\mathcal{N}\left(x_{s,i}\ \big\vert\ x, W_{\log\ell}\right)\mathcal{N}\left(x_{s,j}\ \big\vert\ x, W_\ell\right)\mathcal{N}\left(x\ \big\vert\ \mu, \Sigma\right)\ \mathrm{d}x\\\\
&= h_{\log\ell}^2 h_\ell^2\int\mathcal{N}\left([x_{s,i}, x_{s,j}]\ \big\vert\ [x, x], [W_{\log\ell}, 0; 0, W_\ell]\right)\mathcal{N}\left(x\ \big\vert\ \mu, \Sigma\right)\ \mathrm{d}x\\\\
&= h_{\log\ell}^2 h_\ell^2 \mathcal{N}\left([x_{s,i}, x_{s,j}]\ \big\vert\ [\mu, \mu], [W_{\log\ell}+\Sigma, \Sigma; \Sigma, W_\ell+\Sigma]\right)
\end{align*}\end{split}\]