View on GitHub

memo

Chapter3.

3.6.

3.6.1 Free Energy

3.6.2 Generalization Error and Empirical Error

By theorem 3,

\[\begin{eqnarray} G_{n} & = & L(w_{0}) + \frac{1}{n} \left( \frac{d}{2 \beta} + \frac{1}{2} \norm{\xi_{n}}^{2} - \frac{1}{2\beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p} \left( \frac{1}{n} \right) \nonumber \\ & = & L(\hat{w}) - \frac{1}{2n} \norm{\xi_{n}}^{2} + \frac{1}{n} \left( \frac{d}{2 \beta} + \frac{1}{2} \norm{\xi_{n}}^{2} - \frac{1}{2\beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p} \left( \frac{1}{n} \right) \quad (\because \text{theorem 5}) \nonumber \\ & = & L(\hat{w}) + \frac{1}{n} \left( \frac{d}{2 \beta} - \frac{1}{2\beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p} \left( \frac{1}{n} \right) \label{equation_03_18} \end{eqnarray}\]

Similarly,

\[\begin{eqnarray} T_{n} & = & L_{n}(\hat{w}) + \frac{1}{n} \left( \frac{d}{2\beta} - \frac{1}{2\beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p} \left( \frac{1}{n} \right) \label{equation_03_19} \end{eqnarray}\]

By definition,

\[\begin{eqnarray} \mathrm{E} \left[ I \right] & = & \mathrm{E} \left[ I(w_{0}) \right] \nonumber \\ & = & \mathrm{E}_{X} \left[ \nabla f(X, w_{0}) (\nabla f(X, w_{0}))^{\mathrm{T}} \right] . \nonumber \end{eqnarray}\] \[\begin{eqnarray} I_{n}(w) & := & \frac{1}{n} \sum_{i=1}^{n} \nabla \log p(X_{i} \mid w) (\nabla \log p(X_{i} \mid w))^{\mathrm{T}} \label{equation_03_20} \\ \mathrm{E} \left[ I_{n}(w) \right] & = & \mathrm{E} \left[ \frac{1}{n} \sum_{i=1}^{n} \nabla \log p(X_{i} \mid w) (\nabla \log p(X_{i} \mid w))^{\mathrm{T}} \right] \nonumber \\ & = & I(w) \nonumber \\ J_{n}(w) & = & -\frac{1}{n} \sum_{i=1}^{n} \nabla^{2} \log p(X_{i} \mid w) . \label{equation_03_21} \end{eqnarray}\]

By the law of large numbers and the CLT,

\[\begin{eqnarray} I & = & I_{n}(\hat{w}) + o_{p}(1), \nonumber \\ J & = & J_{n}(\hat{w}) + o_{p}(1) . \nonumber \end{eqnarray}\]

Therefore, the empirical loss $T_{n}$ can be computed from the sample. On the other hand, the generalization loss $G_{n}$ cannot be computed from the sample alone, since $L(\hat{w})$ is not available from the sample.

Let us define Regular Information Criteria:

\[\begin{eqnarray} \mathrm{RIC} & := & T_{n} + \frac{1}{n} \mathrm{tr} \left( IJ^{-1} \right) . \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathrm{RIC} & = & L_{n}(\hat{w}) + \frac{1}{n} \left( \frac{d}{2 \beta} - \frac{1}{2 \beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + \frac{1}{n} \mathrm{tr} \left( IJ^{-1} \right) + o_{p}(\frac{1}{n}) \nonumber \\ & = & L_{n}(\hat{w}) + \frac{1}{n} \left( \frac{d}{2 \beta} + \left( 1 - \frac{1}{2 \beta} \right) \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p}(\frac{1}{n}) \nonumber \end{eqnarray}\]

Then

\[\begin{eqnarray} \mathrm{E} \left[ \mathrm{RIC} \right] & = & \mathrm{E} \left[ L_{n}(\hat{w}) \right] + \frac{1}{n} \left( \frac{d}{2 \beta} + \left( 1 - \frac{1}{2 \beta} \right) \mathrm{E} \left[ \mathrm{tr} \left( IJ^{-1} \right) \right] \right) + \mathrm{E} \left[ o_{p} \left( \frac{1}{n} \right) \right] \nonumber \\ & = & \mathrm{E} \left[ L(\hat{w}) \right] + \frac{1}{n} \left( \frac{d}{2 \beta} - \frac{1}{2 \beta} \mathrm{E} \left[ \mathrm{tr} \left( IJ^{-1} \right) \right] \right) + \mathrm{E} \left[ o_{p} \left( \frac{1}{n} \right) \right] \quad \left( \because \mathrm{E} \left[ L(\hat{w}) \right] = \mathrm{E} \left[ L_{n}(\hat{w}) \right] + \frac{1}{n} \mathrm{tr} \left( IJ^{-1} \right) + o \left( \frac{1}{n} \right) \right) \nonumber \\ & = & \mathrm{E} \left[ G_{n} \right] . \nonumber \end{eqnarray}\]

When $\beta = 1$,

\[\begin{eqnarray} (G_{n} - L(w_{0})) + (\mathrm{RIC} - L_{n}(w_{0})) & = & \frac{1}{2n} \left( d + \norm{\xi_{n}}^{2} - \mathrm{tr} \left( IJ^{-1} \right) \right) - \frac{1}{2n} \norm{\xi_{n}}^{2} + \frac{1}{n} \left( \frac{d}{2} + \left( 1 - \frac{1}{2} \right) \mathrm{tr} \left( IJ^{-1} \right) \right) + o_{p}(\frac{1}{n}) \quad (\because \text{Theorem 3}) \nonumber \\ & = & \frac{d}{2n} - \frac{1}{2n} \mathrm{tr} \left( IJ^{-1} \right) + \frac{d}{2n} + \frac{1}{2n} \mathrm{tr} \left( IJ^{-1} \right) + o_{p}(\frac{1}{n}) \nonumber \\ & = & \frac{d}{n} + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\]

Remark 30

We assume that the statistical model is regular. By lemma 14, if the statistical model is regular,

\[\begin{eqnarray} \mathrm{tr} \left( IJ^{-1} \right) = d . \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathrm{TIC} & = & L_{n}(\hat{w}) + \frac{1}{n} \mathrm{tr} \left( I_{n}(\hat{w}) J_{n}^{-1}(\hat{w}) \right) \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathrm{E} \left[ \mathrm{TIC} \right] & = & \mathrm{E} \left[ L_{n}(\hat{w}) \right] + \frac{1}{n} \mathrm{E} \left[ \mathrm{tr} \left( I_{n}(\hat{w}) J_{n}^{-1}(\hat{w}) \right) \right] \nonumber \\ & = & \mathrm{E} \left[ L(\hat{w}) \right] + o(\frac{1}{n}) \nonumber \end{eqnarray}\]

Moreover,

\[\begin{eqnarray} (L(\hat{w}) - L(w_{0})) + (\mathrm{TIC} - L_{n}(w_{0})) & = & \frac{1}{2n} \norm{\xi}^{2} - \frac{1}{2n} \norm{\xi}^{2} + \frac{1}{n} \mathrm{tr} \left( I_{n}(\hat{w}) J_{n}^{-1}(\hat{w}) \right) + o_{p}(\frac{1}{n}) \quad (\because \text{Theorem 3}) \nonumber \\ & = & \frac{1}{n} \mathrm{tr} \left( I_{n}(\hat{w}) J_{n}^{-1}(\hat{w}) \right) + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\]

Definition AIC

\[\begin{eqnarray} \mathrm{AIC} = L_{n}(\hat{w}) + \frac{d}{n} . \nonumber \end{eqnarray}\]

Expectation of AIC is

\[\begin{eqnarray} \mathrm{E} \left[ \mathrm{AIC} \right] & = & \mathrm{E} \left[ L(\hat{w}) \right] + o(1/n) . \nonumber \end{eqnarray}\]

Then

\[\begin{eqnarray} (L(\hat{w}) - L(w_{0})) + (\mathrm{AIC} - L_{n}(w_{0})) & = & \frac{1}{2n} \norm{\xi}^{2} - \frac{1}{2n} \norm{\xi}^{2} + \frac{ d }{ n } + o_{p}(\frac{1}{n}) \quad (\because \text{Theorem 3}) \nonumber \\ & = & \frac{ d }{ n } + o_{p}(\frac{1}{n}) \nonumber \end{eqnarray}\]

holds.

Example 9

Let us consider a few models. The first one is a statistical model without parameters.

\[\begin{eqnarray} p_{0}(x) & := & \frac{1}{\sqrt{2 \pi}} \exp \left( - \frac{x^{2}}{2} \right) \nonumber \end{eqnarray}\]

The second example is a statistical model with a parameter.

\[\begin{eqnarray} p_{1}(x \mid m) & := & \frac{1}{\sqrt{2 \pi}} \exp \left( - \frac{ (x - m)^{2} }{ 2 } \right) . \nonumber \end{eqnarray}\]

Suppose that observations $X_{1}, \ldots, X_{n}$ are given.

\[\begin{eqnarray} \mathcal{L}(w; p_{0}) & = & -\frac{1}{n} \sum_{i=1}^{n} \log p_{0} (X_{i} \mid w) - \frac{1}{n \beta} \log \phi(w) \nonumber \\ & = & - \frac{1}{n} \sum_{i=1}^{n} \left( \log (1 / \sqrt{2\pi}) - \frac{X_{i}^{2}}{2} \right) - \frac{1}{n \beta} \log \phi(w) \nonumber \\ & = & - \log (1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{X_{i}^{2}}{2} - \frac{1}{n \beta} \log \phi(w) \nonumber \\ \hat{w}_{0} & := & \argmin_{w} \mathcal{L}(w) \end{eqnarray}\]

Similarly, for $p_{1}$,

\[\begin{eqnarray} \mathcal{L}(w; p_{1}) & = & -\frac{1}{n} \sum_{i=1}^{n} \log p_{1} (X_{i} \mid w) - \frac{1}{n \beta} \log \phi(w) \nonumber \\ & = & -\frac{1}{n} \sum_{i=1}^{n} \left( \log (1 / \sqrt{2\pi}) - \frac{ (X_{i} - w)^{2} }{ 2 } \right) - \frac{1}{n \beta} \log \phi(w) \nonumber \\ & = & - \log (1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{ (X_{i} - w)^{2} }{ 2 } - \frac{1}{n \beta} \log \phi(w) \nonumber \\ \hat{w}_{1} & := & \argmin_{w} \mathcal{L}(w; p_{1}) \nonumber \end{eqnarray}\]

Let us assume that $\phi$ is independent of $w$ and $\beta = 1$.

\[\begin{eqnarray} \mathcal{L}(w; p_{0}) & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{X_{i}^{2}}{2} - \frac{1}{n} \log \phi \nonumber \\ \mathcal{L}(w; p_{1}) & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{(X_{i} - w)^{2}}{2} - \frac{1}{n} \log \phi . \nonumber \end{eqnarray}\]

Under the assumption,

\[\begin{eqnarray} \hat{w}_{1} & = & \frac{1}{n} \sum_{i=1}^{n} X_{i} . \nonumber \end{eqnarray}\]

(i)

\[\begin{eqnarray} L_{n}(\hat{w}_{0}; p_{0}) & = & -\frac{1}{n} \sum_{i=1}^{n} \log p_{0}(X_{i} \mid \hat{w}_{0}) \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{X_{i}^{2}}{2} \nonumber \\ \mathrm{AIC}(p_{0}) & = & L_{n}(\hat{w}_{0}; p_{0}) + \frac{d}{n} \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{2n} \sum_{i=1}^{n} X_{i}^{2} + \frac{0}{n} \nonumber \end{eqnarray}\] \[\begin{eqnarray} L_{n}(\hat{w}_{1}; p_{1}) & = & -\frac{1}{n} \sum_{i=1}^{n} \log p_{1}(X_{i} \mid \hat{w}_{1}) \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{n} \sum_{i=1}^{n} \frac{ (X_{i} - \hat{w}_{1})^{2} }{ 2 } \nonumber \\ \mathrm{AIC}(p_{1}) & = & L_{n}(\hat{w}_{1}; p_{1}) + \frac{d}{n} \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{2n} \sum_{i=1}^{n} (X_{i} - \hat{w}_{1})^{2} + \frac{1}{n} \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{2n} \sum_{i=1}^{n} \left( X_{i}^{2} - \frac{2}{n} X_{i} \sum_{j=1}^{n} X_{j} + \frac{1}{n^{2}} \left( \sum_{j=1}^{n} X_{j} \right)^{2} \right) + \frac{1}{n} \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{2n} \sum_{i=1}^{n} X_{i}^{2} - \frac{1}{2n} \sum_{i=1}^{n} \frac{2}{n} X_{i} \sum_{j=1}^{n} X_{j} + \frac{1}{2n} \sum_{i=1}^{n} \frac{1}{n^{2}} \left( \sum_{j=1}^{n} X_{j} \right)^{2} + \frac{1}{n} \nonumber \\ & = & - \log(1 / \sqrt{2\pi}) + \frac{1}{2n} \sum_{i=1}^{n} X_{i}^{2} - \frac{1}{2n} \frac{1}{n} \left( \sum_{j=1}^{n} X_{j} \right)^{2} + \frac{1}{n} . \end{eqnarray}\]

$\mathrm{AIC}(p_{1}) < \mathrm{AIC}(p_{0})$ is equivalent to

\[\begin{eqnarray} & & - \frac{1}{n} \left( \sum_{j=1}^{n} X_{j} \right)^{2} + 2 < 0 \nonumber \\ & \Leftrightarrow & 2 < \frac{1}{n} \left( \sum_{j=1}^{n} X_{j} \right)^{2} \end{eqnarray}\]

(ii)

\[\begin{eqnarray} \mathrm{BIC}(p_{0}) & = & - \sum_{i=1}^{n} \log p_{0}(X_{i} \mid \hat{w}_{0}) + \frac{0}{2} \log n \nonumber \\ & = & - n \log(1/\sqrt{2\pi}) + \sum_{i=1}^{n} \frac{ (X_{i})^{2} }{ 2 } \nonumber \\ \mathrm{BIC}(p_{1}) & = & - \sum_{i=1}^{n} \log p_{1}(X_{i} \mid \hat{w}_{1}) + \frac{d}{2} \log n \nonumber \\ & = & - n \log(1 / \sqrt{2\pi}) + \sum_{i=1}^{n} \frac{ (X_{i} - \hat{w}_{1})^{2} }{ 2 } + \frac{1}{2} \log n . \nonumber \end{eqnarray}\]

\(\mathrm{BIC}(p_{1}) < \mathrm{BIC}(p_{0})\) is equivalent to

\[\begin{eqnarray} & & \sum_{i=1}^{n} X_{i}^{2} - \frac{1}{n} \left( \sum_{i=1}^{n} X_{i} \right)^{2} + \log n < \sum_{i=1}^{n} (X_{i})^{2} \nonumber \\ & \Leftrightarrow & \log n < \frac{1}{n} \left( \sum_{i=1}^{n} X_{i} \right)^{2} . \nonumber \end{eqnarray}\]

(iii)

Let $0 \in \Theta \subset \mathbb{R}$ and \(\Theta_{0} := \{0\}\) be the parameter space of the likelihood-ratio test and the null hypothesis, respectively. The likelihood-ratio statistic is

\[\begin{eqnarray} -2 \log \frac{ \prod_{i=1}^{n} p_{0}(X_{i}) }{ \sup_{m \in \Theta} \prod_{i=1}^{n} p_{1}(X_{i} \mid m) } & = & -2 \left( n \log(1 / \sqrt{2\pi}) - \sum_{i=1}^{n} \frac{ X_{i}^{2} }{ 2 } - \left( n \log(1 / \sqrt{2\pi}) + \sup_{m \in \Theta} \left( - \sum_{i=1}^{n} \frac{ (X_{i} - m)^{2} }{ 2 } \right) \right) \right) \nonumber \\ & = & \sum_{i=1}^{n} X_{i}^{2} + \sup_{m \in \Theta} \left( - \sum_{i=1}^{n} (X_{i} - m)^{2} \right) \nonumber \\ & = & \sum_{i=1}^{n} X_{i}^{2} - \min_{m \in \Theta} \sum_{i=1}^{n} (X_{i} - m)^{2} \nonumber \\ & = & \sum_{i=1}^{n} X_{i}^{2} - \sum_{i=1}^{n} \left( X_{i} - \frac{1}{n} \sum_{j=1}^{n} X_{j} \right)^{2} \nonumber \\ & = & \sum_{i=1}^{n} X_{i}^{2} - \sum_{i=1}^{n} \left( X_{i}^{2} - 2 \left( \frac{1}{n} \sum_{j=1}^{n} X_{j} \right) X_{i} + \left( \frac{1}{n} \sum_{j=1}^{n} X_{j} \right)^{2} \right) \nonumber \\ & = & - \left( - 2 \left( \frac{1}{n} \sum_{j=1}^{n} X_{j} \right) \sum_{i=1}^{n} X_{i} + n \left( \frac{1}{n} \sum_{j=1}^{n} X_{j} \right)^{2} \right) \nonumber \\ & = & \frac{1}{n} \left( \sum_{j=1}^{n} X_{j} \right)^{2} . \nonumber \end{eqnarray}\]

Under the null hypothesis, the likelihood-ratio statistic asymptotically follows a $\chi^{2}$-distribution with $\mathrm{dim}(\Theta) - \mathrm{dim}(\Theta_{0}) = 1$ degree of freedom by Wilks' theorem. Hence, for a test with significance level $\alpha = 0.01$, the null hypothesis is rejected if

\[\begin{eqnarray} \frac{1}{n} \left( \sum_{j=1}^{n} X_{j} \right)^{2} & > & 6.63 . \nonumber \end{eqnarray}\]

Reference