3.4. Generalization error and empirical error
Theorem 3
- $q$ is regular for $p(x \mid w)$,
proof
By lemma 17,
\[\begin{eqnarray} \forall \alpha \in [0, 1], \ \abs{ \frac{\partial^{3} }{\partial \alpha^{3}} \mathcal{G}_{n}(\alpha) } & = & \abs{ \frac{\partial^{3} }{\partial \alpha^{3}} \mathrm{E}_{X} \left[ \log \mathrm{E}_{w} \left[ p(X \mid w)^{\alpha} \right] \right] } \nonumber \\ & = & O_{p} \left( \frac{1}{n^{3/2}} \right) . \nonumber \end{eqnarray}\]This implies
\[\begin{eqnarray} \sup_{\alpha \in [0, 1]} \abs{ \frac{\partial^{3} }{\partial \alpha^{3}} \mathcal{G}_{n}(\alpha) } & = & o_{p} \left( \frac{1}{n} \right) . \nonumber \end{eqnarray}\]In the proof of lemma 13,
\[\begin{eqnarray} n K_{n}(w) & = & \frac{ n }{ 2 } \norm{ J(w_{0} + \bar{w}^{*})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} J(w_{0} + \bar{w}^{*})^{-1} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) \right) }^{2} - \frac{1}{2} \norm{ J(w_{0} + \bar{w}^{*})^{-1/2} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) }^{2} . \end{eqnarray}\]where $\bar{w}^{*}$ and $\bar{w}^{**}$ satisfy
\[\begin{eqnarray} K(w) & = & \frac{1}{2} (w - w_{0})^{\mathrm{T}} J(w_{0} + \bar{w}^{*}) (w - w_{0}) \nonumber \\ \eta_{n}(w) & = & (w - w_{0})^{\mathrm{T}} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) . \nonumber \end{eqnarray}\]Assume
\[\begin{eqnarray} J(w^{*}) & = & J(w_{0}) + o_{p}(1) \nonumber \\ J(w^{*})^{-1/2} \nabla \eta_{n}(w^{**}) & = & \xi_{n} + o_{p}(1) \nonumber \\ \nabla \eta_{n}(w^{**}) & = & \hat{\xi}_{n} + o_{p}(1) \nonumber \end{eqnarray}\]Thus,
\[\begin{eqnarray} K_{n}(w) & = & \frac{ 1 }{ 2 } \norm{ J(w_{0} + \bar{w}^{*})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} J(w_{0} + \bar{w}^{*})^{-1} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) \right) }^{2} - \frac{1}{2n} \norm{ J(w_{0} + \bar{w}^{*})^{-1/2} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) }^{2} \quad (\because \text{by definition}) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ J(w_{0} + \bar{w}^{*})^{1/2} (w - w_{0}) - \frac{1}{\sqrt{n}} J(w_{0} + \bar{w}^{*})^{-1/2} \nabla \eta_{n}(w_{0} + \bar{w}^{**}) }^{2} - \frac{1}{2n} \norm{ \xi_{n} + o_{p}(1) }^{2} \quad (\because \text{assumption}) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ (J(w_{0})^{1/2} + o_{p}(1)) (w - w_{0}) - \frac{1}{\sqrt{n}} \left( \xi_{n} + o_{p}(1) \right) }^{2} - \frac{1}{2n} \norm{ \xi_{n} }^{2} - \frac{1}{n} \xi_{n}^{\mathrm{T}} o_{p}(1) + \frac{1}{n} o_{p}(1) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ J(w_{0})^{1/2} (w - w_{0}) - o_{p}(1) (w - w_{0}) - \frac{1}{\sqrt{n}} \xi_{n} - \frac{1}{\sqrt{n}} o_{p}(1) }^{2} - \frac{1}{2n} \norm{ \xi_{n} }^{2} - \frac{1}{n} \xi_{n}^{\mathrm{T}} o_{p}(1) + \frac{1}{n} o_{p}(1) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ J(w_{0})^{1/2} (w - w_{0}) - \frac{1}{\sqrt{n}} \xi_{n} - o_{p}(1) (w - w_{0}) - \frac{1}{\sqrt{n}} o_{p}(1) }^{2} - \frac{1}{2n} \norm{ \xi_{n} }^{2} - \frac{1}{n} \xi_{n}^{\mathrm{T}} o_{p}(1) + \frac{1}{n} o_{p}(1) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \hat{\xi}_{n} \right) - o_{p}(1) (w - w_{0}) - \frac{1}{\sqrt{n}} o_{p}(1) }^{2} - \frac{1}{2n} \norm{ \xi_{n} }^{2} - \frac{1}{n} \xi_{n}^{\mathrm{T}} o_{p}(1) + \frac{1}{n} o_{p}(1) \nonumber \\ & = & \frac{ 1 }{ 2 } \norm{ J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \hat{\xi}_{n} \right) }^{2} - \left( J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right) \right)^{\mathrm{T}} \left( o_{p}(1) (w - w_{0}) - \frac{1}{\sqrt{n}} o_{p}(1) \right) + \frac{ 1 }{ 2 } \norm{ o_{p}(1) (w - w_{0}) - \frac{1}{\sqrt{n}} 
o_{p}(1) }^{2} - \frac{1}{2n} \norm{ \xi_{n} }^{2} - \frac{1}{n} \xi_{n}^{\mathrm{T}} o_{p}(1) + \frac{1}{n} o_{p}(1) \nonumber \end{eqnarray}\]Continuing
\[\begin{eqnarray} & & K_{n}(w) - \frac{ 1 }{ 2 } \norm{ J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \hat{\xi}_{n} \right) }^{2} + \frac{1}{2n} \norm{ \xi_{n} }^{2} \nonumber \\ & = & - \left( J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right) \right)^{\mathrm{T}} \left( o_{p}(1) (w - w_{0}) - o_{p}(\frac{1}{\sqrt{n}}) \right) + \frac{ 1 }{ 2 } \norm{ o_{p}(1) (w - w_{0}) - o_{p}(\frac{1}{\sqrt{n}}) }^{2} - o_{p}(\frac{1}{n}) + o_{p}(\frac{1}{n}) \nonumber \\ & = & - \left( J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right) \right)^{\mathrm{T}} o_{p}(1) (w - w_{0}) - \left( J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right) \right)^{\mathrm{T}} o_{p}(\frac{1}{\sqrt{n}}) + \frac{ 1 }{ 2 } \norm{ o_{p}(1) (w - w_{0}) - o_{p}(\frac{1}{\sqrt{n}}) }^{2} + o_{p}(\frac{1}{n}) \nonumber \\ & = & - \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right)^{\mathrm{T}} o_{p}(1) (w - w_{0}) - \left( w - w_{0} - \frac{1}{\sqrt{n}} \xi_{n} \right)^{\mathrm{T}} o_{p}(\frac{1}{\sqrt{n}}) + \frac{ 1 }{ 2 } \norm{ o_{p}(1) (w - w_{0}) - o_{p}(\frac{1}{\sqrt{n}}) }^{2} + o_{p}(\frac{1}{n}) \nonumber \\ & = & - \left( o_{p}(\frac{1}{\sqrt{n}}) - \frac{1}{\sqrt{n}} \xi_{n} \right)^{\mathrm{T}} o_{p}(1) o_{p}(\frac{1}{\sqrt{n}}) - \left( o_{p}(\frac{1}{\sqrt{n}}) - \frac{1}{\sqrt{n}} \xi_{n} \right)^{\mathrm{T}} o_{p}(\frac{1}{\sqrt{n}}) + \frac{ 1 }{ 2 } \norm{ o_{p}(1) o_{p}(\frac{1}{\sqrt{n}}) - o_{p}(\frac{1}{\sqrt{n}}) }^{2} + o_{p}(\frac{1}{n}) \quad (\because \text{ Assuming } w - w_{0} = o_{p}(1 / \sqrt{n})) \nonumber \\ & = & - o_{p}(\frac{1}{n}) + \frac{1}{\sqrt{n}} \xi_{n} o_{p}(\frac{1}{\sqrt{n}}) - o_{p}(\frac{1}{n}) + \frac{1}{\sqrt{n}} \xi_{n} o_{p}(\frac{1}{\sqrt{n}}) + \frac{ 1 }{ 2 } \norm{ o_{p}(\frac{1}{\sqrt{n}}) }^{2} + o_{p}(\frac{1}{n}) \nonumber \\ & = & o_{p}(\frac{1}{n}) \nonumber \end{eqnarray}\]In the proof of Lemma 16, we’ve shown
\[\begin{eqnarray} \mathrm{E}_{w} \left[ \frac{1}{2} \norm{ J(w_{0})^{1/2} \left( w - w_{0} - \frac{1}{\sqrt{n}} \hat{\xi}_{n} \right) }^{2} \right] & = & \frac{ \mathrm{tr}(J^{-1}I) }{ 2 n \beta } + o_{p}(\frac{1}{n}) \nonumber \\ & = & \frac{ d }{ 2 n \beta } + o_{p}(\frac{1}{n}) \quad (\because \text{lemma 14}) \end{eqnarray}\]where the last equality comes from the assumption that the true distribution is regular for the stochastic model. Hence
\[\begin{eqnarray} \mathcal{T}_{n}^{\prime}(0) & = & L_{n}(w_{0}) + \mathrm{E}_{w} \left[ K_{n}(w) \right] \nonumber \\ & = & L_{n}(w_{0}) + \frac{ d }{ 2 n \beta } - \frac{1}{2n} \norm{\xi_{n}}^{2} + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\]By Taylor’s theorem,
\[\begin{eqnarray} K(w) & = & \frac{1}{2} (w - w_{0})^{\mathrm{T}} J(w^{*}) (w - w_{0}) \nonumber \\ & = & \frac{1}{2} \mathrm{tr} \left( J(w^{*}) (w - w_{0}) (w - w_{0})^{\mathrm{T}} \right) \nonumber \end{eqnarray}\]By lemma 16,
\[\begin{eqnarray} \mathcal{G}_{n}^{\prime}(0) & = & L(w_{0}) + \mathrm{E}_{w} \left[ K(w) \right] \nonumber \\ & = & L(w_{0}) + \frac{ d }{ 2n \beta } + \frac{1}{2n} \norm{ \xi_{n}}^{2} + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathcal{G}_{n}^{\prime\prime}(0) & = & - \mathrm{E}_{X} \left[ \mathrm{E}_{w} \left[ f(X, w)^{2} \right] - \mathrm{E}_{w} \left[ f(X, w) \right]^{2} \right] \nonumber \\ & = & - \frac{1}{n \beta} \mathrm{E}_{X} \left[ \mathrm{tr} \left( J^{-1} \left( \nabla f(X, w_{0}) (\nabla f(X, w_{0}))^{\mathrm{T}} \right) \right) \right] + o_{p}(\frac{1}{n}) \nonumber \\ & = & - \frac{1}{n \beta} \mathrm{tr} \left( I J^{-1} \right) + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathcal{T}_{n}^{\prime\prime}(0) & = & - \frac{1}{n} \sum_{i=1}^{n} \left( \mathrm{E}_{w} \left[ f(X_{i}, w)^{2} \right] - \mathrm{E}_{w} \left[ f(X_{i}, w) \right]^{2} \right) \nonumber \end{eqnarray}\]$\Box$
Theorem 4
Let
\[\begin{eqnarray} \lambda & := & \frac{d}{2}, \nonumber \\ \nu & := & \frac{1}{2} \mathrm{tr} \left( IJ^{-1} \right) . \nonumber \end{eqnarray}\]If $\beta = 1$,
\[\begin{eqnarray} (G_{n} - L(w_{0})) + \left( T_{n} + \frac{2 \nu}{n} - L_{n}(w_{0}) \right) & = & \frac{ 2 \lambda }{ n } + o_{p}(\frac{1}{n}) . \nonumber \end{eqnarray}\]Moreover,
\[\begin{eqnarray} \mathrm{E} \left[ G_{n} \right] & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} + \nu \right) + o(\frac{1}{n}) \nonumber \\ \mathrm{E} \left[ T_{n} \right] & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} - \nu \right) + o(\frac{1}{n}) . \nonumber \end{eqnarray}\]proof
\[\begin{eqnarray} \mathrm{E} \left[ G_{n} \right] & = & L(w_{0}) + \frac{1}{n} \left( \frac{d}{2 \beta} + \frac{1}{2} \mathrm{E} \left[ \norm{\xi_{n}}^{2} \right] - \frac{1}{2 \beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda}{\beta} + \frac{1}{2} \left( \mathrm{tr} \left( IJ^{-1} \right) + o(1) \right) - \frac{\nu}{\beta} \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} + \nu + \frac{o(1)}{2} \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} + \nu \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] + \frac{o(1)}{2n} \nonumber \end{eqnarray}\] \[\begin{eqnarray} \mathrm{E} \left[ T_{n} \right] & = & L(w_{0}) + \frac{1}{n} \left( \frac{d}{2 \beta} - \frac{1}{2} \mathrm{E} \left[ \norm{\xi_{n}}^{2} \right] - \frac{1}{2 \beta} \mathrm{tr} \left( IJ^{-1} \right) \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda}{\beta} - \frac{1}{2} \left( \mathrm{tr} \left( IJ^{-1} \right) + o(1) \right) - \frac{\nu}{\beta} \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} - \nu - \frac{o(1)}{2} \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] \nonumber \\ & = & L(w_{0}) + \frac{1}{n} \left( \frac{\lambda - \nu}{\beta} - \nu \right) + \mathrm{E} \left[ o_{p}(\frac{1}{n}) \right] + \frac{o(1)}{2n} \nonumber \end{eqnarray}\]$\Box$
Remark 26
By theorem 4, if $\beta = 1$, $G_{n} - L(w_{0})$ and \(T_{n} + 2 \nu/n - L_{n}(w_{0})\) have the same expectation and variance.
If $q(x)$ is realizable, $S = L(w_{0})$. By Lemma 14, if $q(x)$ is realizable and regular, \(\mathrm{tr}(IJ^{-1}) = d\). We have $\nu = \lambda$ under those assumptions. Therefore, when $\beta = 1$,
\[\begin{eqnarray} \mathrm{E} \left[ G_{n} \right] & = & S + \frac{d}{2n} + o(\frac{1}{n}) \nonumber \\ \mathrm{E} \left[ T_{n} \right] & = & S - \frac{d}{2n} + o(\frac{1}{n}) . \nonumber \end{eqnarray}\]If $q(x)$ is not realizable and $\beta = 1$, we have
\[\begin{eqnarray} \mathrm{E} \left[ G_{n} \right] & = & L(w_{0}) + \frac{\lambda}{n} + o(\frac{1}{n}) \nonumber \\ \mathrm{E} \left[ T_{n} \right] & = & L(w_{0}) + \frac{\lambda - 2 \nu}{n} + o(\frac{1}{n}) \nonumber \end{eqnarray}\]■