\(
\newcommand{\A} {\mathbb{A}}
\newcommand{\rma} {\mathrm{a}}
\newcommand{\rmA} {\mathrm{A}}
\newcommand{\ba} {\mathbf{a}}
\newcommand{\bA} {\mathbf{A}}
\newcommand{\cA} {\mathcal{A}}
\newcommand{\bcA} {\boldsymbol{\cA}}
\newcommand{\tnsrA} {\underline{\bA}}
\newcommand{\wta} {\widetilde{a}}
\newcommand{\wtA} {\widetilde{A}}
\newcommand{\wtrma} {\widetilde{\rma}}
\newcommand{\wtrmA} {\widetilde{\rmA}}
\newcommand{\wtba} {\widetilde{\ba}}
\newcommand{\wtbA} {\widetilde{\bA}}
\newcommand{\wha} {\widehat{a}}
\newcommand{\whA} {\widehat{A}}
\newcommand{\whrma} {\widehat{\rma}}
\newcommand{\whrmA} {\widehat{\rmA}}
\newcommand{\whba} {\widehat{\ba}}
\newcommand{\whbA} {\widehat{\bA}}
\newcommand{\whcA} {\widehat{\cA}}
\newcommand{\whtnsrA} {\widehat{\tnsrA}}
\newcommand{\B} {\mathbb{B}}
\newcommand{\rmb} {\mathrm{b}}
\newcommand{\rmB} {\mathrm{B}}
\newcommand{\bb} {\mathbf{b}}
\newcommand{\bB} {\mathbf{B}}
\newcommand{\cB} {\mathcal{B}}
\newcommand{\bcB} {\boldsymbol{\cB}}
\newcommand{\tnsrB} {\underline{\bB}}
\newcommand{\wtb} {\widetilde{b}}
\newcommand{\wtB} {\widetilde{B}}
\newcommand{\wtrmb} {\widetilde{\rmb}}
\newcommand{\wtrmB} {\widetilde{\rmB}}
\newcommand{\wtbb} {\widetilde{\bb}}
\newcommand{\wtbB} {\widetilde{\bB}}
\newcommand{\whb} {\widehat{b}}
\newcommand{\whB} {\widehat{B}}
\newcommand{\whrmb} {\widehat{\rmb}}
\newcommand{\whrmB} {\widehat{\rmB}}
\newcommand{\whbb} {\widehat{\bb}}
\newcommand{\whbB} {\widehat{\bB}}
\newcommand{\whcB} {\widehat{\cB}}
\newcommand{\whtnsrB} {\widehat{\tnsrB}}
\newcommand{\C} {\mathbb{C}}
\newcommand{\rmc} {\mathrm{c}}
\newcommand{\rmC} {\mathrm{C}}
\newcommand{\bc} {\mathbf{c}}
\newcommand{\bC} {\mathbf{C}}
\newcommand{\cC} {\mathcal{C}}
\newcommand{\bcC} {\boldsymbol{\cC}}
\newcommand{\tnsrC} {\underline{\bC}}
\newcommand{\wtc} {\widetilde{c}}
\newcommand{\wtC} {\widetilde{C}}
\newcommand{\wtrmc} {\widetilde{\rmc}}
\newcommand{\wtrmC} {\widetilde{\rmC}}
\newcommand{\wtbc} {\widetilde{\bc}}
\newcommand{\wtbC} {\widetilde{\bC}}
\newcommand{\whc} {\widehat{c}}
\newcommand{\whC} {\widehat{C}}
\newcommand{\whrmc} {\widehat{\rmc}}
\newcommand{\whrmC} {\widehat{\rmC}}
\newcommand{\whbc} {\widehat{\bc}}
\newcommand{\whbC} {\widehat{\bC}}
\newcommand{\whcC} {\widehat{\cC}}
\newcommand{\whtnsrC} {\widehat{\tnsrC}}
\newcommand{\D} {\mathbb{D}}
\newcommand{\rmd} {\mathrm{d}}
\newcommand{\rmD} {\mathrm{D}}
\newcommand{\bd} {\mathbf{d}}
\newcommand{\bD} {\mathbf{D}}
\newcommand{\cD} {\mathcal{D}}
\newcommand{\bcD} {\boldsymbol{\cD}}
\newcommand{\tnsrD} {\underline{\bD}}
\newcommand{\wtd} {\widetilde{d}}
\newcommand{\wtD} {\widetilde{D}}
\newcommand{\wtrmd} {\widetilde{\rmd}}
\newcommand{\wtrmD} {\widetilde{\rmD}}
\newcommand{\wtbd} {\widetilde{\bd}}
\newcommand{\wtbD} {\widetilde{\bD}}
\newcommand{\whd} {\widehat{d}}
\newcommand{\whD} {\widehat{D}}
\newcommand{\whrmd} {\widehat{\rmd}}
\newcommand{\whrmD} {\widehat{\rmD}}
\newcommand{\whbd} {\widehat{\bd}}
\newcommand{\whbD} {\widehat{\bD}}
\newcommand{\whcD} {\widehat{\cD}}
\newcommand{\whtnsrD} {\widehat{\tnsrD}}
\newcommand{\E} {\mathbb{E}}
\newcommand{\rme} {\mathrm{e}}
\newcommand{\rmE} {\mathrm{E}}
\newcommand{\be} {\mathbf{e}}
\newcommand{\bE} {\mathbf{E}}
\newcommand{\cE} {\mathcal{E}}
\newcommand{\bcE} {\boldsymbol{\cE}}
\newcommand{\tnsrE} {\underline{\bE}}
\newcommand{\wte} {\widetilde{e}}
\newcommand{\wtE} {\widetilde{E}}
\newcommand{\wtrme} {\widetilde{\rme}}
\newcommand{\wtrmE} {\widetilde{\rmE}}
\newcommand{\wtbe} {\widetilde{\be}}
\newcommand{\wtbE} {\widetilde{\bE}}
\newcommand{\whe} {\widehat{e}}
\newcommand{\whE} {\widehat{E}}
\newcommand{\whrme} {\widehat{\rme}}
\newcommand{\whrmE} {\widehat{\rmE}}
\newcommand{\whbe} {\widehat{\be}}
\newcommand{\whbE} {\widehat{\bE}}
\newcommand{\whcE} {\widehat{\cE}}
\newcommand{\whtnsrE} {\widehat{\tnsrE}}
\newcommand{\F} {\mathbb{F}}
\newcommand{\rmf} {\mathrm{f}}
\newcommand{\rmF} {\mathrm{F}}
\newcommand{\bff} {\mathbf{f}}
\newcommand{\bF} {\mathbf{F}}
\newcommand{\cF} {\mathcal{F}}
\newcommand{\bcF} {\boldsymbol{\cF}}
\newcommand{\tnsrF} {\underline{\bF}}
\newcommand{\wtf} {\widetilde{f}}
\newcommand{\wtF} {\widetilde{F}}
\newcommand{\wtrmf} {\widetilde{\rmf}}
\newcommand{\wtrmF} {\widetilde{\rmF}}
\newcommand{\wtbf} {\widetilde{\bf}}
\newcommand{\wtbF} {\widetilde{\bF}}
\newcommand{\whf} {\widehat{f}}
\newcommand{\whF} {\widehat{F}}
\newcommand{\whrmf} {\widehat{\rmf}}
\newcommand{\whrmF} {\widehat{\rmF}}
\newcommand{\whbf} {\widehat{\bf}}
\newcommand{\whbF} {\widehat{\bF}}
\newcommand{\whcF} {\widehat{\cF}}
\newcommand{\whtnsrF} {\widehat{\tnsrF}}
\newcommand{\G} {\mathbb{G}}
\newcommand{\rmg} {\mathrm{g}}
\newcommand{\rmG} {\mathrm{G}}
\newcommand{\bg} {\mathbf{g}}
\newcommand{\bG} {\mathbf{G}}
\newcommand{\cG} {\mathcal{G}}
\newcommand{\bcG} {\boldsymbol{\cG}}
\newcommand{\tnsrG} {\underline{\bG}}
\newcommand{\wtg} {\widetilde{g}}
\newcommand{\wtG} {\widetilde{G}}
\newcommand{\wtrmg} {\widetilde{\rmg}}
\newcommand{\wtrmG} {\widetilde{\rmG}}
\newcommand{\wtbg} {\widetilde{\bg}}
\newcommand{\wtbG} {\widetilde{\bG}}
\newcommand{\whg} {\widehat{g}}
\newcommand{\whG} {\widehat{G}}
\newcommand{\whrmg} {\widehat{\rmg}}
\newcommand{\whrmG} {\widehat{\rmG}}
\newcommand{\whbg} {\widehat{\bg}}
\newcommand{\whbG} {\widehat{\bG}}
\newcommand{\whcG} {\widehat{\cG}}
\newcommand{\whtnsrG} {\widehat{\tnsrG}}
\newcommand{\bbH} {\mathbb{H}}
\newcommand{\rmh} {\mathrm{h}}
\newcommand{\rmH} {\mathrm{H}}
\newcommand{\bh} {\mathbf{h}}
\newcommand{\bH} {\mathbf{H}}
\newcommand{\cH} {\mathcal{H}}
\newcommand{\bcH} {\boldsymbol{\cH}}
\newcommand{\tnsrH} {\underline{\bH}}
\newcommand{\wth} {\widetilde{h}}
\newcommand{\wtH} {\widetilde{H}}
\newcommand{\wtrmh} {\widetilde{\rmh}}
\newcommand{\wtrmH} {\widetilde{\rmH}}
\newcommand{\wtbh} {\widetilde{\bh}}
\newcommand{\wtbH} {\widetilde{\bH}}
\newcommand{\whh} {\widehat{h}}
\newcommand{\whH} {\widehat{H}}
\newcommand{\whrmh} {\widehat{\rmh}}
\newcommand{\whrmH} {\widehat{\rmH}}
\newcommand{\whbh} {\widehat{\bh}}
\newcommand{\whbH} {\widehat{\bH}}
\newcommand{\whcH} {\widehat{\cH}}
\newcommand{\whtnsrH} {\widehat{\tnsrH}}
\newcommand{\I} {\mathbb{I}}
\newcommand{\rmi} {\mathrm{i}}
\newcommand{\rmI} {\mathrm{I}}
\newcommand{\bi} {\mathbf{i}}
\newcommand{\bI} {\mathbf{I}}
\newcommand{\cI} {\mathcal{I}}
\newcommand{\bcI} {\boldsymbol{\cI}}
\newcommand{\tnsrI} {\underline{\bI}}
\newcommand{\wti} {\widetilde{i}}
\newcommand{\wtI} {\widetilde{I}}
\newcommand{\wtrmi} {\widetilde{\rmi}}
\newcommand{\wtrmI} {\widetilde{\rmI}}
\newcommand{\wtbi} {\widetilde{\bi}}
\newcommand{\wtbI} {\widetilde{\bI}}
\newcommand{\whi} {\widehat{i}}
\newcommand{\whI} {\widehat{I}}
\newcommand{\whrmi} {\widehat{\rmi}}
\newcommand{\whrmI} {\widehat{\rmI}}
\newcommand{\whbi} {\widehat{\bi}}
\newcommand{\whbI} {\widehat{\bI}}
\newcommand{\whcI} {\widehat{\cI}}
\newcommand{\whtnsrI} {\widehat{\tnsrI}}
\newcommand{\J} {\mathbb{J}}
\newcommand{\rmj} {\mathrm{j}}
\newcommand{\rmJ} {\mathrm{J}}
\newcommand{\bj} {\mathbf{j}}
\newcommand{\bJ} {\mathbf{J}}
\newcommand{\cJ} {\mathcal{J}}
\newcommand{\bcJ} {\boldsymbol{\cJ}}
\newcommand{\tnsrJ} {\underline{\bJ}}
\newcommand{\wtj} {\widetilde{j}}
\newcommand{\wtJ} {\widetilde{J}}
\newcommand{\wtrmj} {\widetilde{\rmj}}
\newcommand{\wtrmJ} {\widetilde{\rmJ}}
\newcommand{\wtbj} {\widetilde{\bj}}
\newcommand{\wtbJ} {\widetilde{\bJ}}
\newcommand{\whj} {\widehat{j}}
\newcommand{\whJ} {\widehat{J}}
\newcommand{\whrmj} {\widehat{\rmj}}
\newcommand{\whrmJ} {\widehat{\rmJ}}
\newcommand{\whbj} {\widehat{\bj}}
\newcommand{\whbJ} {\widehat{\bJ}}
\newcommand{\whcJ} {\widehat{\cJ}}
\newcommand{\whtnsrJ} {\widehat{\tnsrJ}}
\newcommand{\K} {\mathbb{K}}
\newcommand{\rmk} {\mathrm{k}}
\newcommand{\rmK} {\mathrm{K}}
\newcommand{\bk} {\mathbf{k}}
\newcommand{\bK} {\mathbf{K}}
\newcommand{\cK} {\mathcal{K}}
\newcommand{\bcK} {\boldsymbol{\cK}}
\newcommand{\tnsrK} {\underline{\bK}}
\newcommand{\wtk} {\widetilde{k}}
\newcommand{\wtK} {\widetilde{K}}
\newcommand{\wtrmk} {\widetilde{\rmk}}
\newcommand{\wtrmK} {\widetilde{\rmK}}
\newcommand{\wtbk} {\widetilde{\bk}}
\newcommand{\wtbK} {\widetilde{\bK}}
\newcommand{\whk} {\widehat{k}}
\newcommand{\whK} {\widehat{K}}
\newcommand{\whrmk} {\widehat{\rmk}}
\newcommand{\whrmK} {\widehat{\rmK}}
\newcommand{\whbk} {\widehat{\bk}}
\newcommand{\whbK} {\widehat{\bK}}
\newcommand{\whcK} {\widehat{\cK}}
\newcommand{\whtnsrK} {\widehat{\tnsrK}}
\newcommand{\bbL} {\mathbb{L}}
\newcommand{\rml} {\mathrm{l}}
\newcommand{\rmL} {\mathrm{L}}
\newcommand{\bl} {\mathbf{l}}
\newcommand{\bL} {\mathbf{L}}
\newcommand{\cL} {\mathcal{L}}
\newcommand{\bcL} {\boldsymbol{\cL}}
\newcommand{\tnsrL} {\underline{\bL}}
\newcommand{\wtl} {\widetilde{l}}
\newcommand{\wtL} {\widetilde{L}}
\newcommand{\wtrml} {\widetilde{\rml}}
\newcommand{\wtrmL} {\widetilde{\rmL}}
\newcommand{\wtbl} {\widetilde{\bl}}
\newcommand{\wtbL} {\widetilde{\bL}}
\newcommand{\whl} {\widehat{l}}
\newcommand{\whL} {\widehat{L}}
\newcommand{\whrml} {\widehat{\rml}}
\newcommand{\whrmL} {\widehat{\rmL}}
\newcommand{\whbl} {\widehat{\bl}}
\newcommand{\whbL} {\widehat{\bL}}
\newcommand{\whcL} {\widehat{\cL}}
\newcommand{\whtnsrL} {\widehat{\tnsrL}}
\newcommand{\M} {\mathbb{M}}
\newcommand{\rmm} {\mathrm{m}}
\newcommand{\rmM} {\mathrm{M}}
\newcommand{\bm} {\mathbf{m}}
\newcommand{\bM} {\mathbf{M}}
\newcommand{\cM} {\mathcal{M}}
\newcommand{\bcM} {\boldsymbol{\cM}}
\newcommand{\tnsrM} {\underline{\bM}}
\newcommand{\wtm} {\widetilde{m}}
\newcommand{\wtM} {\widetilde{M}}
\newcommand{\wtrmm} {\widetilde{\rmm}}
\newcommand{\wtrmM} {\widetilde{\rmM}}
\newcommand{\wtbm} {\widetilde{\bm}}
\newcommand{\wtbM} {\widetilde{\bM}}
\newcommand{\whm} {\widehat{m}}
\newcommand{\whM} {\widehat{M}}
\newcommand{\whrmm} {\widehat{\rmm}}
\newcommand{\whrmM} {\widehat{\rmM}}
\newcommand{\whbm} {\widehat{\bm}}
\newcommand{\whbM} {\widehat{\bM}}
\newcommand{\whcM} {\widehat{\cM}}
\newcommand{\whtnsrM} {\widehat{\tnsrM}}
\newcommand{\N} {\mathbb{N}}
\newcommand{\rmn} {\mathrm{n}}
\newcommand{\rmN} {\mathrm{N}}
\newcommand{\bn} {\mathbf{n}}
\newcommand{\bN} {\mathbf{N}}
\newcommand{\cN} {\mathcal{N}}
\newcommand{\bcN} {\boldsymbol{\cN}}
\newcommand{\tnsrN} {\underline{\bN}}
\newcommand{\wtn} {\widetilde{n}}
\newcommand{\wtN} {\widetilde{N}}
\newcommand{\wtrmn} {\widetilde{\rmn}}
\newcommand{\wtrmN} {\widetilde{\rmN}}
\newcommand{\wtbn} {\widetilde{\bn}}
\newcommand{\wtbN} {\widetilde{\bN}}
\newcommand{\whn} {\widehat{n}}
\newcommand{\whN} {\widehat{N}}
\newcommand{\whrmn} {\widehat{\rmn}}
\newcommand{\whrmN} {\widehat{\rmN}}
\newcommand{\whbn} {\widehat{\bn}}
\newcommand{\whbN} {\widehat{\bN}}
\newcommand{\whcN} {\widehat{\cN}}
\newcommand{\whtnsrN} {\widehat{\tnsrN}}
\newcommand{\bbO} {\mathbb{O}}
\newcommand{\rmo} {\mathrm{o}}
\newcommand{\rmO} {\mathrm{O}}
\newcommand{\bo} {\mathbf{o}}
\newcommand{\bO} {\mathbf{O}}
\newcommand{\cO} {\mathcal{O}}
\newcommand{\bcO} {\boldsymbol{\cO}}
\newcommand{\tnsrO} {\underline{\bO}}
\newcommand{\wto} {\widetilde{o}}
\newcommand{\wtO} {\widetilde{O}}
\newcommand{\wtrmo} {\widetilde{\rmo}}
\newcommand{\wtrmO} {\widetilde{\rmO}}
\newcommand{\wtbo} {\widetilde{\bo}}
\newcommand{\wtbO} {\widetilde{\bO}}
\newcommand{\who} {\widehat{o}}
\newcommand{\whO} {\widehat{O}}
\newcommand{\whrmo} {\widehat{\rmo}}
\newcommand{\whrmO} {\widehat{\rmO}}
\newcommand{\whbo} {\widehat{\bo}}
\newcommand{\whbO} {\widehat{\bO}}
\newcommand{\whcO} {\widehat{\cO}}
\newcommand{\whtnsrO} {\widehat{\tnsrO}}
\newcommand{\bbP} {\mathbb{P}}
\newcommand{\rmp} {\mathrm{p}}
\newcommand{\rmP} {\mathrm{P}}
\newcommand{\bp} {\mathbf{p}}
\newcommand{\bP} {\mathbf{P}}
\newcommand{\cP} {\mathcal{P}}
\newcommand{\bcP} {\boldsymbol{\cP}}
\newcommand{\tnsrP} {\underline{\bP}}
\newcommand{\wtp} {\widetilde{p}}
\newcommand{\wtP} {\widetilde{P}}
\newcommand{\wtrmp} {\widetilde{\rmp}}
\newcommand{\wtrmP} {\widetilde{\rmP}}
\newcommand{\wtbp} {\widetilde{\bp}}
\newcommand{\wtbP} {\widetilde{\bP}}
\newcommand{\whp} {\widehat{p}}
\newcommand{\whP} {\widehat{P}}
\newcommand{\whrmp} {\widehat{\rmp}}
\newcommand{\whrmP} {\widehat{\rmP}}
\newcommand{\whbp} {\widehat{\bp}}
\newcommand{\whbP} {\widehat{\bP}}
\newcommand{\whcP} {\widehat{\cP}}
\newcommand{\whtnsrP} {\widehat{\tnsrP}}
\newcommand{\Q} {\mathbb{Q}}
\newcommand{\rmq} {\mathrm{q}}
\newcommand{\rmQ} {\mathrm{Q}}
\newcommand{\bq} {\mathbf{q}}
\newcommand{\bQ} {\mathbf{Q}}
\newcommand{\cQ} {\mathcal{Q}}
\newcommand{\bcQ} {\boldsymbol{\cQ}}
\newcommand{\tnsrQ} {\underline{\bQ}}
\newcommand{\wtq} {\widetilde{q}}
\newcommand{\wtQ} {\widetilde{Q}}
\newcommand{\wtrmq} {\widetilde{\rmq}}
\newcommand{\wtrmQ} {\widetilde{\rmQ}}
\newcommand{\wtbq} {\widetilde{\bq}}
\newcommand{\wtbQ} {\widetilde{\bQ}}
\newcommand{\whq} {\widehat{q}}
\newcommand{\whQ} {\widehat{Q}}
\newcommand{\whrmq} {\widehat{\rmq}}
\newcommand{\whrmQ} {\widehat{\rmQ}}
\newcommand{\whbq} {\widehat{\bq}}
\newcommand{\whbQ} {\widehat{\bQ}}
\newcommand{\whcQ} {\widehat{\cQ}}
\newcommand{\whtnsrQ} {\widehat{\tnsrQ}}
\newcommand{\R} {\mathbb{R}}
\newcommand{\rmr} {\mathrm{r}}
\newcommand{\rmR} {\mathrm{R}}
\newcommand{\br} {\mathbf{r}}
\newcommand{\bR} {\mathbf{R}}
\newcommand{\cR} {\mathcal{R}}
\newcommand{\bcR} {\boldsymbol{\cR}}
\newcommand{\tnsrR} {\underline{\bR}}
\newcommand{\wtr} {\widetilde{r}}
\newcommand{\wtR} {\widetilde{R}}
\newcommand{\wtrmr} {\widetilde{\rmr}}
\newcommand{\wtrmR} {\widetilde{\rmR}}
\newcommand{\wtbr} {\widetilde{\br}}
\newcommand{\wtbR} {\widetilde{\bR}}
\newcommand{\whr} {\widehat{r}}
\newcommand{\whR} {\widehat{R}}
\newcommand{\whrmr} {\widehat{\rmr}}
\newcommand{\whrmR} {\widehat{\rmR}}
\newcommand{\whbr} {\widehat{\br}}
\newcommand{\whbR} {\widehat{\bR}}
\newcommand{\whcR} {\widehat{\cR}}
\newcommand{\whtnsrR} {\widehat{\tnsrR}}
\newcommand{\bbS} {\mathbb{S}}
\newcommand{\rms} {\mathrm{s}}
\newcommand{\rmS} {\mathrm{S}}
\newcommand{\bs} {\mathbf{s}}
\newcommand{\bS} {\mathbf{S}}
\newcommand{\cS} {\mathcal{S}}
\newcommand{\bcS} {\boldsymbol{\cS}}
\newcommand{\tnsrS} {\underline{\bS}}
\newcommand{\wts} {\widetilde{s}}
\newcommand{\wtS} {\widetilde{S}}
\newcommand{\wtrms} {\widetilde{\rms}}
\newcommand{\wtrmS} {\widetilde{\rmS}}
\newcommand{\wtbs} {\widetilde{\bs}}
\newcommand{\wtbS} {\widetilde{\bS}}
\newcommand{\whs} {\widehat{s}}
\newcommand{\whS} {\widehat{S}}
\newcommand{\whrms} {\widehat{\rms}}
\newcommand{\whrmS} {\widehat{\rmS}}
\newcommand{\whbs} {\widehat{\bs}}
\newcommand{\whbS} {\widehat{\bS}}
\newcommand{\whcS} {\widehat{\cS}}
\newcommand{\whtnsrS} {\widehat{\tnsrS}}
\newcommand{\T} {\mathbb{T}}
\newcommand{\rmt} {\mathrm{t}}
\newcommand{\rmT} {\mathrm{T}}
\newcommand{\bt} {\mathbf{t}}
\newcommand{\bT} {\mathbf{T}}
\newcommand{\cT} {\mathcal{T}}
\newcommand{\bcT} {\boldsymbol{\cT}}
\newcommand{\tnsrT} {\underline{\bT}}
\newcommand{\wtt} {\widetilde{t}}
\newcommand{\wtT} {\widetilde{T}}
\newcommand{\wtrmt} {\widetilde{\rmt}}
\newcommand{\wtrmT} {\widetilde{\rmT}}
\newcommand{\wtbt} {\widetilde{\bt}}
\newcommand{\wtbT} {\widetilde{\bT}}
\newcommand{\wht} {\widehat{t}}
\newcommand{\whT} {\widehat{T}}
\newcommand{\whrmt} {\widehat{\rmt}}
\newcommand{\whrmT} {\widehat{\rmT}}
\newcommand{\whbt} {\widehat{\bt}}
\newcommand{\whbT} {\widehat{\bT}}
\newcommand{\whcT} {\widehat{\cT}}
\newcommand{\whtnsrT} {\widehat{\tnsrT}}
\newcommand{\U} {\mathbb{U}}
\newcommand{\rmu} {\mathrm{u}}
\newcommand{\rmU} {\mathrm{U}}
\newcommand{\bu} {\mathbf{u}}
\newcommand{\bU} {\mathbf{U}}
\newcommand{\cU} {\mathcal{U}}
\newcommand{\bcU} {\boldsymbol{\cU}}
\newcommand{\tnsrU} {\underline{\bU}}
\newcommand{\wtu} {\widetilde{u}}
\newcommand{\wtU} {\widetilde{U}}
\newcommand{\wtrmu} {\widetilde{\rmu}}
\newcommand{\wtrmU} {\widetilde{\rmU}}
\newcommand{\wtbu} {\widetilde{\bu}}
\newcommand{\wtbU} {\widetilde{\bU}}
\newcommand{\whu} {\widehat{u}}
\newcommand{\whU} {\widehat{U}}
\newcommand{\whrmu} {\widehat{\rmu}}
\newcommand{\whrmU} {\widehat{\rmU}}
\newcommand{\whbu} {\widehat{\bu}}
\newcommand{\whbU} {\widehat{\bU}}
\newcommand{\whcU} {\widehat{\cU}}
\newcommand{\whtnsrU} {\widehat{\tnsrU}}
\newcommand{\V} {\mathbb{V}}
\newcommand{\rmv} {\mathrm{v}}
\newcommand{\rmV} {\mathrm{V}}
\newcommand{\bv} {\mathbf{v}}
\newcommand{\bV} {\mathbf{V}}
\newcommand{\cV} {\mathcal{V}}
\newcommand{\bcV} {\boldsymbol{\cV}}
\newcommand{\tnsrV} {\underline{\bV}}
\newcommand{\wtv} {\widetilde{v}}
\newcommand{\wtV} {\widetilde{V}}
\newcommand{\wtrmv} {\widetilde{\rmv}}
\newcommand{\wtrmV} {\widetilde{\rmV}}
\newcommand{\wtbv} {\widetilde{\bv}}
\newcommand{\wtbV} {\widetilde{\bV}}
\newcommand{\whv} {\widehat{v}}
\newcommand{\whV} {\widehat{V}}
\newcommand{\whrmv} {\widehat{\rmv}}
\newcommand{\whrmV} {\widehat{\rmV}}
\newcommand{\whbv} {\widehat{\bv}}
\newcommand{\whbV} {\widehat{\bV}}
\newcommand{\whcV} {\widehat{\cV}}
\newcommand{\whtnsrV} {\widehat{\tnsrV}}
\newcommand{\W} {\mathbb{W}}
\newcommand{\rmw} {\mathrm{w}}
\newcommand{\rmW} {\mathrm{W}}
\newcommand{\bw} {\mathbf{w}}
\newcommand{\bW} {\mathbf{W}}
\newcommand{\cW} {\mathcal{W}}
\newcommand{\bcW} {\boldsymbol{\cW}}
\newcommand{\tnsrW} {\underline{\bW}}
\newcommand{\wtw} {\widetilde{w}}
\newcommand{\wtW} {\widetilde{W}}
\newcommand{\wtrmw} {\widetilde{\rmw}}
\newcommand{\wtrmW} {\widetilde{\rmW}}
\newcommand{\wtbw} {\widetilde{\bw}}
\newcommand{\wtbW} {\widetilde{\bW}}
\newcommand{\whw} {\widehat{w}}
\newcommand{\whW} {\widehat{W}}
\newcommand{\whrmw} {\widehat{\rmw}}
\newcommand{\whrmW} {\widehat{\rmW}}
\newcommand{\whbw} {\widehat{\bw}}
\newcommand{\whbW} {\widehat{\bW}}
\newcommand{\whcW} {\widehat{\cW}}
\newcommand{\whtnsrW} {\widehat{\tnsrW}}
\newcommand{\X} {\mathbb{X}}
\newcommand{\rmx} {\mathrm{x}}
\newcommand{\rmX} {\mathrm{X}}
\newcommand{\bx} {\mathbf{x}}
\newcommand{\bX} {\mathbf{X}}
\newcommand{\cX} {\mathcal{X}}
\newcommand{\bcX} {\boldsymbol{\cX}}
\newcommand{\tnsrX} {\underline{\bX}}
\newcommand{\wtx} {\widetilde{x}}
\newcommand{\wtX} {\widetilde{X}}
\newcommand{\wtrmx} {\widetilde{\rmx}}
\newcommand{\wtrmX} {\widetilde{\rmX}}
\newcommand{\wtbx} {\widetilde{\bx}}
\newcommand{\wtbX} {\widetilde{\bX}}
\newcommand{\whx} {\widehat{x}}
\newcommand{\whX} {\widehat{X}}
\newcommand{\whrmx} {\widehat{\rmx}}
\newcommand{\whrmX} {\widehat{\rmX}}
\newcommand{\whbx} {\widehat{\bx}}
\newcommand{\whbX} {\widehat{\bX}}
\newcommand{\whcX} {\widehat{\cX}}
\newcommand{\whtnsrX} {\widehat{\tnsrX}}
\newcommand{\Y} {\mathbb{Y}}
\newcommand{\rmy} {\mathrm{y}}
\newcommand{\rmY} {\mathrm{Y}}
\newcommand{\by} {\mathbf{y}}
\newcommand{\bY} {\mathbf{Y}}
\newcommand{\cY} {\mathcal{Y}}
\newcommand{\bcY} {\boldsymbol{\cY}}
\newcommand{\tnsrY} {\underline{\bY}}
\newcommand{\wty} {\widetilde{y}}
\newcommand{\wtY} {\widetilde{Y}}
\newcommand{\wtrmy} {\widetilde{\rmy}}
\newcommand{\wtrmY} {\widetilde{\rmY}}
\newcommand{\wtby} {\widetilde{\by}}
\newcommand{\wtbY} {\widetilde{\bY}}
\newcommand{\why} {\widehat{y}}
\newcommand{\whY} {\widehat{Y}}
\newcommand{\whrmy} {\widehat{\rmy}}
\newcommand{\whrmY} {\widehat{\rmY}}
\newcommand{\whby} {\widehat{\by}}
\newcommand{\whbY} {\widehat{\bY}}
\newcommand{\whcY} {\widehat{\cY}}
\newcommand{\whtnsrY} {\widehat{\tnsrY}}
\newcommand{\Z} {\mathbb{Z}}
\newcommand{\rmz} {\mathrm{z}}
\newcommand{\rmZ} {\mathrm{Z}}
\newcommand{\bz} {\mathbf{z}}
\newcommand{\bZ} {\mathbf{Z}}
\newcommand{\cZ} {\mathcal{Z}}
\newcommand{\bcZ} {\boldsymbol{\cZ}}
\newcommand{\tnsrZ} {\underline{\bZ}}
\newcommand{\wtz} {\widetilde{z}}
\newcommand{\wtZ} {\widetilde{Z}}
\newcommand{\wtrmz} {\widetilde{\rmz}}
\newcommand{\wtrmZ} {\widetilde{\rmZ}}
\newcommand{\wtbz} {\widetilde{\bz}}
\newcommand{\wtbZ} {\widetilde{\bZ}}
\newcommand{\whz} {\widehat{z}}
\newcommand{\whZ} {\widehat{Z}}
\newcommand{\whrmz} {\widehat{\rmz}}
\newcommand{\whrmZ} {\widehat{\rmZ}}
\newcommand{\whbz} {\widehat{\bz}}
\newcommand{\whbZ} {\widehat{\bZ}}
\newcommand{\whcZ} {\widehat{\cZ}}
\newcommand{\whtnsrZ} {\widehat{\tnsrZ}}
\newcommand{\balpha} {\boldsymbol{\alpha}}
\newcommand{\wtalpha} {\widetilde{\alpha}}
\newcommand{\whalpha} {\widehat{\alpha}}
\newcommand{\wtbalpha} {\widetilde{\balpha}}
\newcommand{\whbalpha} {\widehat{\balpha}}
\newcommand{\bbeta} {\boldsymbol{\beta}}
\newcommand{\wtbeta} {\widetilde{\beta}}
\newcommand{\whbeta} {\widehat{\beta}}
\newcommand{\wtbbeta} {\widetilde{\bbeta}}
\newcommand{\whbbeta} {\widehat{\bbeta}}
\newcommand{\bgamma} {\boldsymbol{\gamma}}
\newcommand{\wtgamma} {\widetilde{\gamma}}
\newcommand{\whgamma} {\widehat{\gamma}}
\newcommand{\wtbgamma} {\widetilde{\bgamma}}
\newcommand{\whbgamma} {\widehat{\bgamma}}
\newcommand{\bdelta} {\boldsymbol{\delta}}
\newcommand{\wtdelta} {\widetilde{\delta}}
\newcommand{\whdelta} {\widehat{\delta}}
\newcommand{\wtbdelta} {\widetilde{\bdelta}}
\newcommand{\whbdelta} {\widehat{\bdelta}}
\newcommand{\bepsilon} {\boldsymbol{\epsilon}}
\newcommand{\wtepsilon} {\widetilde{\epsilon}}
\newcommand{\whepsilon} {\widehat{\epsilon}}
\newcommand{\wtbepsilon}{\widetilde{\bepsilon}}
\newcommand{\whbepsilon}{\widehat{\bepsilon}}
\newcommand{\veps} {\varepsilon}
\newcommand{\bveps} {\boldsymbol{\veps}}
\newcommand{\wtveps} {\widetilde{\veps}}
\newcommand{\whveps} {\widehat{\veps}}
\newcommand{\wtbveps} {\widetilde{\bveps}}
\newcommand{\whbveps} {\widehat{\bveps}}
\newcommand{\bEta} {\boldsymbol{\eta}}
\newcommand{\wteta} {\widetilde{\eta}}
\newcommand{\wheta} {\widehat{\eta}}
\newcommand{\wtbEta} {\widetilde{\bEta}}
\newcommand{\whbEta} {\widehat{\bEta}}
\newcommand{\btheta} {\boldsymbol{\theta}}
\newcommand{\wttheta} {\widetilde{\theta}}
\newcommand{\whtheta} {\widehat{\theta}}
\newcommand{\wtbtheta} {\widetilde{\btheta}}
\newcommand{\whbtheta} {\widehat{\btheta}}
\newcommand{\bvtheta} {\boldsymbol{\vartheta}}
\newcommand{\wtvtheta} {\widetilde{\vartheta}}
\newcommand{\whvtheta} {\widehat{\vartheta}}
\newcommand{\wtbvtheta} {\widetilde{\bvtheta}}
\newcommand{\whbvtheta} {\widehat{\bvtheta}}
\newcommand{\biota} {\boldsymbol{\iota}}
\newcommand{\wtiota} {\widetilde{\iota}}
\newcommand{\whiota} {\widehat{\iota}}
\newcommand{\wtbiota} {\widetilde{\biota}}
\newcommand{\whbiota} {\widehat{\biota}}
\newcommand{\bkappa} {\boldsymbol{\kappa}}
\newcommand{\wtkappa} {\widetilde{\kappa}}
\newcommand{\whkappa} {\widehat{\kappa}}
\newcommand{\wtbkappa} {\widetilde{\bkappa}}
\newcommand{\whbkappa} {\widehat{\bkappa}}
\newcommand{\blambda} {\boldsymbol{\lambda}}
\newcommand{\wtlambda} {\widetilde{\lambda}}
\newcommand{\whlambda} {\widehat{\lambda}}
\newcommand{\wtblambda} {\widetilde{\blambda}}
\newcommand{\whblambda} {\widehat{\blambda}}
\newcommand{\bmu} {\boldsymbol{\mu}}
\newcommand{\wtmu} {\widetilde{\mu}}
\newcommand{\whmu} {\widehat{\mu}}
\newcommand{\wtbmu} {\widetilde{\bmu}}
\newcommand{\whbmu} {\widehat{\bmu}}
\newcommand{\bnu} {\boldsymbol{\nu}}
\newcommand{\wtnu} {\widetilde{\nu}}
\newcommand{\whnu} {\widehat{\nu}}
\newcommand{\wtbnu} {\widetilde{\bnu}}
\newcommand{\whbnu} {\widehat{\bnu}}
\newcommand{\bxi} {\boldsymbol{\xi}}
\newcommand{\wtxi} {\widetilde{\xi}}
\newcommand{\whxi} {\widehat{\xi}}
\newcommand{\wtbxi} {\widetilde{\bxi}}
\newcommand{\whbxi} {\widehat{\bxi}}
\newcommand{\bpi} {\boldsymbol{\pi}}
\newcommand{\wtpi} {\widetilde{\pi}}
\newcommand{\whpi} {\widehat{\pi}}
\newcommand{\wtbpi} {\widetilde{\bpi}}
\newcommand{\whbpi} {\widehat{\bpi}}
\newcommand{\bvpi} {\boldsymbol{\varpi}}
\newcommand{\wtvpi} {\widetilde{\varpi}}
\newcommand{\whvpi} {\widehat{\varpi}}
\newcommand{\wtbvpi} {\widetilde{\bvpi}}
\newcommand{\whbvpi} {\widehat{\bvpi}}
\newcommand{\brho} {\boldsymbol{\rho}}
\newcommand{\wtrho} {\widetilde{\rho}}
\newcommand{\whrho} {\widehat{\rho}}
\newcommand{\wtbrho} {\widetilde{\brho}}
\newcommand{\whbrho} {\widehat{\brho}}
\newcommand{\bvrho} {\boldsymbol{\varrho}}
\newcommand{\wtvrho} {\widetilde{\varrho}}
\newcommand{\whvrho} {\widehat{\varrho}}
\newcommand{\wtbvrho} {\widetilde{\bvrho}}
\newcommand{\whbvrho} {\widehat{\bvrho}}
\newcommand{\bsigma} {\boldsymbol{\sigma}}
\newcommand{\wtsigma} {\widetilde{\sigma}}
\newcommand{\whsigma} {\widehat{\sigma}}
\newcommand{\wtbsigma} {\widetilde{\bsigma}}
\newcommand{\whbsigma} {\widehat{\bsigma}}
\newcommand{\bvsigma} {\boldsymbol{\varsigma}}
\newcommand{\wtvsigma} {\widetilde{\varsigma}}
\newcommand{\whvsigma} {\widehat{\varsigma}}
\newcommand{\wtbvsigma} {\widetilde{\bvsigma}}
\newcommand{\whbvsigma} {\widehat{\bvsigma}}
\newcommand{\btau} {\boldsymbol{\tau}}
\newcommand{\wttau} {\widetilde{\tau}}
\newcommand{\whtau} {\widehat{\tau}}
\newcommand{\wtbtau} {\widetilde{\btau}}
\newcommand{\whbtau} {\widehat{\btau}}
\newcommand{\bupsilon} {\boldsymbol{\upsilon}}
\newcommand{\wtupsilon} {\widetilde{\upsilon}}
\newcommand{\whupsilon} {\widehat{\upsilon}}
\newcommand{\wtbupsilon}{\widetilde{\bupsilon}}
\newcommand{\whbupsilon}{\widehat{\bupsilon}}
\newcommand{\bzeta} {\boldsymbol{\zeta}}
\newcommand{\wtzeta} {\widetilde{\zeta}}
\newcommand{\whzeta} {\widehat{\zeta}}
\newcommand{\wtbzeta}{\widetilde{\bzeta}}
\newcommand{\whbzeta}{\widehat{\bzeta}}
\newcommand{\bphi} {\boldsymbol{\phi}}
\newcommand{\wtphi} {\widetilde{\phi}}
\newcommand{\whphi} {\widehat{\phi}}
\newcommand{\wtbphi} {\widetilde{\bphi}}
\newcommand{\whbphi} {\widehat{\bphi}}
\newcommand{\bvphi} {\boldsymbol{\varphi}}
\newcommand{\wtvphi} {\widetilde{\varphi}}
\newcommand{\whvphi} {\widehat{\varphi}}
\newcommand{\wtbvphi} {\widetilde{\bvphi}}
\newcommand{\whbvphi} {\widehat{\bvphi}}
\newcommand{\bchi} {\boldsymbol{\chi}}
\newcommand{\wtchi} {\widetilde{\chi}}
\newcommand{\whchi} {\widehat{\chi}}
\newcommand{\wtbchi} {\widetilde{\bchi}}
\newcommand{\whbchi} {\widehat{\bchi}}
\newcommand{\bpsi} {\boldsymbol{\psi}}
\newcommand{\wtpsi} {\widetilde{\psi}}
\newcommand{\whpsi} {\widehat{\psi}}
\newcommand{\wtbpsi} {\widetilde{\bpsi}}
\newcommand{\whbpsi} {\widehat{\bpsi}}
\newcommand{\bomega} {\boldsymbol{\omega}}
\newcommand{\wtomega} {\widetilde{\omega}}
\newcommand{\whomega} {\widehat{\omega}}
\newcommand{\wtbomega} {\widetilde{\bomega}}
\newcommand{\whbomega} {\widehat{\bomega}}
\newcommand{\bGamma} {\boldsymbol{\Gamma}}
\newcommand{\wtGamma} {\widetilde{\Gamma}}
\newcommand{\whGamma} {\widehat{\Gamma}}
\newcommand{\wtbGamma} {\widetilde{\bGamma}}
\newcommand{\whbGamma} {\widehat{\bGamma}}
\newcommand{\bDelta} {\boldsymbol{\Delta}}
\newcommand{\wtDelta} {\widetilde{\Delta}}
\newcommand{\whDelta} {\widehat{\Delta}}
\newcommand{\wtbDelta} {\widetilde{\bDelta}}
\newcommand{\whbDelta} {\widehat{\bDelta}}
\newcommand{\bLambda} {\boldsymbol{\Lambda}}
\newcommand{\wtLambda} {\widetilde{\Lambda}}
\newcommand{\whLambda} {\widehat{\Lambda}}
\newcommand{\wtbLambda} {\widetilde{\bLambda}}
\newcommand{\whbLambda} {\widehat{\bLambda}}
\newcommand{\bXi} {\boldsymbol{\Xi}}
\newcommand{\wtXi} {\widetilde{\Xi}}
\newcommand{\whXi} {\widehat{\Xi}}
\newcommand{\wtbXi} {\widetilde{\bXi}}
\newcommand{\whbXi} {\widehat{\bXi}}
\newcommand{\bPi} {\boldsymbol{\Pi}}
\newcommand{\wtPi} {\widetilde{\Pi}}
\newcommand{\whPi} {\widehat{\Pi}}
\newcommand{\wtbPi} {\widetilde{\bPi}}
\newcommand{\whbPi} {\widehat{\bPi}}
\newcommand{\bSigma} {\boldsymbol{\Sigma}}
\newcommand{\wtSigma} {\widetilde{\Sigma}}
\newcommand{\whSigma} {\widehat{\Sigma}}
\newcommand{\wtbSigma} {\widetilde{\bSigma}}
\newcommand{\whbSigma} {\widehat{\bSigma}}
\newcommand{\bUpsilon} {\boldsymbol{\Upsilon}}
\newcommand{\wtUpsilon} {\widetilde{\Upsilon}}
\newcommand{\whUpsilon} {\widehat{\Upsilon}}
\newcommand{\wtbUpsilon}{\widetilde{\bUpsilon}}
\newcommand{\whbUpsilon}{\widehat{\bUpsilon}}
\newcommand{\bPhi} {\boldsymbol{\Phi}}
\newcommand{\wtPhi} {\widetilde{\Phi}}
\newcommand{\whPhi} {\widehat{\Phi}}
\newcommand{\wtbPhi} {\widetilde{\bPhi}}
\newcommand{\whbPhi} {\widehat{\bPhi}}
\newcommand{\bPsi} {\boldsymbol{\Psi}}
\newcommand{\wtPsi} {\widetilde{\Psi}}
\newcommand{\whPsi} {\widehat{\Psi}}
\newcommand{\wtbPsi} {\widetilde{\bPsi}}
\newcommand{\whbPsi} {\widehat{\bPsi}}
\newcommand{\bOmega} {\boldsymbol{\Omega}}
\newcommand{\wtOmega} {\widetilde{\Omega}}
\newcommand{\whOmega} {\widehat{\Omega}}
\newcommand{\wtbOmega} {\widetilde{\bOmega}}
\newcommand{\whbOmega} {\widehat{\bOmega}}
\newcommand{\bzero} {\mathbf{0}}
\newcommand{\bone} {\mathbf{1}}
\newcommand{\norm}[1]{\left\lVert #1 \right\rVert}
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
\newcommand{\ceil}[1]{\left \lceil #1 \right\rceil}
\newcommand{\argmin}{\operatorname*{arg\,min}}
\newcommand{\argmax}{\operatorname*{arg\,max}}
\)
We introduced the basics of the stochastic gradient descent (SGD) algorithm and its implementation on Python with the MNIST dataset as a test case in our article Mastering Stochastic Gradient Descent algorithm with the MNIST Dataset. We can see that stochastic gradient descent is one of the most powerful algorithms used in the world of machine learning.
We prefer the use of the SGD algorithm over Gradient Descent (GD) solely for one main purpose the data is too large to handle for our machine, so instead of processing the entire batch of samples for learning, we process sample by sample to update our algorithm.
A key thing to remember: We use SGD/GD algorithms in machine learning to minimize the loss function associated with our machine learning problem (the problem can be classification or regression). The loss function $f(\cdot)$, however, can have certain attributes, is it convex, non-convex, smooth, non-smooth, etc. The nature of this function $f(\cdot)$ governs the performance of the SGD algorithm and its effect on the rate of convergence.
Suggested Read: Convergence analysis of Stochastic Gradient Descent (SGD) when a function is L-smooth and non-convex
In this article, we will investigate the proof of convergence when our loss function $f(\cdot)$ is convex.
Preliminaries
Let’s quickly recap a few things before jumping onto the convergence analysis. The stochastic gradient descent algorithm is given as:
Algorithm - Stochastic Gradient Descent (SGD)
Input: A random vector $\bx_0 \in \R^n$ and a decaying step size $\gamma_t > 0$;
1. for $t=1,2,\ldots,$ do
2. $\qquad$ Evaluate the stochastic gradient mapping at $\bx_t$, i.e., evaluate $\nabla F(\bx_t, \xi_t)$;
3. $\qquad$ Do the following update:
$\qquad$ $$ \bx_{t+1} := \cP_{\cX} \left(\bx_t - \gamma_t \nabla F(\bx_t, \xi_t) \right)$$
4. end for
In the above algorithm, $\bx_t$ is the update of the objective variable $\bx$ at time $t$, $\nabla f(\bx_t)$ is the gradient of the loss function $f$, and $\cP_{\cX}$ is the projection operator. We use the projection to ensure that the update of our algorithm doesn’t cause the objective variable $\bx$ to lie outside the feasible set, and if it does, the operator projects the update $\bx_{t+1}$ back to the feasible set. The details of the projection operator is given as:
Projection Operator:
We will use some properties of the projection operator in the update rule in our convergence analysis. So, we will take a brief detour towards the projection operator.
The idea of the projection operator is to project back the update iterate to the convex set by finding the point in the convex set having minimum Euclidian distance. Mathematically, it is defined as:
$$\cP_X \left( \whbx \right) = \argmin_{\bx \in X} \norm{\bx- \whbx}_2^2.$$
Properties of Projection Operator:
Let set $X \subseteq \R^n$ be a nonempty closed convex set. Consider the projection mapping $\cP_X: \R^n \to X$. The projection operator has the following properties:
- Non-expansive:
$$\begin{align*}
\norm{\cP_X(\bx) – \cP_X(\by)}_2 \leq \norm{\bx – \by}_2 \qquad \hbox{for all } \bx,\by \in \R^n.
\end{align*}$$
- Distance: The distance is a convex function in terms of $\bx$ from a set $X$ is given by:
$$ dist(\bx,X)= \norm{\cP_X (\bx) – \bx}_2$$
The Stochastic Optimization Problem
The fundamental idea of the stochastic gradient descent algorithm is to find the optimal solution of the following stochastic optimization problem:
$$
\begin{equation*}
\begin{aligned}
\text{minimize} & \quad f(\bx) {\triangleq \mathbb{E} [F(\bx,\bxi)]} \\
\text{subject to:} & \quad \bx \in X.
\end{aligned}
\end{equation*}
$$
Where $f : \R^n \to \R$ is an unknown deterministic function, $F: \R^n \times \Omega \to \R$ is a known stochastic function, $\bxi: \Omega \to \R^d$ is a random variable (this is a random data sample), and $\mathbb{E}[\cdot]$ denotes the expectation operator taking an expectation due to the random variable $\bxi$.
Concept of Filtration:
In algorithms like stochastic gradient descent, we process sample by sample. Hence, they arrive randomly, and in convergence analysis, we need to keep track of the history of these random samples. This is denoted by a set known as Filtration, which contains the random samples till iteration $t$. Mathematically, it is defined as:
$$ \cF_t \triangleq \{ \bx_0,\bxi_0,\bxi_1,\ldots,\bxi_{t-1} \} \qquad \hbox{for all } t \geq 1,$$
and $\cF_0 \triangleq {x_0}$. Since whenever randomness is involved, we need expectations to remove that. So, we define the conditional expectation given the filtration as $\E[\bullet\mid \cF_t]$.
The main idea of using conditional expectation is that in our Filtration, if we know $\bxi_{t-1}$, then we know what $\bx_t$ i.e., by knowing the randomness of $\bxi_{t-1}$ we can find the $\bx_t$ by using the update rule of the algorithm (Step 3). This is how filtration and conditional expectation is useful in removing the randomness, and we can characterize the next iterate $\bx_{t+1}$.
Apart from Filtration, we need the following two assumptions to prove the convergence rates for the SGD algorithm:
Assumptions:
- $\E [ \nabla F(\bx,\bxi) \mid \bx] = \nabla f (\bx) $ for all $\bx \in X$.
- $\E[\norm{\nabla F(\bx,\bxi)-\nabla f (\bx)}^2 \mid \bx] \leq \sigma^2 $ for all $\bx \in X$ for some $\sigma>0$.
This assumption $1$ implies that the stochastic gradient is an unbiased estimator of the true gradient of the objective function, and assumption $2$
Main Theorem
Consider the stochastic optimization problem and SGD Algorithm defined above. Let Assumption 1 hold. Let $X \subset \R^n$. Suppose $X$ is nonempty, compact, and convex. Assume that $f: \R^n \to \R$ is continuously differentiable and convex.
Let the step size be diminishing given by:
$$ \gamma_t :=\frac{\gamma_0}{\sqrt{t+1}} \qquad \hbox{for all } t \geq 0.$$
Consider the averaged iterate defined by:
$$ \bar x_T \triangleq \frac{\sum_{t=0}^{T-1} x_t}{T} \qquad \hbox{for all } T \geq 1.$$
Then, there exists some scalars $M>0$ and $C>0$ and an optimal solution vector $x^* \in X$ such that:
$$\E [f (\bar x_T)] – f(x^*) \leq \left( \frac{M}{2 \gamma_0} + \gamma_0 \left(C^2 + \sigma^2 \right)\right) \frac{1}{\sqrt{T}} \qquad \hbox{for all } T \geq 2.$$
Proof:
From the update rule of the algorithm (Step 3) and by using the non-expansivity of the projection operator, we obtain:
$$\begin{align*}
\norm{ \bx_{t+1} – \bx^*}^2 &\leq \norm{\cP_X (\bx_t – \gamma_t \nabla F(\bx_t,\bxi_t)) – \cP_X (\bx^*)}^2 \\
&\leq \norm{ \bx_t – \bx^*}^2 + \gamma_t^2 \norm{\nabla F(\bx_t,\bxi_t)}^2 – 2 \gamma_t \nabla F(\bx_t,\bxi_t)^T(\bx_t-\bx^*).
\end{align*}$$
We now define stochastic error as $\bw_t \triangleq \nabla F(\bx_t,\bxi_t)- \nabla f (\bx_t)$ for all $t \geq 0$. Invoking this by replacing $\nabla F(\bx_t,\bxi_t)$ for $\nabla f(\bx_t)+\bw_t$ we obtain:
$$\begin{align*}
\norm{\bx_{t+1}-\bx^*}^2 &\leq \norm{\bx_k – x^*}^2 + \gamma_t^2 \norm{\nabla f(\bx_t) + \bw_t}^2 – 2 \gamma_t \left(\nabla f(\bx_t) + \bw_t \right)^T (\bx_t- \bx^*) \\
&= \norm{\bx_t – \bx^*}^2 + \gamma_t^2 \left(\norm{\nabla f(\bx_t)}^2 + \norm{\bw_t}^2 + 2 \nabla f(\bx_t)^T \bw_t \right) – 2 \gamma_t \left(\nabla f(\bx_t) + \bw_t \right)^T (\bx_t-\bx^*).
\end{align*}$$
Taking conditional expectation w.r.t Filtration operator on both sides, we obtain:
$$\begin{align*}
\E[\norm{\bx_{t+1}-\bx^*}^2 \mid \cF_t] &\leq \norm{\bx_k – \bx^*}^2 + \gamma_t^2 \left(\norm{\nabla f(\bx_t)}^2 + \E[\norm{\bw_t}^2 \mid \cF_t] + 2 \nabla f(\bx_t)^T \E[\bw_t \mid \cF_t] \right) \\
&- 2 \gamma_t \left( \nabla f(\bx_t) + \E[\bw_t \mid \cF_t] \right)^T (\bx_t-\bx^*).
\end{align*}$$
We now briefly detour and define the following Lemma, which captures the statistical properties of stochastic errors.
Lemma 1: Consider the definition of stochastic error $\bw_t$ and under the defined assumption 1 we have for all $t\geq 0$:
$$\begin{align*}
&\E[\bw_t \mid \cF_t] = \mathbf{0}_n \\
&\E[\norm{\bw_t}^2 \mid \cF_t] \leq \sigma^2.
\end{align*}$$
(Now continuing the proof): Since $X$ is compact and that $f$ is continuous (due to convexity), there exists $C>0$ such that $\sup_{\bx \in X}\norm{\nabla f(\bx)} \leq C$. Invoking Assumption 1 and Lemma 1, we obtain:
$$\begin{align*}
\E[\norm{\bx_{t+1} – \bx^*}^2 \mid \cF_t] &\leq \norm{\bx_t – \bx^*}^2 + \gamma_t^2 \left( C^2 + \sigma^2 \right) – 2 \gamma_t \nabla f(\bx_t)^T (\bx_t-\bx^*).
\end{align*}$$
From convexity of $f$, we have that $ \nabla f(\bx_t)^T(\bx_t-\bx^*) \geq f(\bx_t) -f(\bx^*)$. We obtain:
$$\begin{align*}
\E[\norm{\bx_{t+1} – \bx^*}^2 \mid \cF_t] &\leq \norm{\bx_k – \bx^*}^2 + \gamma_t^2 \left(C^2 + \sigma^2 \right) -2 \gamma_t(f(\bx_t) – f(\bx^*)).
\end{align*}$$
Dividing both sides by $2\gamma_t$ and rearranging the terms, we obtain:
$$\begin{align*}
f(\bx_t) – f(\bx^*) \leq \frac{1}{2\gamma_t} \left(\norm{\bx_{t} -\bx^*}^2 – \E[\norm{\bx_{t+1}-\bx^*}^2 \mid \cF_t]\right) + \frac{\gamma_t}{2} \left(C^2 + \sigma^2 \right) .
\end{align*}$$
Taking expectation w.r.t $\cF_t$ on both sides and using the tower property of expectation, we obtain:
$$\begin{align*}
\E[f(\bx_t)] – f(\bx^*) &\leq \frac{1}{2\gamma_t} \left( \E[\norm{\bx_t – \bx^*}^2] – \E[\E[\norm{\bx_{t+1}-\bx^*}^2 \mid \cF_t]] \right) + \frac{\gamma_t}{2} \left(C^2 + \sigma^2 \right) \\
&= \frac{1}{2\gamma_t} \left( \E[\norm{\bx_k – \bx^*}^2] – \E[\norm{\bx_{t+1} – \bx^*}^2] \right) + \frac{\gamma_t}{2} \left(C^2 + \sigma^2 \right).
\end{align*}$$
Adding and subtracting $\frac{1}{2\gamma_{t-1}} \E[\norm{\bx_t – \bx^*}^2]$, we obtain:
From boundedness of the set $X$, there exists a scalar $M>0$ such that $\max_{x \in X}|x-x^*|^2 \leq M$. Thus, we have:
$$\begin{align*}
\E [f(\bx_t)] – f(\bx^*) &\leq \left(\frac{1}{2\gamma_{t-1}} \E[\norm{\bx_t – \bx^*}^2] – \frac{1}{2 \gamma_{t}}\E[ \norm{\bx_{t+1} -\bx^*}^2] \right) \\
&+ \left( \frac{1}{2 \gamma_t} – \frac{1}{2 \gamma_{t-1}} \right) M + \frac{\gamma_t}{2} \left( C^2 + \sigma^2 \right).
\end{align*}$$
Summing both sides over $t$ and dividing by $T$. On L.H.S., we can note that the function is convex, so we can apply Jensen’s inequality
$$\begin{align*}
\sum_{t=0}^T \E [f(\bx_t)] – Tf(\bx^*) &\leq \sum_{t=0}^T \left(\frac{1}{2\gamma_{t-1}} \E[\norm{\bx_t – \bx^*}^2] – \frac{1}{2 \gamma_{t}}\E[ \norm{\bx_{t+1} -\bx^*}^2] \right) \\
&+ \sum_{t=0}^T \left( \frac{1}{2 \gamma_t} – \frac{1}{2 \gamma_{t-1}} \right) M + \sum_{t=0}^T \frac{\gamma_t}{2} \left( C^2 + \sigma^2 \right). \\
\E f\left[ \frac{1}{T} \sum_{t=0}^T \bx_t \right] – f(\bx^*) &\leq \frac{1}{T} \sum_{t=0}^T \left[ \frac{1}{2 \gamma_{t-1}} \E [ \norm{\bx_t – \bx^*}^2] – \frac{1}{2\gamma_t} \E[\norm{\bx_{t+1} – \bx^*}^2] \right] \\ &+ \frac{1}{T} \sum_{t=0}^T \left( \frac{1}{2\gamma_t} – \frac{1}{2 \gamma_{t-1}} \right)M + \frac{C^2 + \sigma^2}{2T} \sum_{t=0}^T \gamma_t
\end{align*}$$
Now we apply the fact that \bar x_T = \frac{1}{T} \sum_{t=0}^T \bx_t$ and $\gamma_t = \frac{\gamma_0}{\sqrt{t+1}}$
$$\begin{align*}
\E f(\bar x_T) – f( \bx^* ) &\leq \frac{1}{T} \sum_{t=0}^T \left[ \frac{\sqrt{T} \E[ \norm{\bx_t – \bx^*}^2] – \sqrt{T+1} \E[\norm{\bx_{t+1} – \bx^*}^2]}{2 \gamma_0} \right] \\ &+ \frac{1}{T}\sum_{t=0}^T \left( \frac{\sqrt{T+1} – \sqrt{T}}{2 \gamma_0} \right)M + \frac{C^2 + \sigma^2}{2T} \sum_{t=0}^T \gamma_t
\end{align*}$$
By noticing the telescoping series and applying the inequality $\sum_{t=0}^T \gamma_t \leq 2\gamma_0 \sqrt{T}$ and further noticing the fact that:
$$\frac{M}{2\gamma_0} + \gamma_0 \left( C^2 + \sigma^2 \right) \geq \frac{M}{2 \gamma_0} + \gamma_0 \left( C^2 + \sigma^2 \right) – \frac{\E[\norm{\bx_t – \bx^*}^2]}{2\gamma_0}$$
We obtain:
$$\begin{align*}
\E \left[f(\bar x_T)\right] – f( \bx^* ) &\leq \frac{1}{\sqrt{T}} \left[ \frac{M}{2\gamma_0} + \gamma_0 \left(C^2 + \sigma^2 \right) \right] \qquad \quad \forall \quad T \geq 2
\end{align*}$$
This concludes the proof.