\( \newcommand{\A} {\mathbb{A}} \newcommand{\rma} {\mathrm{a}} \newcommand{\rmA} {\mathrm{A}} \newcommand{\ba} {\mathbf{a}} \newcommand{\bA} {\mathbf{A}} \newcommand{\cA} {\mathcal{A}} \newcommand{\bcA} {\boldsymbol{\cA}} \newcommand{\tnsrA} {\underline{\bA}} \newcommand{\wta} {\widetilde{a}} \newcommand{\wtA} {\widetilde{A}} \newcommand{\wtrma} {\widetilde{\rma}} \newcommand{\wtrmA} {\widetilde{\rmA}} \newcommand{\wtba} {\widetilde{\ba}} \newcommand{\wtbA} {\widetilde{\bA}} \newcommand{\wha} {\widehat{a}} \newcommand{\whA} {\widehat{A}} \newcommand{\whrma} {\widehat{\rma}} \newcommand{\whrmA} {\widehat{\rmA}} \newcommand{\whba} {\widehat{\ba}} \newcommand{\whbA} {\widehat{\bA}} \newcommand{\whcA} {\widehat{\cA}} \newcommand{\whtnsrA} {\widehat{\tnsrA}} \newcommand{\B} {\mathbb{B}} \newcommand{\rmb} {\mathrm{b}} \newcommand{\rmB} {\mathrm{B}} \newcommand{\bb} {\mathbf{b}} \newcommand{\bB} {\mathbf{B}} \newcommand{\cB} {\mathcal{B}} \newcommand{\bcB} {\boldsymbol{\cB}} \newcommand{\tnsrB} {\underline{\bB}} \newcommand{\wtb} {\widetilde{b}} \newcommand{\wtB} {\widetilde{B}} \newcommand{\wtrmb} {\widetilde{\rmb}} \newcommand{\wtrmB} {\widetilde{\rmB}} \newcommand{\wtbb} {\widetilde{\bb}} \newcommand{\wtbB} {\widetilde{\bB}} \newcommand{\whb} {\widehat{b}} \newcommand{\whB} {\widehat{B}} \newcommand{\whrmb} {\widehat{\rmb}} \newcommand{\whrmB} {\widehat{\rmB}} \newcommand{\whbb} {\widehat{\bb}} \newcommand{\whbB} {\widehat{\bB}} \newcommand{\whcB} {\widehat{\cB}} \newcommand{\whtnsrB} {\widehat{\tnsrB}} \newcommand{\C} {\mathbb{C}} \newcommand{\rmc} {\mathrm{c}} \newcommand{\rmC} {\mathrm{C}} \newcommand{\bc} {\mathbf{c}} \newcommand{\bC} {\mathbf{C}} \newcommand{\cC} {\mathcal{C}} \newcommand{\bcC} {\boldsymbol{\cC}} \newcommand{\tnsrC} {\underline{\bC}} \newcommand{\wtc} {\widetilde{c}} \newcommand{\wtC} {\widetilde{C}} \newcommand{\wtrmc} {\widetilde{\rmc}} \newcommand{\wtrmC} {\widetilde{\rmC}} \newcommand{\wtbc} {\widetilde{\bc}} \newcommand{\wtbC} {\widetilde{\bC}} \newcommand{\whc} {\widehat{c}} \newcommand{\whC} {\widehat{C}} \newcommand{\whrmc} {\widehat{\rmc}} \newcommand{\whrmC} {\widehat{\rmC}} \newcommand{\whbc} {\widehat{\bc}} \newcommand{\whbC} {\widehat{\bC}} \newcommand{\whcC} {\widehat{\cC}} \newcommand{\whtnsrC} {\widehat{\tnsrC}} \newcommand{\D} {\mathbb{D}} \newcommand{\rmd} {\mathrm{d}} \newcommand{\rmD} {\mathrm{D}} \newcommand{\bd} {\mathbf{d}} \newcommand{\bD} {\mathbf{D}} \newcommand{\cD} {\mathcal{D}} \newcommand{\bcD} {\boldsymbol{\cD}} \newcommand{\tnsrD} {\underline{\bD}} \newcommand{\wtd} {\widetilde{d}} \newcommand{\wtD} {\widetilde{D}} \newcommand{\wtrmd} {\widetilde{\rmd}} \newcommand{\wtrmD} {\widetilde{\rmD}} \newcommand{\wtbd} {\widetilde{\bd}} \newcommand{\wtbD} {\widetilde{\bD}} \newcommand{\whd} {\widehat{d}} \newcommand{\whD} {\widehat{D}} \newcommand{\whrmd} {\widehat{\rmd}} \newcommand{\whrmD} {\widehat{\rmD}} \newcommand{\whbd} {\widehat{\bd}} \newcommand{\whbD} {\widehat{\bD}} \newcommand{\whcD} {\widehat{\cD}} \newcommand{\whtnsrD} {\widehat{\tnsrD}} \newcommand{\E} {\mathbb{E}} \newcommand{\rme} {\mathrm{e}} \newcommand{\rmE} {\mathrm{E}} \newcommand{\be} {\mathbf{e}} \newcommand{\bE} {\mathbf{E}} \newcommand{\cE} {\mathcal{E}} \newcommand{\bcE} {\boldsymbol{\cE}} \newcommand{\tnsrE} {\underline{\bE}} \newcommand{\wte} {\widetilde{e}} \newcommand{\wtE} {\widetilde{E}} \newcommand{\wtrme} {\widetilde{\rme}} \newcommand{\wtrmE} {\widetilde{\rmE}} \newcommand{\wtbe} {\widetilde{\be}} \newcommand{\wtbE} {\widetilde{\bE}} \newcommand{\whe} {\widehat{e}} \newcommand{\whE} {\widehat{E}} \newcommand{\whrme} {\widehat{\rme}} \newcommand{\whrmE} {\widehat{\rmE}} \newcommand{\whbe} {\widehat{\be}} \newcommand{\whbE} {\widehat{\bE}} \newcommand{\whcE} {\widehat{\cE}} \newcommand{\whtnsrE} {\widehat{\tnsrE}} \newcommand{\F} {\mathbb{F}} \newcommand{\rmf} {\mathrm{f}} \newcommand{\rmF} {\mathrm{F}} \newcommand{\bff} {\mathbf{f}} \newcommand{\bF} {\mathbf{F}} \newcommand{\cF} {\mathcal{F}} \newcommand{\bcF} {\boldsymbol{\cF}} \newcommand{\tnsrF} {\underline{\bF}} \newcommand{\wtf} {\widetilde{f}} \newcommand{\wtF} {\widetilde{F}} \newcommand{\wtrmf} {\widetilde{\rmf}} \newcommand{\wtrmF} {\widetilde{\rmF}} \newcommand{\wtbf} {\widetilde{\bf}} \newcommand{\wtbF} {\widetilde{\bF}} \newcommand{\whf} {\widehat{f}} \newcommand{\whF} {\widehat{F}} \newcommand{\whrmf} {\widehat{\rmf}} \newcommand{\whrmF} {\widehat{\rmF}} \newcommand{\whbf} {\widehat{\bf}} \newcommand{\whbF} {\widehat{\bF}} \newcommand{\whcF} {\widehat{\cF}} \newcommand{\whtnsrF} {\widehat{\tnsrF}} \newcommand{\G} {\mathbb{G}} \newcommand{\rmg} {\mathrm{g}} \newcommand{\rmG} {\mathrm{G}} \newcommand{\bg} {\mathbf{g}} \newcommand{\bG} {\mathbf{G}} \newcommand{\cG} {\mathcal{G}} \newcommand{\bcG} {\boldsymbol{\cG}} \newcommand{\tnsrG} {\underline{\bG}} \newcommand{\wtg} {\widetilde{g}} \newcommand{\wtG} {\widetilde{G}} \newcommand{\wtrmg} {\widetilde{\rmg}} \newcommand{\wtrmG} {\widetilde{\rmG}} \newcommand{\wtbg} {\widetilde{\bg}} \newcommand{\wtbG} {\widetilde{\bG}} \newcommand{\whg} {\widehat{g}} \newcommand{\whG} {\widehat{G}} \newcommand{\whrmg} {\widehat{\rmg}} \newcommand{\whrmG} {\widehat{\rmG}} \newcommand{\whbg} {\widehat{\bg}} \newcommand{\whbG} {\widehat{\bG}} \newcommand{\whcG} {\widehat{\cG}} \newcommand{\whtnsrG} {\widehat{\tnsrG}} \newcommand{\bbH} {\mathbb{H}} \newcommand{\rmh} {\mathrm{h}} \newcommand{\rmH} {\mathrm{H}} \newcommand{\bh} {\mathbf{h}} \newcommand{\bH} {\mathbf{H}} \newcommand{\cH} {\mathcal{H}} \newcommand{\bcH} {\boldsymbol{\cH}} \newcommand{\tnsrH} {\underline{\bH}} \newcommand{\wth} {\widetilde{h}} \newcommand{\wtH} {\widetilde{H}} \newcommand{\wtrmh} {\widetilde{\rmh}} \newcommand{\wtrmH} {\widetilde{\rmH}} \newcommand{\wtbh} {\widetilde{\bh}} \newcommand{\wtbH} {\widetilde{\bH}} \newcommand{\whh} {\widehat{h}} \newcommand{\whH} {\widehat{H}} \newcommand{\whrmh} {\widehat{\rmh}} \newcommand{\whrmH} {\widehat{\rmH}} \newcommand{\whbh} {\widehat{\bh}} \newcommand{\whbH} {\widehat{\bH}} \newcommand{\whcH} {\widehat{\cH}} \newcommand{\whtnsrH} {\widehat{\tnsrH}} \newcommand{\I} {\mathbb{I}} \newcommand{\rmi} {\mathrm{i}} \newcommand{\rmI} {\mathrm{I}} \newcommand{\bi} {\mathbf{i}} \newcommand{\bI} {\mathbf{I}} \newcommand{\cI} {\mathcal{I}} \newcommand{\bcI} {\boldsymbol{\cI}} \newcommand{\tnsrI} {\underline{\bI}} \newcommand{\wti} {\widetilde{i}} \newcommand{\wtI} {\widetilde{I}} \newcommand{\wtrmi} {\widetilde{\rmi}} \newcommand{\wtrmI} {\widetilde{\rmI}} \newcommand{\wtbi} {\widetilde{\bi}} \newcommand{\wtbI} {\widetilde{\bI}} \newcommand{\whi} {\widehat{i}} \newcommand{\whI} {\widehat{I}} \newcommand{\whrmi} {\widehat{\rmi}} \newcommand{\whrmI} {\widehat{\rmI}} \newcommand{\whbi} {\widehat{\bi}} \newcommand{\whbI} {\widehat{\bI}} \newcommand{\whcI} {\widehat{\cI}} \newcommand{\whtnsrI} {\widehat{\tnsrI}} \newcommand{\J} {\mathbb{J}} \newcommand{\rmj} {\mathrm{j}} \newcommand{\rmJ} {\mathrm{J}} \newcommand{\bj} {\mathbf{j}} \newcommand{\bJ} {\mathbf{J}} \newcommand{\cJ} {\mathcal{J}} \newcommand{\bcJ} {\boldsymbol{\cJ}} \newcommand{\tnsrJ} {\underline{\bJ}} \newcommand{\wtj} {\widetilde{j}} \newcommand{\wtJ} {\widetilde{J}} \newcommand{\wtrmj} {\widetilde{\rmj}} \newcommand{\wtrmJ} {\widetilde{\rmJ}} \newcommand{\wtbj} {\widetilde{\bj}} \newcommand{\wtbJ} {\widetilde{\bJ}} \newcommand{\whj} {\widehat{j}} \newcommand{\whJ} {\widehat{J}} \newcommand{\whrmj} {\widehat{\rmj}} \newcommand{\whrmJ} {\widehat{\rmJ}} \newcommand{\whbj} {\widehat{\bj}} \newcommand{\whbJ} {\widehat{\bJ}} \newcommand{\whcJ} {\widehat{\cJ}} \newcommand{\whtnsrJ} {\widehat{\tnsrJ}} \newcommand{\K} {\mathbb{K}} \newcommand{\rmk} {\mathrm{k}} \newcommand{\rmK} {\mathrm{K}} \newcommand{\bk} {\mathbf{k}} \newcommand{\bK} {\mathbf{K}} \newcommand{\cK} {\mathcal{K}} \newcommand{\bcK} {\boldsymbol{\cK}} \newcommand{\tnsrK} {\underline{\bK}} \newcommand{\wtk} {\widetilde{k}} \newcommand{\wtK} {\widetilde{K}} \newcommand{\wtrmk} {\widetilde{\rmk}} \newcommand{\wtrmK} {\widetilde{\rmK}} \newcommand{\wtbk} {\widetilde{\bk}} \newcommand{\wtbK} {\widetilde{\bK}} \newcommand{\whk} {\widehat{k}} \newcommand{\whK} {\widehat{K}} \newcommand{\whrmk} {\widehat{\rmk}} \newcommand{\whrmK} {\widehat{\rmK}} \newcommand{\whbk} {\widehat{\bk}} \newcommand{\whbK} {\widehat{\bK}} \newcommand{\whcK} {\widehat{\cK}} \newcommand{\whtnsrK} {\widehat{\tnsrK}} \newcommand{\bbL} {\mathbb{L}} \newcommand{\rml} {\mathrm{l}} \newcommand{\rmL} {\mathrm{L}} \newcommand{\bl} {\mathbf{l}} \newcommand{\bL} {\mathbf{L}} \newcommand{\cL} {\mathcal{L}} \newcommand{\bcL} {\boldsymbol{\cL}} \newcommand{\tnsrL} {\underline{\bL}} \newcommand{\wtl} {\widetilde{l}} \newcommand{\wtL} {\widetilde{L}} \newcommand{\wtrml} {\widetilde{\rml}} \newcommand{\wtrmL} {\widetilde{\rmL}} \newcommand{\wtbl} {\widetilde{\bl}} \newcommand{\wtbL} {\widetilde{\bL}} \newcommand{\whl} {\widehat{l}} \newcommand{\whL} {\widehat{L}} \newcommand{\whrml} {\widehat{\rml}} \newcommand{\whrmL} {\widehat{\rmL}} \newcommand{\whbl} {\widehat{\bl}} \newcommand{\whbL} {\widehat{\bL}} \newcommand{\whcL} {\widehat{\cL}} \newcommand{\whtnsrL} {\widehat{\tnsrL}} \newcommand{\M} {\mathbb{M}} \newcommand{\rmm} {\mathrm{m}} \newcommand{\rmM} {\mathrm{M}} \newcommand{\bm} {\mathbf{m}} \newcommand{\bM} {\mathbf{M}} \newcommand{\cM} {\mathcal{M}} \newcommand{\bcM} {\boldsymbol{\cM}} \newcommand{\tnsrM} {\underline{\bM}} \newcommand{\wtm} {\widetilde{m}} \newcommand{\wtM} {\widetilde{M}} \newcommand{\wtrmm} {\widetilde{\rmm}} \newcommand{\wtrmM} {\widetilde{\rmM}} \newcommand{\wtbm} {\widetilde{\bm}} \newcommand{\wtbM} {\widetilde{\bM}} \newcommand{\whm} {\widehat{m}} \newcommand{\whM} {\widehat{M}} \newcommand{\whrmm} {\widehat{\rmm}} \newcommand{\whrmM} {\widehat{\rmM}} \newcommand{\whbm} {\widehat{\bm}} \newcommand{\whbM} {\widehat{\bM}} \newcommand{\whcM} {\widehat{\cM}} \newcommand{\whtnsrM} {\widehat{\tnsrM}} \newcommand{\N} {\mathbb{N}} \newcommand{\rmn} {\mathrm{n}} \newcommand{\rmN} {\mathrm{N}} \newcommand{\bn} {\mathbf{n}} \newcommand{\bN} {\mathbf{N}} \newcommand{\cN} {\mathcal{N}} \newcommand{\bcN} {\boldsymbol{\cN}} \newcommand{\tnsrN} {\underline{\bN}} \newcommand{\wtn} {\widetilde{n}} \newcommand{\wtN} {\widetilde{N}} \newcommand{\wtrmn} {\widetilde{\rmn}} \newcommand{\wtrmN} {\widetilde{\rmN}} \newcommand{\wtbn} {\widetilde{\bn}} \newcommand{\wtbN} {\widetilde{\bN}} \newcommand{\whn} {\widehat{n}} \newcommand{\whN} {\widehat{N}} \newcommand{\whrmn} {\widehat{\rmn}} \newcommand{\whrmN} {\widehat{\rmN}} \newcommand{\whbn} {\widehat{\bn}} \newcommand{\whbN} {\widehat{\bN}} \newcommand{\whcN} {\widehat{\cN}} \newcommand{\whtnsrN} {\widehat{\tnsrN}} \newcommand{\bbO} {\mathbb{O}} \newcommand{\rmo} {\mathrm{o}} \newcommand{\rmO} {\mathrm{O}} \newcommand{\bo} {\mathbf{o}} \newcommand{\bO} {\mathbf{O}} \newcommand{\cO} {\mathcal{O}} \newcommand{\bcO} {\boldsymbol{\cO}} \newcommand{\tnsrO} {\underline{\bO}} \newcommand{\wto} {\widetilde{o}} \newcommand{\wtO} {\widetilde{O}} \newcommand{\wtrmo} {\widetilde{\rmo}} \newcommand{\wtrmO} {\widetilde{\rmO}} \newcommand{\wtbo} {\widetilde{\bo}} \newcommand{\wtbO} {\widetilde{\bO}} \newcommand{\who} {\widehat{o}} \newcommand{\whO} {\widehat{O}} \newcommand{\whrmo} {\widehat{\rmo}} \newcommand{\whrmO} {\widehat{\rmO}} \newcommand{\whbo} {\widehat{\bo}} \newcommand{\whbO} {\widehat{\bO}} \newcommand{\whcO} {\widehat{\cO}} \newcommand{\whtnsrO} {\widehat{\tnsrO}} \newcommand{\bbP} {\mathbb{P}} \newcommand{\rmp} {\mathrm{p}} \newcommand{\rmP} {\mathrm{P}} \newcommand{\bp} {\mathbf{p}} \newcommand{\bP} {\mathbf{P}} \newcommand{\cP} {\mathcal{P}} \newcommand{\bcP} {\boldsymbol{\cP}} \newcommand{\tnsrP} {\underline{\bP}} \newcommand{\wtp} {\widetilde{p}} \newcommand{\wtP} {\widetilde{P}} \newcommand{\wtrmp} {\widetilde{\rmp}} \newcommand{\wtrmP} {\widetilde{\rmP}} \newcommand{\wtbp} {\widetilde{\bp}} \newcommand{\wtbP} {\widetilde{\bP}} \newcommand{\whp} {\widehat{p}} \newcommand{\whP} {\widehat{P}} \newcommand{\whrmp} {\widehat{\rmp}} \newcommand{\whrmP} {\widehat{\rmP}} \newcommand{\whbp} {\widehat{\bp}} \newcommand{\whbP} {\widehat{\bP}} \newcommand{\whcP} {\widehat{\cP}} \newcommand{\whtnsrP} {\widehat{\tnsrP}} \newcommand{\Q} {\mathbb{Q}} \newcommand{\rmq} {\mathrm{q}} \newcommand{\rmQ} {\mathrm{Q}} \newcommand{\bq} {\mathbf{q}} \newcommand{\bQ} {\mathbf{Q}} \newcommand{\cQ} {\mathcal{Q}} \newcommand{\bcQ} {\boldsymbol{\cQ}} \newcommand{\tnsrQ} {\underline{\bQ}} \newcommand{\wtq} {\widetilde{q}} \newcommand{\wtQ} {\widetilde{Q}} \newcommand{\wtrmq} {\widetilde{\rmq}} \newcommand{\wtrmQ} {\widetilde{\rmQ}} \newcommand{\wtbq} {\widetilde{\bq}} \newcommand{\wtbQ} {\widetilde{\bQ}} \newcommand{\whq} {\widehat{q}} \newcommand{\whQ} {\widehat{Q}} \newcommand{\whrmq} {\widehat{\rmq}} \newcommand{\whrmQ} {\widehat{\rmQ}} \newcommand{\whbq} {\widehat{\bq}} \newcommand{\whbQ} {\widehat{\bQ}} \newcommand{\whcQ} {\widehat{\cQ}} \newcommand{\whtnsrQ} {\widehat{\tnsrQ}} \newcommand{\R} {\mathbb{R}} \newcommand{\rmr} {\mathrm{r}} \newcommand{\rmR} {\mathrm{R}} \newcommand{\br} {\mathbf{r}} \newcommand{\bR} {\mathbf{R}} \newcommand{\cR} {\mathcal{R}} \newcommand{\bcR} {\boldsymbol{\cR}} \newcommand{\tnsrR} {\underline{\bR}} \newcommand{\wtr} {\widetilde{r}} \newcommand{\wtR} {\widetilde{R}} \newcommand{\wtrmr} {\widetilde{\rmr}} \newcommand{\wtrmR} {\widetilde{\rmR}} \newcommand{\wtbr} {\widetilde{\br}} \newcommand{\wtbR} {\widetilde{\bR}} \newcommand{\whr} {\widehat{r}} \newcommand{\whR} {\widehat{R}} \newcommand{\whrmr} {\widehat{\rmr}} \newcommand{\whrmR} {\widehat{\rmR}} \newcommand{\whbr} {\widehat{\br}} \newcommand{\whbR} {\widehat{\bR}} \newcommand{\whcR} {\widehat{\cR}} \newcommand{\whtnsrR} {\widehat{\tnsrR}} \newcommand{\bbS} {\mathbb{S}} \newcommand{\rms} {\mathrm{s}} \newcommand{\rmS} {\mathrm{S}} \newcommand{\bs} {\mathbf{s}} \newcommand{\bS} {\mathbf{S}} \newcommand{\cS} {\mathcal{S}} \newcommand{\bcS} {\boldsymbol{\cS}} \newcommand{\tnsrS} {\underline{\bS}} \newcommand{\wts} {\widetilde{s}} \newcommand{\wtS} {\widetilde{S}} \newcommand{\wtrms} {\widetilde{\rms}} \newcommand{\wtrmS} {\widetilde{\rmS}} \newcommand{\wtbs} {\widetilde{\bs}} \newcommand{\wtbS} {\widetilde{\bS}} \newcommand{\whs} {\widehat{s}} \newcommand{\whS} {\widehat{S}} \newcommand{\whrms} {\widehat{\rms}} \newcommand{\whrmS} {\widehat{\rmS}} \newcommand{\whbs} {\widehat{\bs}} \newcommand{\whbS} {\widehat{\bS}} \newcommand{\whcS} {\widehat{\cS}} \newcommand{\whtnsrS} {\widehat{\tnsrS}} \newcommand{\T} {\mathbb{T}} \newcommand{\rmt} {\mathrm{t}} \newcommand{\rmT} {\mathrm{T}} \newcommand{\bt} {\mathbf{t}} \newcommand{\bT} {\mathbf{T}} \newcommand{\cT} {\mathcal{T}} \newcommand{\bcT} {\boldsymbol{\cT}} \newcommand{\tnsrT} {\underline{\bT}} \newcommand{\wtt} {\widetilde{t}} \newcommand{\wtT} {\widetilde{T}} \newcommand{\wtrmt} {\widetilde{\rmt}} \newcommand{\wtrmT} {\widetilde{\rmT}} \newcommand{\wtbt} {\widetilde{\bt}} \newcommand{\wtbT} {\widetilde{\bT}} \newcommand{\wht} {\widehat{t}} \newcommand{\whT} {\widehat{T}} \newcommand{\whrmt} {\widehat{\rmt}} \newcommand{\whrmT} {\widehat{\rmT}} \newcommand{\whbt} {\widehat{\bt}} \newcommand{\whbT} {\widehat{\bT}} \newcommand{\whcT} {\widehat{\cT}} \newcommand{\whtnsrT} {\widehat{\tnsrT}} \newcommand{\U} {\mathbb{U}} \newcommand{\rmu} {\mathrm{u}} \newcommand{\rmU} {\mathrm{U}} \newcommand{\bu} {\mathbf{u}} \newcommand{\bU} {\mathbf{U}} \newcommand{\cU} {\mathcal{U}} \newcommand{\bcU} {\boldsymbol{\cU}} \newcommand{\tnsrU} {\underline{\bU}} \newcommand{\wtu} {\widetilde{u}} \newcommand{\wtU} {\widetilde{U}} \newcommand{\wtrmu} {\widetilde{\rmu}} \newcommand{\wtrmU} {\widetilde{\rmU}} \newcommand{\wtbu} {\widetilde{\bu}} \newcommand{\wtbU} {\widetilde{\bU}} \newcommand{\whu} {\widehat{u}} \newcommand{\whU} {\widehat{U}} \newcommand{\whrmu} {\widehat{\rmu}} \newcommand{\whrmU} {\widehat{\rmU}} \newcommand{\whbu} {\widehat{\bu}} \newcommand{\whbU} {\widehat{\bU}} \newcommand{\whcU} {\widehat{\cU}} \newcommand{\whtnsrU} {\widehat{\tnsrU}} \newcommand{\V} {\mathbb{V}} \newcommand{\rmv} {\mathrm{v}} \newcommand{\rmV} {\mathrm{V}} \newcommand{\bv} {\mathbf{v}} \newcommand{\bV} {\mathbf{V}} \newcommand{\cV} {\mathcal{V}} \newcommand{\bcV} {\boldsymbol{\cV}} \newcommand{\tnsrV} {\underline{\bV}} \newcommand{\wtv} {\widetilde{v}} \newcommand{\wtV} {\widetilde{V}} \newcommand{\wtrmv} {\widetilde{\rmv}} \newcommand{\wtrmV} {\widetilde{\rmV}} \newcommand{\wtbv} {\widetilde{\bv}} \newcommand{\wtbV} {\widetilde{\bV}} \newcommand{\whv} {\widehat{v}} \newcommand{\whV} {\widehat{V}} \newcommand{\whrmv} {\widehat{\rmv}} \newcommand{\whrmV} {\widehat{\rmV}} \newcommand{\whbv} {\widehat{\bv}} \newcommand{\whbV} {\widehat{\bV}} \newcommand{\whcV} {\widehat{\cV}} \newcommand{\whtnsrV} {\widehat{\tnsrV}} \newcommand{\W} {\mathbb{W}} \newcommand{\rmw} {\mathrm{w}} \newcommand{\rmW} {\mathrm{W}} \newcommand{\bw} {\mathbf{w}} \newcommand{\bW} {\mathbf{W}} \newcommand{\cW} {\mathcal{W}} \newcommand{\bcW} {\boldsymbol{\cW}} \newcommand{\tnsrW} {\underline{\bW}} \newcommand{\wtw} {\widetilde{w}} \newcommand{\wtW} {\widetilde{W}} \newcommand{\wtrmw} {\widetilde{\rmw}} \newcommand{\wtrmW} {\widetilde{\rmW}} \newcommand{\wtbw} {\widetilde{\bw}} \newcommand{\wtbW} {\widetilde{\bW}} \newcommand{\whw} {\widehat{w}} \newcommand{\whW} {\widehat{W}} \newcommand{\whrmw} {\widehat{\rmw}} \newcommand{\whrmW} {\widehat{\rmW}} \newcommand{\whbw} {\widehat{\bw}} \newcommand{\whbW} {\widehat{\bW}} \newcommand{\whcW} {\widehat{\cW}} \newcommand{\whtnsrW} {\widehat{\tnsrW}} \newcommand{\X} {\mathbb{X}} \newcommand{\rmx} {\mathrm{x}} \newcommand{\rmX} {\mathrm{X}} \newcommand{\bx} {\mathbf{x}} \newcommand{\bX} {\mathbf{X}} \newcommand{\cX} {\mathcal{X}} \newcommand{\bcX} {\boldsymbol{\cX}} \newcommand{\tnsrX} {\underline{\bX}} \newcommand{\wtx} {\widetilde{x}} \newcommand{\wtX} {\widetilde{X}} \newcommand{\wtrmx} {\widetilde{\rmx}} \newcommand{\wtrmX} {\widetilde{\rmX}} \newcommand{\wtbx} {\widetilde{\bx}} \newcommand{\wtbX} {\widetilde{\bX}} \newcommand{\whx} {\widehat{x}} \newcommand{\whX} {\widehat{X}} \newcommand{\whrmx} {\widehat{\rmx}} \newcommand{\whrmX} {\widehat{\rmX}} \newcommand{\whbx} {\widehat{\bx}} \newcommand{\whbX} {\widehat{\bX}} \newcommand{\whcX} {\widehat{\cX}} \newcommand{\whtnsrX} {\widehat{\tnsrX}} \newcommand{\Y} {\mathbb{Y}} \newcommand{\rmy} {\mathrm{y}} \newcommand{\rmY} {\mathrm{Y}} \newcommand{\by} {\mathbf{y}} \newcommand{\bY} {\mathbf{Y}} \newcommand{\cY} {\mathcal{Y}} \newcommand{\bcY} {\boldsymbol{\cY}} \newcommand{\tnsrY} {\underline{\bY}} \newcommand{\wty} {\widetilde{y}} \newcommand{\wtY} {\widetilde{Y}} \newcommand{\wtrmy} {\widetilde{\rmy}} \newcommand{\wtrmY} {\widetilde{\rmY}} \newcommand{\wtby} {\widetilde{\by}} \newcommand{\wtbY} {\widetilde{\bY}} \newcommand{\why} {\widehat{y}} \newcommand{\whY} {\widehat{Y}} \newcommand{\whrmy} {\widehat{\rmy}} \newcommand{\whrmY} {\widehat{\rmY}} \newcommand{\whby} {\widehat{\by}} \newcommand{\whbY} {\widehat{\bY}} \newcommand{\whcY} {\widehat{\cY}} \newcommand{\whtnsrY} {\widehat{\tnsrY}} \newcommand{\Z} {\mathbb{Z}} \newcommand{\rmz} {\mathrm{z}} \newcommand{\rmZ} {\mathrm{Z}} \newcommand{\bz} {\mathbf{z}} \newcommand{\bZ} {\mathbf{Z}} \newcommand{\cZ} {\mathcal{Z}} \newcommand{\bcZ} {\boldsymbol{\cZ}} \newcommand{\tnsrZ} {\underline{\bZ}} \newcommand{\wtz} {\widetilde{z}} \newcommand{\wtZ} {\widetilde{Z}} \newcommand{\wtrmz} {\widetilde{\rmz}} \newcommand{\wtrmZ} {\widetilde{\rmZ}} \newcommand{\wtbz} {\widetilde{\bz}} \newcommand{\wtbZ} {\widetilde{\bZ}} \newcommand{\whz} {\widehat{z}} \newcommand{\whZ} {\widehat{Z}} \newcommand{\whrmz} {\widehat{\rmz}} \newcommand{\whrmZ} {\widehat{\rmZ}} \newcommand{\whbz} {\widehat{\bz}} \newcommand{\whbZ} {\widehat{\bZ}} \newcommand{\whcZ} {\widehat{\cZ}} \newcommand{\whtnsrZ} {\widehat{\tnsrZ}} \newcommand{\balpha} {\boldsymbol{\alpha}} \newcommand{\wtalpha} {\widetilde{\alpha}} \newcommand{\whalpha} {\widehat{\alpha}} \newcommand{\wtbalpha} {\widetilde{\balpha}} \newcommand{\whbalpha} {\widehat{\balpha}} \newcommand{\bbeta} {\boldsymbol{\beta}} \newcommand{\wtbeta} {\widetilde{\beta}} \newcommand{\whbeta} {\widehat{\beta}} \newcommand{\wtbbeta} {\widetilde{\bbeta}} \newcommand{\whbbeta} {\widehat{\bbeta}} \newcommand{\bgamma} {\boldsymbol{\gamma}} \newcommand{\wtgamma} {\widetilde{\gamma}} \newcommand{\whgamma} {\widehat{\gamma}} \newcommand{\wtbgamma} {\widetilde{\bgamma}} \newcommand{\whbgamma} {\widehat{\bgamma}} \newcommand{\bdelta} {\boldsymbol{\delta}} \newcommand{\wtdelta} {\widetilde{\delta}} \newcommand{\whdelta} {\widehat{\delta}} \newcommand{\wtbdelta} {\widetilde{\bdelta}} \newcommand{\whbdelta} {\widehat{\bdelta}} \newcommand{\bepsilon} {\boldsymbol{\epsilon}} \newcommand{\wtepsilon} {\widetilde{\epsilon}} \newcommand{\whepsilon} {\widehat{\epsilon}} \newcommand{\wtbepsilon}{\widetilde{\bepsilon}} \newcommand{\whbepsilon}{\widehat{\bepsilon}} \newcommand{\veps} {\varepsilon} \newcommand{\bveps} {\boldsymbol{\veps}} \newcommand{\wtveps} {\widetilde{\veps}} \newcommand{\whveps} {\widehat{\veps}} \newcommand{\wtbveps} {\widetilde{\bveps}} \newcommand{\whbveps} {\widehat{\bveps}} \newcommand{\bEta} {\boldsymbol{\eta}} \newcommand{\wteta} {\widetilde{\eta}} \newcommand{\wheta} {\widehat{\eta}} \newcommand{\wtbEta} {\widetilde{\bEta}} \newcommand{\whbEta} {\widehat{\bEta}} \newcommand{\btheta} {\boldsymbol{\theta}} \newcommand{\wttheta} {\widetilde{\theta}} \newcommand{\whtheta} {\widehat{\theta}} \newcommand{\wtbtheta} {\widetilde{\btheta}} \newcommand{\whbtheta} {\widehat{\btheta}} \newcommand{\bvtheta} {\boldsymbol{\vartheta}} \newcommand{\wtvtheta} {\widetilde{\vartheta}} \newcommand{\whvtheta} {\widehat{\vartheta}} \newcommand{\wtbvtheta} {\widetilde{\bvtheta}} \newcommand{\whbvtheta} {\widehat{\bvtheta}} \newcommand{\biota} {\boldsymbol{\iota}} \newcommand{\wtiota} {\widetilde{\iota}} \newcommand{\whiota} {\widehat{\iota}} \newcommand{\wtbiota} {\widetilde{\biota}} \newcommand{\whbiota} {\widehat{\biota}} \newcommand{\bkappa} {\boldsymbol{\kappa}} \newcommand{\wtkappa} {\widetilde{\kappa}} \newcommand{\whkappa} {\widehat{\kappa}} \newcommand{\wtbkappa} {\widetilde{\bkappa}} \newcommand{\whbkappa} {\widehat{\bkappa}} \newcommand{\blambda} {\boldsymbol{\lambda}} \newcommand{\wtlambda} {\widetilde{\lambda}} \newcommand{\whlambda} {\widehat{\lambda}} \newcommand{\wtblambda} {\widetilde{\blambda}} \newcommand{\whblambda} {\widehat{\blambda}} \newcommand{\bmu} {\boldsymbol{\mu}} \newcommand{\wtmu} {\widetilde{\mu}} \newcommand{\whmu} {\widehat{\mu}} \newcommand{\wtbmu} {\widetilde{\bmu}} \newcommand{\whbmu} {\widehat{\bmu}} \newcommand{\bnu} {\boldsymbol{\nu}} \newcommand{\wtnu} {\widetilde{\nu}} \newcommand{\whnu} {\widehat{\nu}} \newcommand{\wtbnu} {\widetilde{\bnu}} \newcommand{\whbnu} {\widehat{\bnu}} \newcommand{\bxi} {\boldsymbol{\xi}} \newcommand{\wtxi} {\widetilde{\xi}} \newcommand{\whxi} {\widehat{\xi}} \newcommand{\wtbxi} {\widetilde{\bxi}} \newcommand{\whbxi} {\widehat{\bxi}} \newcommand{\bpi} {\boldsymbol{\pi}} \newcommand{\wtpi} {\widetilde{\pi}} \newcommand{\whpi} {\widehat{\pi}} \newcommand{\wtbpi} {\widetilde{\bpi}} \newcommand{\whbpi} {\widehat{\bpi}} \newcommand{\bvpi} {\boldsymbol{\varpi}} \newcommand{\wtvpi} {\widetilde{\varpi}} \newcommand{\whvpi} {\widehat{\varpi}} \newcommand{\wtbvpi} {\widetilde{\bvpi}} \newcommand{\whbvpi} {\widehat{\bvpi}} \newcommand{\brho} {\boldsymbol{\rho}} \newcommand{\wtrho} {\widetilde{\rho}} \newcommand{\whrho} {\widehat{\rho}} \newcommand{\wtbrho} {\widetilde{\brho}} \newcommand{\whbrho} {\widehat{\brho}} \newcommand{\bvrho} {\boldsymbol{\varrho}} \newcommand{\wtvrho} {\widetilde{\varrho}} \newcommand{\whvrho} {\widehat{\varrho}} \newcommand{\wtbvrho} {\widetilde{\bvrho}} \newcommand{\whbvrho} {\widehat{\bvrho}} \newcommand{\bsigma} {\boldsymbol{\sigma}} \newcommand{\wtsigma} {\widetilde{\sigma}} \newcommand{\whsigma} {\widehat{\sigma}} \newcommand{\wtbsigma} {\widetilde{\bsigma}} \newcommand{\whbsigma} {\widehat{\bsigma}} \newcommand{\bvsigma} {\boldsymbol{\varsigma}} \newcommand{\wtvsigma} {\widetilde{\varsigma}} \newcommand{\whvsigma} {\widehat{\varsigma}} \newcommand{\wtbvsigma} {\widetilde{\bvsigma}} \newcommand{\whbvsigma} {\widehat{\bvsigma}} \newcommand{\btau} {\boldsymbol{\tau}} \newcommand{\wttau} {\widetilde{\tau}} \newcommand{\whtau} {\widehat{\tau}} \newcommand{\wtbtau} {\widetilde{\btau}} \newcommand{\whbtau} {\widehat{\btau}} \newcommand{\bupsilon} {\boldsymbol{\upsilon}} \newcommand{\wtupsilon} {\widetilde{\upsilon}} \newcommand{\whupsilon} {\widehat{\upsilon}} \newcommand{\wtbupsilon}{\widetilde{\bupsilon}} \newcommand{\whbupsilon}{\widehat{\bupsilon}} \newcommand{\bzeta} {\boldsymbol{\zeta}} \newcommand{\wtzeta} {\widetilde{\zeta}} \newcommand{\whzeta} {\widehat{\zeta}} \newcommand{\wtbzeta}{\widetilde{\bzeta}} \newcommand{\whbzeta}{\widehat{\bzeta}} \newcommand{\bphi} {\boldsymbol{\phi}} \newcommand{\wtphi} {\widetilde{\phi}} \newcommand{\whphi} {\widehat{\phi}} \newcommand{\wtbphi} {\widetilde{\bphi}} \newcommand{\whbphi} {\widehat{\bphi}} \newcommand{\bvphi} {\boldsymbol{\varphi}} \newcommand{\wtvphi} {\widetilde{\varphi}} \newcommand{\whvphi} {\widehat{\varphi}} \newcommand{\wtbvphi} {\widetilde{\bvphi}} \newcommand{\whbvphi} {\widehat{\bvphi}} \newcommand{\bchi} {\boldsymbol{\chi}} \newcommand{\wtchi} {\widetilde{\chi}} \newcommand{\whchi} {\widehat{\chi}} \newcommand{\wtbchi} {\widetilde{\bchi}} \newcommand{\whbchi} {\widehat{\bchi}} \newcommand{\bpsi} {\boldsymbol{\psi}} \newcommand{\wtpsi} {\widetilde{\psi}} \newcommand{\whpsi} {\widehat{\psi}} \newcommand{\wtbpsi} {\widetilde{\bpsi}} \newcommand{\whbpsi} {\widehat{\bpsi}} \newcommand{\bomega} {\boldsymbol{\omega}} \newcommand{\wtomega} {\widetilde{\omega}} \newcommand{\whomega} {\widehat{\omega}} \newcommand{\wtbomega} {\widetilde{\bomega}} \newcommand{\whbomega} {\widehat{\bomega}} \newcommand{\bGamma} {\boldsymbol{\Gamma}} \newcommand{\wtGamma} {\widetilde{\Gamma}} \newcommand{\whGamma} {\widehat{\Gamma}} \newcommand{\wtbGamma} {\widetilde{\bGamma}} \newcommand{\whbGamma} {\widehat{\bGamma}} \newcommand{\bDelta} {\boldsymbol{\Delta}} \newcommand{\wtDelta} {\widetilde{\Delta}} \newcommand{\whDelta} {\widehat{\Delta}} \newcommand{\wtbDelta} {\widetilde{\bDelta}} \newcommand{\whbDelta} {\widehat{\bDelta}} \newcommand{\bLambda} {\boldsymbol{\Lambda}} \newcommand{\wtLambda} {\widetilde{\Lambda}} \newcommand{\whLambda} {\widehat{\Lambda}} \newcommand{\wtbLambda} {\widetilde{\bLambda}} \newcommand{\whbLambda} {\widehat{\bLambda}} \newcommand{\bXi} {\boldsymbol{\Xi}} \newcommand{\wtXi} {\widetilde{\Xi}} \newcommand{\whXi} {\widehat{\Xi}} \newcommand{\wtbXi} {\widetilde{\bXi}} \newcommand{\whbXi} {\widehat{\bXi}} \newcommand{\bPi} {\boldsymbol{\Pi}} \newcommand{\wtPi} {\widetilde{\Pi}} \newcommand{\whPi} {\widehat{\Pi}} \newcommand{\wtbPi} {\widetilde{\bPi}} \newcommand{\whbPi} {\widehat{\bPi}} \newcommand{\bSigma} {\boldsymbol{\Sigma}} \newcommand{\wtSigma} {\widetilde{\Sigma}} \newcommand{\whSigma} {\widehat{\Sigma}} \newcommand{\wtbSigma} {\widetilde{\bSigma}} \newcommand{\whbSigma} {\widehat{\bSigma}} \newcommand{\bUpsilon} {\boldsymbol{\Upsilon}} \newcommand{\wtUpsilon} {\widetilde{\Upsilon}} \newcommand{\whUpsilon} {\widehat{\Upsilon}} \newcommand{\wtbUpsilon}{\widetilde{\bUpsilon}} \newcommand{\whbUpsilon}{\widehat{\bUpsilon}} \newcommand{\bPhi} {\boldsymbol{\Phi}} \newcommand{\wtPhi} {\widetilde{\Phi}} \newcommand{\whPhi} {\widehat{\Phi}} \newcommand{\wtbPhi} {\widetilde{\bPhi}} \newcommand{\whbPhi} {\widehat{\bPhi}} \newcommand{\bPsi} {\boldsymbol{\Psi}} \newcommand{\wtPsi} {\widetilde{\Psi}} \newcommand{\whPsi} {\widehat{\Psi}} \newcommand{\wtbPsi} {\widetilde{\bPsi}} \newcommand{\whbPsi} {\widehat{\bPsi}} \newcommand{\bOmega} {\boldsymbol{\Omega}} \newcommand{\wtOmega} {\widetilde{\Omega}} \newcommand{\whOmega} {\widehat{\Omega}} \newcommand{\wtbOmega} {\widetilde{\bOmega}} \newcommand{\whbOmega} {\widehat{\bOmega}} \newcommand{\bzero} {\mathbf{0}} \newcommand{\bone} {\mathbf{1}} \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} \newcommand{\abs}[1]{\left\lvert#1\right\rvert} \newcommand{\ceil}[1]{\left \lceil #1 \right\rceil} \newcommand{\argmin}{\operatorname*{arg\,min}} \newcommand{\argmax}{\operatorname*{arg\,max}} \)

We introduced the basics of the stochastic gradient descent (SGD) algorithm and its implementation on Python with the MNIST dataset as a test case in our article Mastering Stochastic Gradient Descent algorithm with the MNIST Dataset. We can see that stochastic gradient descent is one of the most powerful algorithms used in the world of machine learning.

We prefer the use of the SGD algorithm over Gradient Descent (GD) solely for one main purpose the data is too large to handle for our machine, so instead of processing the entire batch of samples for learning, we process sample by sample to update our algorithm.

A key thing to remember: We use SGD/GD algorithms in machine learning to minimize the loss function associated with our machine learning problem (the problem can be of classification or regression). The loss function $f(\cdot)$, however, can have certain attributes like is it a convex, non-convex, smooth, non-smooth etc. The nature of this function $f(\cdot)$ governs the performance of SGD algorithm and effect on rate of convergence.

In this article, we will investigate the proof of convergence when our loss function $f(\cdot)$ is L-smooth and non-convex.

Suggested Read: Convergence analysis of Stochastic Gradient Descent (SGD) when a function is convex

Preliminaries

Let’s quickly recap a few things before jumping onto the convergence analysis. The stochastic gradient descent algorithm is given as:


Algorithm - Stochastic Gradient Descent (SGD)

Input: A random vector $\bx_0 \in \R^n$ and a decaying step size $\gamma_t > 0$; 1. for $t=1,2,\ldots,$ do 2. $\qquad$ Evaluate the stochastic gradient mapping at $\bx_t$, i.e., evaluate $\nabla F(\bx_t, \xi_t)$; 3. $\qquad$ Do the following update: $\qquad$ $$ \bx_{t+1} := \cP_{\cX} \left(\bx_t - \gamma_t \nabla F(\bx_t, \xi_t) \right)$$ 4. end for

In the above algorithm, $\bx_t$ is the update of the objective variable $\bx$ at time $t$, $\nabla f(\bx_t)$ is the gradient of the loss function $f$, and $\cP_{\cX}$ is the projection operator. We use the projection to ensure that the update of our algorithm doesn’t cause the objective variable $\bx$ to lie outside the feasible set, and if it does, the operator projects the update $\bx_{t+1}$ back to the feasible set. However, this article will use the entire $\R^n$ as our feasible set, so we skip using the projection operator in our analysis.

The Stochastic Optimization Problem

The fundamental idea of the stochastic gradient descent algorithm is to find the optimal solution of the following stochastic optimization problem:

$$
\begin{equation*}
\begin{aligned}
\text{minimize} & \quad f(\bx) {\triangleq \mathbb{E} [F(\bx,\bxi)]} \\
\text{subject to:} & \quad \bx \in X.
\end{aligned}
\end{equation*}
$$

Where $f : \R^n \to \R$ is an unknown deterministic function, $F: \R^n \times \Omega \to \R$ is a known stochastic function, $\bxi: \Omega \to \R^d$ is a random variable (this is a random data sample), and $\mathbb{E}[\cdot]$ denotes the expectation operator taking an expectation due to the random variable $\bxi$.

Lipschitz smoothness: By definition, a differentiable function $f : X \to \R$ over the set $X \subseteq R^n$ is called L-smooth if the following property is satisfied, where $L>0$ is a scalar:

$$\norm{\nabla f(\bx)-\nabla f(\by)} \leq L \norm{\bx-\by} \quad \hbox{for all }\bx,\by \in X.$$

Descent Lemma: Let $f : \R^n \to R$ be $L$-smooth over the set $X \subseteq R^n$. Then, for any $\bx,\by \in X$ we have

$$ f(\bx) \leq f(\by) + \nabla f(\by)^T(\bx-\by) +\frac{L}{2} \norm{\bx-\by}^2.$$

Concept of Filtration:

In algorithms like stochastic gradient descent, we process sample by sample. Hence, they arrive randomly, and in convergence analysis, we need to keep track of the history of these random samples. This is denoted by a set known as Filtration, which contains the random samples till iteration $t$. Mathematically, it is defined as:

$$ \cF_t \triangleq \{ \bx_0,\bxi_0,\bxi_1,\ldots,\bxi_{t-1} \} \qquad \hbox{for all } t \geq 1,$$

and $\cF_0 \triangleq {x_0}$. Since whenever randomness is involved, we need expectations to remove that. So, we define the conditional expectation given the filtration as $\E[\bullet\mid \cF_t]$.

The main idea of using conditional expectation is that in our Filtration, if we know $\bxi_{t-1}$, then we know what $\bx_t$ i.e., by knowing the randomness of $\bxi_{t-1}$ we can find the $\bx_t$ by using the update rule of the algorithm (Step 3). This is how filtration and conditional expectation is useful in removing the randomness, and we can characterize the next iterate $\bx_{t+1}$.

Apart from Filtration, we need the following two assumptions to prove the convergence rates for the SGD algorithm:

Assumptions:

  1. $\E [ \nabla F(\bx,\bxi) \mid \bx] = \nabla f (\bx) $ for all $\bx \in X$.
  2. $\E[\norm{\nabla F(\bx,\bxi)-\nabla f (\bx)}^2 \mid \bx] \leq \sigma^2 $ for all $\bx \in X$ for some $\sigma>0$.

This assumption $1$ implies that the stochastic gradient is an unbiased estimator of the true gradient of the objective function, and assumption $2$ implies that the stochastic gradient has a bounded variance. These two assumptions are widely used in the convergence analysis of algorithms that resemble updates like stochastic gradient descent.

Main Theorem

Consider the stochastic optimization problem and an SGD Algorithm defined above. Let us define $f^*\triangleq \min_{\bx \in \R^n}f(\bx)$. Suppose $f^* >-\infty$. Let $X = \R^n$. Assume that $f$ is $L$-smooth (and non-convex). Then, the following results hold.

i) Let $\gamma_t \equiv \gamma : = \frac{1}{\sqrt{T}}$. Then, we have:

$$\min_{t \in {0,\ldots,T-1}}\mathbb{E}\left[\norm{\nabla f(x_t)}^2\right] \leq \frac{2\left(\mathbb{E}\left[f(x_0)\right]- f^*\right)+L\sigma^2}{\sqrt{T}} \qquad \hbox{for all }T \geq L^2.$$

ii) Let $\gamma_t$ be such that $\sum_{t=0}^\infty \gamma_t = \infty$ and $\sum_{t=0}^\infty \gamma_t^2 < \infty$. Then, we have:

$$ \lim_{t \to \infty} \E \left[\norm{\nabla f(\bx_t)}^2 \right] = 0.$$

Proof:

We will start with the descent lemma

$$
\begin{align*}
f(\bx_{t+1}) \leq f(\bx_t) + \nabla f(\bx_t)^T(\bx_{t+1}-\bx_t)+ \frac{L}{2}\norm{\bx_{t+1}-\bx_t}^2.
\end{align*}
$$

Note that since $X=\R^n$, we have $\bx_{t+1} := \bx_t -\gamma_t \nabla F(\bx_t,\bxi_t)$. From the update rule of the algorithm, we obtain:

$$\begin{align*}
f(\bx_{t+1}) \leq f(\bx_t) -\gamma_t \nabla f(\bx_t)^T(\nabla F(\bx_t,\bxi_t)) + \frac{\gamma_t^2L}{2}\norm{\nabla F(\bx_t,\bxi_t)}^2.
\end{align*}$$

We now define stochastic error as $\bw_t \triangleq \nabla F(\bx_t,\bxi_t)- \nabla f (\bx_t)$ for all $t \geq 0$. Invoking this by replacing $\nabla F(\bx_t,\bxi_t)$ for $\nabla f(\bx_t)+\bw_t$ we obtain

$$
\begin{align*}
f(\bx_{t+1}) & \leq f(\bx_t) -\gamma_t \nabla f(\bx_t)^T(\nabla f(\bx_t)+\bw_t) \frac{\gamma_t^2L}{2} \norm{\nabla f (\bx_t)+\bw_t}^2 \\
&= f(\bx_t) -\gamma_t \norm{\nabla f(\bx_t)}^2 – \gamma_t \nabla f(\bx_t)^T \bw_t + \frac{L \gamma_t^2}{2} \left(\norm{\nabla f(\bx_t)}^2+ \norm{\bw_t}^2 + 2 \nabla f(\bx_t)^T \bw_t \right).
\end{align*}$$

Taking conditional expectations on both sides, we obtain

$$\begin{align*}
\E [f(\bx_{t+1}) \mid \cF_t] &\leq \E [f(\bx_t) -\gamma_t \norm{\nabla f(\bx_t)}^2 – \gamma_t \nabla f(\bx_t)^T \bw_t + \frac{L \gamma_t^2}{2}\left(\norm{\nabla f(\bx_t)}^2+\norm{\bw_t}^2 + 2 \nabla f(\bx_t)^T \bw_t \right) \mid \cF_t] \\
&= f(\bx_t) – \gamma_t \norm{\nabla f(\bx_t)}^2 – \gamma_t \nabla f(\bx_t)^T \E[\bw_t \mid \cF_t]+ \frac{L \gamma_t^2}{2} \left(\norm{\nabla f(\bx_t)}^2+\E[\norm{\bw_t}^2 \mid \cF_t] + 2 \E[ \nabla f(\bx_k)^T \bw_t \mid \cF_t] \right).
\end{align*}$$

We now define the following lemma:

Lemma 1: Consider the definition of stochastic error $\bw_t$ and under the defined assumption 1 we have for all $t\geq 0$:

$$\begin{align*}
&\E[\bw_t \mid \cF_t] = \mathbf{0}_n \\
&\E[\norm{\bw_t}^2 \mid \cF_t] \leq \sigma^2.
\end{align*}$$

(Continuing the proof), Now using the above Lemma 1, we obtain

$$\begin{align*}
\E [f(\bx_{t+1}) \mid \cF_t] &\leq f(\bx_t) – \gamma_t \norm{\nabla f(\bx_t)}^2 + \frac{L \gamma_t^2}{2} \left(\norm{\nabla f(\bx_t)}^2 + \sigma^2 \right).
\end{align*}$$

Assuming $\gamma_t \leq \frac{1}{L}$, we obtain

$$\begin{align*}
\E[f(\bx_{t+1}) \mid \cF_t] &\leq f(\bx_t) – \gamma_t \norm{\nabla f(\bx_t)}^2 + \frac{\gamma_t}{2} \left(\norm{\nabla f(\bx_t)}^2 \right) + \frac{L\gamma_t^2}{2} \left( \sigma^2\right) \\
&\leq f(\bx_t) – \frac{ \gamma_t}{2} \left(\norm{\nabla f(\bx_t)}^2 \right) + \frac{L \gamma_t^2 \sigma^2}{2}.
\end{align*}$$

Taking expectation with respect to $\cF_t$ from both sides and invoking the tower property of conditional expectation, we obtain

$$\begin{align*}
\E [ \E[f(\bx_{t+1}) \mid \cF_t]] &\leq \E[f(\bx_t)] -\frac{\gamma_t}{2} \E[\norm{\nabla f(\bx_t)}^2] + \frac{L \gamma_t^2 \sigma^2}{2}.
\end{align*}$$

Thus, we have:

$$ \boxed{ \E [f(\bx_{t+1})] \leq \E[f(\bx_t)] – \frac{\gamma_t}{2} \E[\norm{\nabla f(\bx_t)}^2] + \frac{L \gamma_t^2 \sigma^2}{2}.} \qquad \qquad \qquad (1)$$

Under a constant step-size $\gamma_t \equiv \gamma$ we obtain

$$\E [\norm{\nabla f(\bx_t)}^2] \leq \frac{2}{\gamma} \left( \E[f(\bx_t)] – \E[f(\bx_{t+1})] \right)+ L\gamma \sigma^2 . $$

Summing both sides over $t=0,\ldots, T-1$ where $T \geq 1$, we obtain

$$\sum_{t=0}^{T-1}\E[\norm{\nabla f(\bx_t)}^2] \leq \frac{2}{\gamma} \left( \E[f(\bx_0)] -\E[f(\bx_{T})]
\right)+ T L \gamma \sigma^2.$$

From $f^* \leq f(x_{T})$ for any realization of $\bx_T$, we have:

$$ \min_{t \in {0, \ldots, T-1}} \E[\norm{\nabla f(x_t)}^2] \leq \frac{2 \left( f(\bx_0) – f^* \right)} {T \gamma } + L \gamma \sigma^2.$$

The bound in the given theorem is obtained by substituting $\gamma := \frac{1}{\sqrt{T}}$ above.

This concludes the proof of convergence of the stochastic gradient descent algorithm for L-smooth and non-convex functions.