[
  {
    "path": ".gitignore",
    "content": "## Core latex/pdflatex auxiliary files:\n*.aux\n*.lof\n*.log\n*.lot\n*.fls\n*.out\n*.toc\n*.fmt\n*.fot\n*.cb\n*.cb2\n.*.lb\n\n## Intermediate documents:\n*.dvi\n*.xdv\n*-converted-to.*\n# these rules might exclude image files for figures etc.\n# *.ps\n# *.eps\n*.pdf\n\n## Generated if empty string is given at \"Please type another file name for output:\"\n.pdf\n\n## Bibliography auxiliary files (bibtex/biblatex/biber):\n*.bbl\n*.bcf\n*.blg\n*-blx.aux\n*-blx.bib\n*.run.xml\n\n## Build tool auxiliary files:\n*.fdb_latexmk\n*.synctex\n*.synctex(busy)\n*.synctex.gz\n*.synctex.gz(busy)\n*.pdfsync\n\n## Build tool directories for auxiliary files\n# latexrun\nlatex.out/\n\n## Auxiliary and intermediate files from other packages:\n# algorithms\n*.alg\n*.loa\n\n# achemso\nacs-*.bib\n\n# amsthm\n*.thm\n\n# beamer\n*.nav\n*.pre\n*.snm\n*.vrb\n\n# changes\n*.soc\n\n# comment\n*.cut\n\n# cprotect\n*.cpt\n\n# elsarticle (documentclass of Elsevier journals)\n*.spl\n\n# endnotes\n*.ent\n\n# fixme\n*.lox\n\n# feynmf/feynmp\n*.mf\n*.mp\n*.t[1-9]\n*.t[1-9][0-9]\n*.tfm\n\n#(r)(e)ledmac/(r)(e)ledpar\n*.end\n*.?end\n*.[1-9]\n*.[1-9][0-9]\n*.[1-9][0-9][0-9]\n*.[1-9]R\n*.[1-9][0-9]R\n*.[1-9][0-9][0-9]R\n*.eledsec[1-9]\n*.eledsec[1-9]R\n*.eledsec[1-9][0-9]\n*.eledsec[1-9][0-9]R\n*.eledsec[1-9][0-9][0-9]\n*.eledsec[1-9][0-9][0-9]R\n\n# glossaries\n*.acn\n*.acr\n*.glg\n*.glo\n*.gls\n*.glsdefs\n*.lzo\n*.lzs\n\n# uncomment this for glossaries-extra (will ignore makeindex's style files!)\n# *.ist\n\n# gnuplottex\n*-gnuplottex-*\n\n# gregoriotex\n*.gaux\n*.gtex\n\n# htlatex\n*.4ct\n*.4tc\n*.idv\n*.lg\n*.trc\n*.xref\n\n# hyperref\n*.brf\n\n# knitr\n*-concordance.tex\n# TODO Comment the next line if you want to keep your tikz graphics files\n*.tikz\n*-tikzDictionary\n\n# listings\n*.lol\n\n# luatexja-ruby\n*.ltjruby\n\n# makeidx\n*.idx\n*.ilg\n*.ind\n\n# minitoc\n*.maf\n*.mlf\n*.mlt\n*.mtc[0-9]*\n*.slf[0-9]*\n*.slt[0-9]*\n*.stc[0-9]*\n\n# minted\n_minted*\n*.pyg\n\n# 
morewrites\n*.mw\n\n# nomencl\n*.nlg\n*.nlo\n*.nls\n\n# pax\n*.pax\n\n# pdfpcnotes\n*.pdfpc\n\n# sagetex\n*.sagetex.sage\n*.sagetex.py\n*.sagetex.scmd\n\n# scrwfile\n*.wrt\n\n# sympy\n*.sout\n*.sympy\nsympy-plots-for-*.tex/\n\n# pdfcomment\n*.upa\n*.upb\n\n# pythontex\n*.pytxcode\npythontex-files-*/\n\n# tcolorbox\n*.listing\n\n# thmtools\n*.loe\n\n# TikZ & PGF\n*.dpth\n*.md5\n*.auxlock\n\n# todonotes\n*.tdo\n\n# vhistory\n*.hst\n*.ver\n\n# easy-todo\n*.lod\n\n# xcolor\n*.xcp\n\n# xmpincl\n*.xmpi\n\n# xindy\n*.xdy\n\n# xypic precompiled matrices and outlines\n*.xyc\n*.xyd\n\n# endfloat\n*.ttt\n*.fff\n\n# Latexian\nTSWLatexianTemp*\n\n## Editors:\n# WinEdt\n*.bak\n*.sav\n\n# Texpad\n.texpadtmp\n\n# LyX\n*.lyx~\n\n# Kile\n*.backup\n\n# gummi\n.*.swp\n\n# KBibTeX\n*~[0-9]*\n\n# TeXnicCenter\n*.tps\n\n# auto folder when using emacs and auctex\n./auto/*\n*.el\n\n# expex forward references with \\gathertags\n*-tags.tex\n\n# standalone packages\n*.sta\n\n# Makeindex log files\n*.lpz\n\n# MacOS filesystem metadata\n*.DS_Store\n"
  },
  {
    "path": "Templates/macros.tex",
    "content": "\\usepackage{color}\n\\usepackage{lipsum}\n\n\n\n\\ifnum\\lectureformat=1\n\\newcommand{\\metadata}[3]\n{\n\t\\newpage\n\t\n\t\\def\\lectureID{#1}\n\t\n\t\\setcounter{chapter}{\\lectureID}\n\n\t\\draftnotice\n\t\n\t\\begin{center}\n\t\t\\bf\\large CS229M/STATS214: Machine Learning Theory\n\t\\end{center}\n\t\n\t\\noindent\n\tLecturer: Tengyu Ma   %%% FILL IN LECTURER (if not RS)\n\t\\hfill\n\tLecture \\# \\lectureID              %%% FILL IN LECTURE NUMBER HERE\n\t\\\\\n\tScribe: #2                  %%% FILL IN YOUR NAME HERE\n\t\\hfill\n\t#3           %%% FILL IN LECTURE DATE HERE\n\t\n\t\\noindent\n\t\\rule{\\textwidth}{1pt}\n\t\n\t\\medskip\n}\n\\else \n\\newcommand{\\metadata}[3]{}\n\\fi\n\n\\DeclareMathOperator*{\\Exp}{\\mathbb{E}}\n\\DeclareMathOperator*{\\argmin}{\\textup{argmin}}\n\\DeclareMathOperator*{\\argmax}{\\textup{argmax}}\n\\newcommand{\\E}{\\mathbb{E}}\n\n\\newcommand{\\err}{\\ell_{\\textup{0-1}}}\n\\newcommand{\\thetaerm}{\\theta_{\\textup{ERM}}}\n\\newcommand{\\hatL}{\\widehat{L}}\n\\newcommand{\\tilO}{\\widetilde{O}}\n\\newcommand{\\iid}{\\overset{\\textup{iid}}{\\sim}}\n\n\\newcommand{\\norm}[1]{\\|#1\\|}\n\\newcommand{\\Norm}[1]{\\left\\|#1\\right\\|}\n\n\n\\newcommand{\\al}[1]{\n\t\\begin{align}\n\t#1\n\t\\end{align}\n}\n\n\n\\renewcommand{\\sp}[1]{^{(#1)}}\n\n\\newcommand{\\cA}{\\mathcal A}\n\\newcommand{\\cB}{\\mathcal B}\n\\newcommand{\\cC}{\\mathcal C}\n\\newcommand{\\cD}{\\mathcal D}\n\\newcommand{\\cE}{\\mathcal E}\n\\newcommand{\\cF}{\\mathcal F}\n\\newcommand{\\cG}{\\mathcal G}\n\\newcommand{\\cH}{\\mathcal H}\n\\newcommand{\\cI}{\\mathcal I}\n\\newcommand{\\cJ}{\\mathcal J}\n\\newcommand{\\cK}{\\mathcal K}\n\\newcommand{\\cL}{\\mathcal L}\n\\newcommand{\\cM}{\\mathcal M}\n\\newcommand{\\cN}{\\mathcal N}\n\\newcommand{\\cO}{\\mathcal O}\n\\newcommand{\\cP}{\\mathcal P}\n\\newcommand{\\cQ}{\\mathcal Q}\n\\newcommand{\\cR}{\\mathcal R}\n\\newcommand{\\cS}{\\mathcal S}\n\\newcommand{\\cT}{\\mathcal 
T}\n\\newcommand{\\cU}{\\mathcal U}\n\\newcommand{\\cV}{\\mathcal V}\n\\newcommand{\\cW}{\\mathcal W}\n\\newcommand{\\cX}{\\mathcal X}\n\\newcommand{\\cY}{\\mathcal Y}\n\\newcommand{\\cZ}{\\mathcal Z}\n\n\\newcommand{\\bbB}{\\mathbb B}\n\\newcommand{\\bbS}{\\mathbb S}\n\\newcommand{\\bbR}{\\mathbb R}\n\\newcommand{\\bbZ}{\\mathbb Z}\n\\newcommand{\\bbI}{\\mathbb I}\n\\newcommand{\\bbQ}{\\mathbb Q}\n\\newcommand{\\bbP}{\\mathbb P}\n\\newcommand{\\bbE}{\\mathbb E}\n\\newcommand{\\bbN}{\\mathbb N}\n\n\\newcommand{\\R}{\\bbR}"
  },
  {
    "path": "Templates/master.tex",
    "content": "%% filename: amsbook-template.tex\n%% version: 1.1\n%% date: 2014/07/24\n%%\n%% American Mathematical Society\n%% Technical Support\n%% Publications Technical Group\n%% 201 Charles Street\n%% Providence, RI 02904\n%% USA\n%% tel: (401) 455-4080\n%%      (800) 321-4267 (USA and Canada only)\n%% fax: (401) 331-3842\n%% email: tech-support@ams.org\n%% \n%% Copyright 2006, 2008-2010, 2014 American Mathematical Society.\n%% \n%% This work may be distributed and/or modified under the\n%% conditions of the LaTeX Project Public License, either version 1.3c\n%% of this license or (at your option) any later version.\n%% The latest version of this license is in\n%%   http://www.latex-project.org/lppl.txt\n%% and version 1.3c or later is part of all distributions of LaTeX\n%% version 2005/12/01 or later.\n%% \n%% This work has the LPPL maintenance status `maintained'.\n%% \n%% The Current Maintainer of this work is the American Mathematical\n%% Society.\n%%\n%% ====================================================================\n\n%    AMS-LaTeX v.2 driver file template for use with amsbook\n%\n%    Remove any commented or uncommented macros you do not use.\n\n\\documentclass{book}\n\\usepackage{amsfonts,bm, amsthm, amsmath}\n\n\n\\newtheorem{theorem}{Theorem}[chapter]\n\\newtheorem{lemma}[theorem]{Lemma}\n\n\\theoremstyle{definition}\n\\newtheorem{definition}[theorem]{Definition}\n\\newtheorem{example}[theorem]{Example}\n\\newtheorem{xca}[theorem]{Exercise}\n\n\\theoremstyle{remark}\n\\newtheorem{remark}[theorem]{Remark}\n\n\\numberwithin{section}{chapter}\n\\numberwithin{equation}{chapter}\n\n%    For a single index; for multiple indexes, see the manual\n%    \"Instructions for preparation of papers and monographs:\n%    AMS-LaTeX\" (instr-l.pdf in the AMS-LaTeX distribution).\n\\makeindex\n\\def\\lectureformat{0}\n\\input{macros}\n\\begin{document}\n\n\\frontmatter\n\n\\title{Lecture Notes for Machine Learning Theory (CS229M/STATS214)}\n\n%    Remove any 
unused author tags.\n\n%    author one information\n\\author{Instructor: Tengyu Ma}\n%\\address{}\n%\\curraddr{}\n%\\email{}\n\\thanks{}\n\n%    author two information\n%\\author{}\n%\\address{}\n%\\curraddr{}\n%\\email{}\n%\\thanks{}\n\n%\\subjclass[2010]{Primary }\n\n%\\keywords{}\n\n%\\date{}\n\n%\\begin{abstract}\n%\\end{abstract}\n\n\\maketitle\n\n%    Dedication.  If the dedication is longer than a line or two,\n%    remove the centering instructions and the line break.\n%\\cleardoublepage\n%\\thispagestyle{empty}\n%\\vspace*{13.5pc}\n%\\begin{center}\n%  Dedication text (use \\\\[2pt] for line break if necessary)\n%\\end{center}\n%\\cleardoublepage\n\n%    Change page number to 6 if a dedication is present.\n\\setcounter{page}{4}\n\n\\tableofcontents\n\n%    Include unnumbered chapters (preface, acknowledgments, etc.) here.\n%\\include{}\n\\mainmatter\n\\let\\sec\\section\n\\let\\subsec\\subsection\n\n\\chapter{Generalization Bounds with Uniform Convergence}\n%\\section{}\n\\input{yoursunetID}\n\\input{yoursunetID2}\n\n%    Include main chapters here.\n%\\include{}\n\\appendix\n%    Include appendix \"chapters\" here.\n\n\n\\backmatter\n%    Bibliography styles amsplain or harvard are also acceptable.\n\\bibliographystyle{amsalpha}\n\\bibliography{}\n%    See note above about multiple indexes.\n%\\printindex\n\n\\end{document}\n\n%-----------------------------------------------------------------------\n% End of amsbook-template.tex\n%-----------------------------------------------------------------------\n"
  },
  {
    "path": "Templates/template.tex",
    "content": "\t\\documentclass[11pt]{book}\n\t\n\t\\usepackage{amsfonts,amsthm, bm,amsmath, bbm,amssymb,mathtools}\n\t\\usepackage{fullpage}\n\t\n\t\n\t\\newtheorem{theorem}{Theorem}[chapter]\n\t\\newtheorem{lemma}[theorem]{Lemma}\n\t\n\t\\theoremstyle{definition}\n\t\\newtheorem{definition}[theorem]{Definition}\n\t\\newtheorem{example}[theorem]{Example}\n\t\\newtheorem{xca}[theorem]{Exercise}\n\t\\newtheorem{corollary}[theorem]{Corollary}  % added for Lecture 5\n\t\\newtheorem{proposition}{Proposition}[section]  % added for Lecture 6\n\t\n\t\\theoremstyle{remark}\n\t\\newtheorem{remark}[theorem]{Remark}\n\t\n\t\\numberwithin{section}{chapter}\n\t\\numberwithin{equation}{chapter}\n\t\n\t\\makeindex\n\t\n\t\\def\\lectureformat{1}\n\t\\input{macros}\n\t\\begin{document}\n\t\n\t\\frontmatter\n\t\n\t\\mainmatter\n\t\\let\\sec\\section\n\t\\let\\subsec\\subsection\n\t\n\t\\newcommand{\\secwarning}[1]{\n\t\t{\t\n\t\t\t\\color{red}\n\t\t\t$\\backslash$section and $\\backslash$subsection are disallowed, please use \t$\\backslash$sec and $\\backslash$subsec instead\n\t\t}\n\t}\n\t\\let\\section\\secwarning\n\t\\let\\subsection\\secwarning\n\t\n\t\n\t\\newcommand{\\draftnotice}{\\vbox to 0.25in{\\noindent\n\t\t\t\\raisebox{0.6in}[0in][0in]{\\makebox[\\textwidth][r]{\\it\n\t\t\t\t\tDRAFT --- a final version will be posted shortly}}}\n\t\t\\vspace{-.25in}\\vspace{-\\baselineskip}\n\t}\n\t\n\t%\\section{}\n\t\\input{yoursunetID}\n\t\n\t\\input{yoursunetID2}\n\t\n\t%    Include main chapters here.\n\t%\\include{}\n\t\\appendix\n\t%    Include appendix \"chapters\" here.\n\t\n\t\n\t\\backmatter\n\t%    Bibliography styles amsplain or harvard are also acceptable.\n\t\\bibliographystyle{amsalpha}\n\t\\bibliography{}\n\t%    See note above about multiple indexes.\n%\t\\printindex\n\t\n\t\\end{document}\n\t\n\t%-----------------------------------------------------------------------\n\t% End of 
amsbook-template.tex\n\t%-----------------------------------------------------------------------\n"
  },
  {
    "path": "Templates/yoursunetID.tex",
    "content": "%\\newcommand{\\Exp}{\\mathbb{E}}\n\n% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{1}{Alice and Bob}{Jan 1st, 2021}\n\n\\sec{Review and Overview}\n\n\\begin{enumerate}\n\t\\item If appropriate, one paragraph to briefly review the connection to previous lectures.\n\t\\item An overview paragraph that summarizes the main idea of the lecture at a high-level. \n\\end{enumerate}  \n\\sec{Macros for frequently used notations}\nPlease try to reuse the macros defined below to ensure consistency. {\\color{blue}We encourage you to use macros frequently which could save a lot of time typing the equations and also help address notation inconsistency. }\n\\begin{itemize}\n\t\\item $\\Exp$, \n\t\\al{\n\t\t\\E_{x\\sim P}, \\Exp_{x\\sim P} \n\t}\n\t\\item $\\Pr[X=1\\vert Y=2]$\n\t\\item \n\t\\al{\n\t\t\\argmin_{x: x\\ge 1}\n\t}\n\t\\item \n\t$\\theta$, $\\theta^\\star$, $\\thetaerm$, \n\t\\item \n\t$\\cX,\\cY, \\cH, \\cF$\n\t\\item $x\\sp{1}, y\\sp{k}$\n\t\\item \n\t$x\\in \\R^3, \\bbZ$\n\t\\item $\\err(\\theta)$\n\t\\item $O(\\cdot)$, $\\tilO(\\cdot)$\n\t\\item $\\iid$\n\t\\item $\\norm{x}, \\Norm{x^{2^3}}$, $\\norm{x}_{2}$\n\t\\item $x^\\top$ \n\\end{itemize}\n\\begin{theorem}\n\t..\n\\end{theorem}\n\\begin{lemma}\n\t...\n\\end{lemma}\n\n\n\t\n\\lipsum\n%\\subsection{}"
  },
  {
    "path": "Templates/yoursunetID2.tex",
    "content": "%\\newcommand{\\Exp}{\\mathbb{E}}\n\n% reset section counter\n\\setcounter{section}{0}\n\n\\metadata{2}{Mary and Alex}{Jan 3rd, 2021}\n\n\\sec{Review and Overview}\n\n\\begin{enumerate}\n\t\\item If appropriate, one paragraph to briefly review the connection to previous lectures.\n\t\\item An overview paragraph that summarizes the main idea of the lecture at a high-level. \n\\end{enumerate}  \n\\sec{Macros for frequently used notations}\nPlease try to reuse the macros defined below to ensure consistency.\n\\begin{itemize}\n\t\\item $\\Exp$, \n\t\\al{\n\t\t\\E_{x\\sim P}, \\Exp_{x\\sim P} \n\t}\n\t\\item $\\Pr[X=1\\vert Y=2]$\n\t\\item \n\t\\al{\n\t\t\\argmin_{x: x\\ge 1}\n\t}\n\t\\item \n\t$\\theta$, $\\theta^\\star$, $\\thetaerm$, \n\t\\item \n\t$\\cX,\\cY, \\cH, \\cF$\n\t\\item $x\\sp{1}, y\\sp{k}$\n\t\\item \n\t$x\\in \\R^3, \\bbZ$\n\t\\item $\\err(\\theta)$\n\t\\item $O(\\cdot)$, $\\tilO(\\cdot)$\n\t\\item $\\iid$\n\t\\item $\\norm{x}, \\Norm{x^{2^3}}$, $\\norm{x}_{2}$\n\t\\item \n\\end{itemize}\n\\begin{theorem}\n\t..\n\\end{theorem}\n\\begin{lemma}\n\t...\n\\end{lemma}\n\n\n\t\n\\lipsum\n%\\subsection{}"
  },
  {
    "path": "tex/all.bib",
    "content": "@inproceedings{chung2007four,\n\ttitle={Four proofs for the Cheeger inequality and graph partition algorithms},\n\tauthor={Chung, Fan},\n\tbooktitle={Proceedings of ICCM},\n\tvolume={2},\n\tpages={378},\n\tyear={2007},\n\torganization={Citeseer}\n}\n@article{arora2009expander,\n  title={Expander flows, geometric embeddings and graph partitioning},\n  author={Arora, Sanjeev and Rao, Satish and Vazirani, Umesh},\n  journal={Journal of the ACM (JACM)},\n  volume={56},\n  number={2},\n  pages={1--37},\n  year={2009},\n  publisher={ACM New York, NY, USA}\n}\n\n@article{aarons2017puns,\n\ttitle        = {Puns and Tacit Linguistic Knowledge},\n\tauthor       = {Debra Aarons},\n\tyear         = 2017,\n\tjournal      = {The Routledge Handbook of Language and Humor, Routledge, New York, NY, Routledge Handbooks in Linguistics}\n}\n@article{aaronson2006lower,\n\ttitle        = {Lower bounds for local search by quantum arguments},\n\tauthor       = {Aaronson, Scott},\n\tyear         = 2006,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 35,\n\tnumber       = 4,\n\tpages        = {804--824}\n}\n@inproceedings{aaronson2008complexity,\n\ttitle        = {The Complexity Zoo},\n\tauthor       = {S. Aaronson and Chris Bourke},\n\tyear         = 2008\n}\n@article{abadi2015tensorflow,\n\ttitle        = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems},\n\tauthor       = {Martín Abadi and Ashish Agarwal and Paul Barham and Eugene Brevdo and Zhifeng Chen and Craig Citro and Gregory S. Corrado and Andy Davis and Jeffrey Dean and Matthieu Devin and Sanjay Ghemawat and Ian J. 
Goodfellow and Andrew Harp and Geoffrey Irving and Michael Isard and Yangqing Jia and Rafal Józefowicz and Lukasz Kaiser and Manjunath Kudlur and Josh Levenberg and Dan Mané and Rajat Monga and Sherry Moore and Derek Gordon Murray and Chris Olah and Mike Schuster and Jonathon Shlens and Benoit Steiner and Ilya Sutskever and Kunal Talwar and Paul A. Tucker and Vincent Vanhoucke and Vijay Vasudevan and Fernanda B. Viégas and Oriol Vinyals and Pete Warden and Martin Wattenberg and Martin Wicke and Yuan Yu and Xiaoqiang Zheng},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1603.04467}\n}\n@inproceedings{abadi2016tensorflow,\n\ttitle        = {TensorFlow: A system for large-scale machine learning},\n\tauthor       = {Martin Abadi and Paul Barham and Jianmin Chen and Zhifeng Chen and Andy Davis and Jeffrey Dean and Matthieu Devin and Sanjay Ghemawat and Geoffrey Irving and Michael Isard and others},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI). 
Savannah, Georgia, USA}\n}\n@inproceedings{abbasi2011improved,\n\ttitle        = {Improved algorithms for linear stochastic bandits},\n\tauthor       = {Abbasi-Yadkori, Yasin and P{\\'a}l, D{\\'a}vid and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{abbasi2014linear,\n\ttitle        = {Linear programming for large-scale {M}arkov decision problems},\n\tauthor       = {Abbasi-Yadkori, Yasin and Bartlett, Peter L and Malek, Alan},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1402.6763}\n}\n@article{abbe2015community,\n\ttitle        = {Community detection in general stochastic block models: fundamental limits and efficient recovery algorithms},\n\tauthor       = {Emmanuel Abbe and Colin Sandon},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{abbe2015detection,\n\ttitle        = {Detection in the stochastic block model with multiple clusters: proof of the achievability conjectures, acyclic {BP}, and the information-computation gap},\n\tauthor       = {Emmanuel Abbe and Colin Sandon},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@misc{abbe2017community,\n\ttitle        = {Community detection and stochastic block models: recent developments},\n\tauthor       = {Emmanuel Abbe},\n\tyear         = 2017,\n\teprint       = {1703.10146},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {math.PR}\n}\n@inproceedings{abbeel2004apprenticeship,\n\ttitle        = {Apprenticeship learning via inverse reinforcement learning},\n\tauthor       = {Pieter Abbeel and Andrew Ng},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{abbeel2006using,\n\ttitle        = {Using inaccurate models in reinforcement learning},\n\tauthor       = {P. Abbeel and M. Quigley and A. Y. 
Ng},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1--8}\n}\n@article{abe2003reinforcement,\n\ttitle        = {Reinforcement learning with immediate rewards and linear hypotheses},\n\tauthor       = {Abe, Naoki and Biermann, Alan W and Long, Philip M},\n\tyear         = 2003,\n\tjournal      = {Algorithmica},\n\tpublisher    = {Springer},\n\tvolume       = 37,\n\tnumber       = 4,\n\tpages        = {263--293}\n}\n@techreport{abel2017classical,\n\ttitle        = {Classical measurement error with several regressors},\n\tauthor       = {Andrew B Abel},\n\tyear         = 2017,\n\tinstitution  = {Working Paper}\n}\n@inproceedings{abelson2014poor,\n\ttitle        = {Targeting Direct Cash Transfers to the Extremely Poor},\n\tauthor       = {Brian Abelson and Kush R. Varshney and Joy Sun},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@article{abid2017contrastive,\n\ttitle        = {Contrastive principal component analysis},\n\tauthor       = {Abubakar Abid and Vivek K Bagaria and Martin J Zhang and James Zou},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.06716}\n}\n@article{abid2018exploring,\n\ttitle        = {Exploring patterns enriched in a dataset with contrastive principal component analysis},\n\tauthor       = {Abubakar Abid and Martin J Zhang and Vivek K Bagaria and James Zou},\n\tyear         = 2018,\n\tjournal      = {Nature Communications},\n\tvolume       = 9,\n\tnumber       = 1\n}\n@article{abid2021persistent,\n\ttitle        = {Persistent anti-muslim bias in large language models},\n\tauthor       = {Abubakar Abid and Maheen Farooqi and James Zou},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.05783}\n}\n@inproceedings{abiteboul1997querying,\n\ttitle        = {Querying semi-structured data},\n\tauthor       = {Serge Abiteboul},\n\tyear         = 1997,\n\tbooktitle    = 
{International Conference on Database Theory}\n}\n@article{abolafia2018neural,\n\ttitle        = {Neural Program Synthesis with Priority Queue Training},\n\tauthor       = {Daniel A Abolafia and Mohammad Norouzi and Quoc V Le},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.03526}\n}\n@techreport{abramson2004active,\n\ttitle        = {Active learning for visual object recognition},\n\tauthor       = {Yotam Abramson and Yoav Freund},\n\tyear         = 2004,\n\tinstitution  = {University of California, San Diego}\n}\n@book{absil2007optimization,\n\ttitle        = {Optimization Algorithms on Matrix Manifolds},\n\tauthor       = {Absil, P.A. and Mahony, R. and Sepulchre, R.},\n\tyear         = 2007,\n\tpublisher    = {Princeton University Press},\n\tisbn         = 9780691132983,\n\turl          = {https://books.google.com/books?id=gyaKmAEACAAJ},\n\tlccn         = 2007927538\n}\n@article{abu1990learning,\n\ttitle        = {Learning from hints in neural networks},\n\tauthor       = {Yaser S Abu-Mostafa},\n\tyear         = 1990,\n\tjournal      = {Journal of Complexity},\n\tvolume       = 6,\n\tnumber       = 2,\n\tpages        = {192--198}\n}\n@article{abujabal2018comqa,\n\ttitle        = {ComQA: A Community-sourced Dataset for Complex Factoid Question Answering with Paraphrase Clusters},\n\tauthor       = {Abdalghani Abujabal and Rishiraj Saha Roy and Mohamed Yahya and Gerhard Weikum},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.09528}\n}\n@article{acar2009unsupervised,\n\ttitle        = {Unsupervised multiway data analysis: A literature survey},\n\tauthor       = {Acar, Evrim and Yener, B{\\\"u}lent},\n\tyear         = 2009,\n\tjournal      = {Knowledge and Data Engineering, IEEE Transactions on},\n\tvolume       = 21,\n\tnumber       = 1,\n\tpages        = {6--20}\n}\n@inproceedings{achiam2017constrained,\n\ttitle        = {Constrained policy optimization},\n\tauthor       = {Achiam, Joshua and Held, David and Tamar, Aviv 
and Abbeel, Pieter},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {22--31},\n\torganization = {PMLR}\n}\n@article{achiam2019benchmarking,\n\ttitle        = {Benchmarking Safe Exploration in Deep Reinforcement Learning},\n\tauthor       = {Joshua Achiam and Dario Amodei},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{achlioptas2005spectral,\n\ttitle        = {On spectral learning of mixtures of distributions},\n\tauthor       = {Dimitris Achlioptas and Frank McSherry},\n\tyear         = 2005,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{ackermann2019reducing,\n\ttitle        = {Reducing overestimation bias in multi-agent domains using double centralized critics},\n\tauthor       = {Ackermann, Johannes and Gabler, Volker and Osa, Takayuki and Sugiyama, Masashi},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.01465}\n}\n@article{adamczak2011chevet,\n\ttitle        = {Chevet type inequality and norms of submatrices},\n\tauthor       = {Adamczak, Rados{\\l}aw and Lata{\\l}a, Rafa{\\l} and Litvak, Alexander E and Pajor, Alain and Tomczak-Jaegermann, Nicole},\n\tyear         = 2011,\n\tjournal      = {arXiv preprint arXiv:1107.4066}\n}\n@article{adamczak2015concentration,\n\ttitle        = {Concentration inequalities for non-{L}ipschitz functions with bounded derivatives of higher order},\n\tauthor       = {Rados{\\l{}}aw Adamczak and Pawe{\\l{}} Wolff},\n\tyear         = 2015,\n\tjournal      = {Probability Theory and Related Fields},\n\tvolume       = 162,\n\tpages        = {531--586}\n}\n@inproceedings{adel2016comparing,\n\ttitle        = {Comparing Convolutional Neural Networks to Traditional Models for Slot Filling},\n\tauthor       = {Heike Adel and Benjamin Roth and Hinrich Sch\\\"{u}tze},\n\tyear         = 2016,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics 
(HLT/NAACL)}\n}\n@article{adelman2008sixth,\n\ttitle        = {The sixth data release of the {S}loan digital sky survey},\n\tauthor       = {Jennifer K. Adelman-McCarthy and Marcel A. Ag{\\\"u}eros and Sahar S. Allam and Carlos Allende Prieto and Kurt S. J. Anderson and Scott F. Anderson and James Annis and Neta A. Bahcall and C. A. L. Bailer-Jones and Ivan K. Baldry and others},\n\tyear         = 2008,\n\tjournal      = {The Astrophysical Journal Supplement Series},\n\tvolume       = 175,\n\tnumber       = 2\n}\n@article{adhlw19,\n\ttitle        = {Fine-Grained Analysis of Optimization and Generalization for Overparameterized Two-Layer Neural Networks},\n\tauthor       = {Sanjeev Arora and Simon S. Du and Wei Hu and Zhiyuan Li and Ruosong Wang},\n\tyear         = 2019,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1901.08584},\n\turl          = {http://arxiv.org/abs/1901.08584},\n\tarchiveprefix = {arXiv},\n\teprint       = {1901.08584},\n\ttimestamp    = {Sat, 02 Feb 2019 16:56:00 +0100},\n\tbiburl       = {https://dblp.org/rec/bib/journals/corr/abs-1901-08584},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n@book{adler2009random,\n\ttitle        = {Random fields and geometry},\n\tauthor       = {Adler, Robert J and Taylor, Jonathan E},\n\tyear         = 2009,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{adler2012textexploration,\n\ttitle        = {Entailment-based Text Exploration with Application to the Health-care Domain},\n\tauthor       = {Meni Adler and Jonathan Berant and Ido Dagan},\n\tyear         = 2012,\n\tbooktitle    = {ACL system demonstrations}\n}\n@article{adler2016auditing,\n\ttitle        = {Auditing Black-box Models for Indirect Influence},\n\tauthor       = {Philip Adler and Casey Falk and Sorelle A Friedler and Gabriel Rybeck and Carlos Scheidegger and Brandon Smith and Suresh Venkatasubramanian},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 
arXiv:1602.07043}\n}\n@article{adomavicius2014bias,\n\ttitle        = {De-Biasing User Preference Ratings in Recommender Systems},\n\tauthor       = {Gediminas Adomavicius and Jesse Bockstedt and Shawn Curley and Jingjing Zhang},\n\tyear         = 2014,\n\tjournal      = {CEUR Workshop Proceedings},\n\tvolume       = 1253,\n\tpages        = {2--9}\n}\n@article{adragna2020fairness,\n\ttitle        = {Fairness and Robustness in Invariant Learning: A Case Study in Toxicity Classification},\n\tauthor       = {Robert Adragna and Elliot Creager and David Madras and Richard Zemel},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.06485}\n}\n@techreport{adriaans99shallow,\n\ttitle        = {Learning Shallow Context-Free Languages under Simple Distributions},\n\tauthor       = {Pieter W. Adriaans},\n\tyear         = 1999,\n\tinstitution  = {Stanford University}\n}\n@article{advani2017high,\n\ttitle        = {High-dimensional dynamics of generalization error in neural networks},\n\tauthor       = {Madhu S Advani and Andrew M Saxe},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.03667}\n}\n@inproceedings{afantenos2012developing,\n\ttitle        = {Developing a corpus of strategic conversation in The Settlers of Catan},\n\tauthor       = {Stergos Afantenos and Nicholas Asher and Farah Benamara and Anais Cadilhac and Cédric Dégremont and Pascal Denis and Markus Guhe and Simon Keizer and Alex Lascarides and Oliver Lemon and Philippe Muller and Soumya Paul and Verena Rieser and Laure Vieu},\n\tyear         = 2012,\n\tbooktitle    = {SeineDial 2012 - The 16th Workshop on the Semantics and Pragmatics of Dialogue}\n}\n@inproceedings{afantenos2012modelling,\n\ttitle        = {Modelling Strategic Conversation: Model, Annotation Design and Corpus},\n\tauthor       = {Stergos Afantenos and Nicholas Asher and Farah Benamara and Anais Cadilhac and Cedric Dégremont and Pascal Denis and Markus Guhe and Simon Keizer and Alex Lascarides and Oliver 
Lemon and others},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of SemDial 2012: Workshop on the Semantics and Pragmatics of Dialogue},\n\tpages        = {167--168}\n}\n@inproceedings{afsari2006simple,\n\ttitle        = {Simple {LU} and {QR} based non-orthogonal matrix joint diagonalization},\n\tauthor       = {Bijan Afsari},\n\tyear         = 2006,\n\tbooktitle    = {Independent Component Analysis and Blind Signal Separation},\n\tpages        = {1--7}\n}\n@article{afsari2008sensitivity,\n\ttitle        = {Sensitivity analysis for the problem of matrix joint diagonalization},\n\tauthor       = {B. Afsari},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 30,\n\tnumber       = 3,\n\tpages        = {1148--1171}\n}\n@inproceedings{agarwal09hybrid,\n\ttitle        = {Exponential Family Hybrid Learning},\n\tauthor       = {Arvind Agarwal and Hal {Daum{\\'e} III}},\n\tyear         = 2009,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{agarwal2005geometric,\n\ttitle        = {Geometric approximation via coresets},\n\tauthor       = {Agarwal, Pankaj K. 
and {Har-Peled}, Sariel and Varadarajan, Kasturi R.},\n\tyear         = 2005,\n\tjournal      = {Combinatorial and computational geometry},\n\tpublisher    = {Cambridge University Press New York},\n\tvolume       = 52,\n\tpages        = {1--30}\n}\n@inproceedings{agarwal2013selective,\n\ttitle        = {Selective sampling algorithms for cost-sensitive multiclass prediction},\n\tauthor       = {Alekh Agarwal},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1220--1228}\n}\n@inproceedings{agarwal2014taming,\n\ttitle        = {Taming the monster: A fast and simple algorithm for contextual bandits},\n\tauthor       = {Agarwal, Alekh and Hsu, Daniel and Kale, Satyen and Langford, John and Li, Lihong and Schapire, Robert},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1638--1646}\n}\n@article{agarwal2015multisection,\n\ttitle        = {Multisection in the stochastic block model using semidefinite programming},\n\tauthor       = {Naman Agarwal and Afonso S. 
Bandeira and Konstantinos Koiliaris and Alexandra Kolla},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{agarwal2016finding,\n\ttitle        = {Finding approximate local minima for nonconvex optimization in linear time},\n\tauthor       = {Agarwal, Naman and Allen-Zhu, Zeyuan and Bullins, Brian and Hazan, Elad and Ma, Tengyu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.01146}\n}\n@article{agarwal2016second,\n\ttitle        = {Second order stochastic optimization in linear time},\n\tauthor       = {Naman Agarwal and Brian Bullins and Elad Hazan},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.03943}\n}\n@misc{agarwal2017finding,\n\ttitle        = {Finding Approximate Local Minima Faster than Gradient Descent},\n\tauthor       = {Naman Agarwal and Zeyuan Allen-Zhu and Brian Bullins and Elad Hazan and Tengyu Ma},\n\tyear         = 2017,\n\teprint       = {1611.01146},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {math.OC}\n}\n@inproceedings{agarwal2018reductions,\n\ttitle        = {A Reductions Approach to Fair Classification},\n\tauthor       = {Alekh Agarwal and Alina Beygelzimer and Miroslav Dudik and John Langford and Hanna Wallach},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {60--69}\n}\n@article{agarwal2019learning,\n\ttitle        = {Learning to Generalize from Sparse and Underspecified Rewards},\n\tauthor       = {Rishabh Agarwal and Chen Liang and Dale Schuurmans and Mohammad Norouzi},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.07198}\n}\n@inproceedings{agarwal2019optimality,\n\ttitle        = {Optimality and Approximation with Policy Gradient Methods in {Markov} Decision Processes},\n\tauthor       = {Agarwal, Alekh and Kakade, Sham M and Lee, Jason D and Mahajan, Gaurav},\n\tyear         = 2020,\n\tmonth        = {09--12 Jul},\n\tbooktitle    = {Conference on Learning Theory},\n\tpublisher    = 
{PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 125,\n\tpages        = {64--66},\n\tpdf          = {http://proceedings.mlr.press/v125/agarwal20a/agarwal20a.pdf},\n\tabstract     = {Policy gradient (PG) methods are among the most effective methods in challenging reinforcement learning problems with large state and/or action spaces. However, little is known about even their most basic theoretical convergence properties, including: if and how fast they converge to a globally optimal solution (say with a sufficiently rich policy class); how they cope with approximation error due to using a restricted class of parametric policies; or their finite sample behavior. Such characterizations are important not only to compare these methods to their approximate value function counterparts (where such issues are relatively well understood, at least in the worst case), but also to help with more principled approaches to algorithm design. This work provides provable characterizations of computational, approximation, and sample size issues with regards to policy gradient methods in the context of discounted Markov Decision Processes (MDPs). We focus on both: 1) “tabular” policy parameterizations, where the optimal policy is contained in the class and where we show global convergence to the optimal policy, and 2) restricted policy classes, which may not contain the optimal policy and where we provide agnostic learning results. In the \\emph{tabular setting}, our main results are: 1) convergence rate to global optimum for direct parameterization and projected gradient ascent 2) an asymptotic convergence to global optimum for softmax policy parameterization and PG; and a convergence rate with additional entropy regularization, and 3) dimension-free convergence to global optimum for softmax policy parameterization and Natural Policy Gradient (NPG) method with exact gradients. 
In \\emph{function approximation}, we further analyze NPG with exact as well as inexact gradients under certain smoothness assumptions on the policy parameterization and establish rates of convergence in terms of the quality of the initial state distribution. One insight of this work is in formalizing how a favorable initial state distribution provides a means to circumvent worst-case exploration issues. Overall, these results place PG methods under a solid theoretical footing, analogous to the global convergence guarantees of iterative value function based algorithms.}\n}\n@article{agarwal2019reinforcement,\n\ttitle        = {Reinforcement learning: Theory and algorithms},\n\tauthor       = {Agarwal, Alekh and Jiang, Nan and Kakade, Sham M},\n\tyear         = 2019,\n\tjournal      = {CS Dept., UW Seattle, Seattle, WA, USA, Tech. Rep}\n}\n@article{agarwal2020disentangling,\n\ttitle        = {Disentangling Adaptive Gradient Methods from Learning Rates},\n\tauthor       = {Agarwal, Naman and Anil, Rohan and Hazan, Elad and Koren, Tomer and Zhang, Cyril},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.11803}\n}\n@article{agarwal2020flambe,\n\ttitle        = {FLAMBE: Structural complexity and representation learning of low rank MDPs},\n\tauthor       = {Agarwal, Alekh and Kakade, Sham and Krishnamurthy, Akshay and Sun, Wen},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.10814}\n}\n@inproceedings{agarwal2020pc,\n\ttitle        = {{PC-PG}: Policy cover directed exploration for provable policy gradient learning},\n\tauthor       = {Agarwal, Alekh and Henaff, Mikael and Kakade, Sham and Sun, Wen},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{AgarwalEtal:SparseCoding2013,\n\ttitle        = {{Learning Sparsely Used Overcomplete Dictionaries via Alternating Minimization}},\n\tauthor       = {A. Agarwal and A. Anandkumar and P. Jain and P. Netrapalli and R. 
Tandon},\n\tyear         = 2013,\n\tmonth        = {Oct.},\n\tjournal      = {Available on arXiv:1310.7991}\n}\n@inproceedings{agazzi1993connected,\n\ttitle        = {Connected and degraded text recognition using planar hidden {M}arkov models},\n\tauthor       = {Oscar E Agazzi and S-s Kuo and Esther Levin and Roberto Pieraccini},\n\tyear         = 1993,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tvolume       = 5,\n\tpages        = {113--116}\n}\n@inproceedings{agichtein2000snowball,\n\ttitle        = {Snowball: Extracting relations from large plain-text collections},\n\tauthor       = {Eugene Agichtein and Luis Gravano},\n\tyear         = 2000,\n\tbooktitle    = {Proceedings of the fifth ACM conference on Digital Libraries}\n}\n@inproceedings{agirre2014semeval,\n\ttitle        = {Sem{E}val-2014 {T}ask 10: Multilingual Semantic Textual Similarity},\n\tauthor       = {Eneko Agirre and Carmen Banea and Claire Cardie and Daniel M Cer and Mona T Diab and Aitor Gonzalez-Agirre and Weiwei Guo and Rada Mihalcea and German Rigau and Janyce Wiebe},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {81--91}\n}\n@inproceedings{agmr17,\n\ttitle        = {Provable learning of noisy-or networks},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Ma, Tengyu and Risteski, Andrej},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 49th Annual ACM SIGACT Symposium on Theory of Computing (STOC)},\n\tpages        = {1057--1066},\n\torganization = {ACM}\n}\n@incollection{agralwal2017optimistic,\n\ttitle        = {Optimistic posterior sampling for reinforcement learning: worst-case regret bounds},\n\tauthor       = {Agrawal, Shipra and Jia, Randy},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems 30},\n\tpublisher    = {Curran Associates, Inc.},\n\tpages        = {1184--1194},\n\turl          = 
{http://papers.nips.cc/paper/6718-optimistic-posterior-sampling-for-reinforcement-learning-worst-case-regret-bounds.pdf},\n\teditor       = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett}\n}\n@inproceedings{agrawal2012analysis,\n\ttitle        = {Analysis of thompson sampling for the multi-armed bandit problem},\n\tauthor       = {Agrawal, Shipra and Goyal, Navin},\n\tyear         = 2012,\n\tbooktitle    = {Conference on learning theory},\n\tpages        = {39--1}\n}\n@inproceedings{agrawal2013thompson,\n\ttitle        = {Thompson sampling for contextual bandits with linear payoffs},\n\tauthor       = {Agrawal, Shipra and Goyal, Navin},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {127--135}\n}\n@article{agrawal2015vqa1,\n\ttitle        = {{VQA}: Visual Question Answering},\n\tauthor       = {Aishwarya Agrawal and Jiasen Lu and Stanislaw Antol and Margaret Mitchell and C. 
Lawrence Zitnick and Devi Parikh and Dhruv Batra},\n\tyear         = 2015,\n\tjournal      = {International Journal of Computer Vision},\n\tvolume       = 123,\n\tpages        = {4--31}\n}\n@inproceedings{agrawal2016analyzing,\n\ttitle        = {Analyzing the behavior of visual question answering models},\n\tauthor       = {Aishwarya Agrawal and Dhruv Batra and Devi Parikh},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{agrawal2017discrete,\n\ttitle        = {Discrete Control Barrier Functions for Safety-Critical Control of Discrete Systems with Application to Bipedal Robot Navigation.},\n\tauthor       = {Agrawal, Ayush and Sreenath, Koushil},\n\tyear         = 2017,\n\tbooktitle    = {Robotics: Science and Systems}\n}\n@article{agrawal2017near,\n\ttitle        = {Near-optimal regret bounds for thompson sampling},\n\tauthor       = {Agrawal, Shipra and Goyal, Navin},\n\tyear         = 2017,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM New York, NY, USA},\n\tvolume       = 64,\n\tnumber       = 5,\n\tpages        = {1--24}\n}\n@inproceedings{agrawal2018don,\n\ttitle        = {Don't just assume; look and answer: Overcoming priors for visual question answering},\n\tauthor       = {Aishwarya Agrawal and Dhruv Batra and Devi Parikh and Aniruddha Kembhavi},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4971--4980}\n}\n@article{agrawala1970learning,\n\ttitle        = {Learning with a probabilistic teacher},\n\tauthor       = {Ashok K. 
Agrawala},\n\tyear         = 1970,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 16,\n\tpages        = {373--379}\n}\n@inproceedings{aguiar2006automatic,\n\ttitle        = {Automatic Learning of Articulated Skeletons from 3D Marker Trajectories},\n\tauthor       = {Edilson de Aguiar and Christian Theobalt and Hans-Peter Seidel},\n\tyear         = 2006,\n\tbooktitle    = {ISVC (1)},\n\tpages        = {485--494}\n}\n@inproceedings{agv,\n\ttitle        = {Simultaneous Hardcore Bits and Cryptography against Memory Attacks},\n\tauthor       = {Akavia, Adi and Goldwasser, Shafi and Vaikuntanathan, Vinod},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 6th Theory of Cryptography Conference on Theory of Cryptography},\n\tlocation     = {San Francisco, CA},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {Berlin, Heidelberg},\n\tseries       = {TCC '09},\n\tpages        = {474--495},\n\tdoi          = {10.1007/978-3-642-00457-5_28},\n\tisbn         = {978-3-642-00456-8},\n\turl          = {http://dx.doi.org/10.1007/978-3-642-00457-5_28},\n\tnumpages     = 22,\n\tacmid        = 1530469\n}\n@inproceedings{AH2016-nonconvex,\n\ttitle        = {{Variance Reduction for Faster Non-Convex Optimization}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Hazan, Elad},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{AH2016-reduction,\n\ttitle        = {{Optimal Black-Box Reductions Between Optimization Objectives}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Hazan, Elad},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 30th Conference on Neural Information Processing Systems},\n\tseries       = {NIPS~'16}\n}\n@inproceedings{ahadi2015exploring,\n\ttitle        = {Exploring machine learning methods to automatically identify students in need of assistance},\n\tauthor       = {Alireza Ahadi and Raymond Lister and Heikki Haapala and Arto Vihavainen},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of 
the eleventh annual International Conference on International Computing Education Research},\n\tpages        = {121--130}\n}\n@inproceedings{aharon2005k,\n\ttitle        = {K-SVD and its non-negative variant for dictionary design},\n\tauthor       = {Aharon, Michal and Elad, Michael and Bruckstein, Alfred M},\n\tyear         = 2005,\n\tbooktitle    = {Optics \\& Photonics 2005},\n\tpages        = {591411--591411},\n\torganization = {International Society for Optics and Photonics},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@article{aharon2006img,\n\ttitle        = {K-SVD: An algorithm for designing overcomplete dictionaries for sparse representation},\n\tauthor       = {Aharon, Michal and Elad, Michael and Bruckstein, Alfred},\n\tyear         = 2006,\n\tjournal      = {Signal Processing, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 54,\n\tnumber       = 11,\n\tpages        = {4311--4322},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@phdthesis{AharonThesis,\n\ttitle        = {Overcomplete Dictionaries for Sparse Representation of Signals},\n\tauthor       = {Michal Aharon},\n\tyear         = 2006,\n\tschool       = {Technion - Israel Institute of Technology},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{AHK,\n\ttitle        = {A method of moments for mixture models and hidden Markov models},\n\tauthor       = {A. Anandkumar and D. Hsu and S. Kakade},\n\tyear         = 2012,\n\tmonth        = {June},\n\tbooktitle    = {COLT}\n}\n@inproceedings{AHK12,\n\ttitle        = {A method of moments for mixture models and hidden {M}arkov models},\n\tauthor       = {Anima Anandkumar and Daniel Hsu and Sham M. 
Kakade},\n\tyear         = 2012,\n\tbooktitle    = {COLT}\n}\n@inproceedings{AHK2005,\n\ttitle        = {{Fast Algorithms for Approximate Semidefinite Programming using the Multiplicative Weights Update Method}},\n\tauthor       = {Arora, Sanjeev and Hazan, Elad and Kale, Satyen},\n\tyear         = 2005,\n\tbooktitle    = {46th Annual IEEE Symposium on Foundations of Computer Science (FOCS'05)},\n\tpublisher    = {IEEE},\n\tpages        = {339--348},\n\tdoi          = {10.1109/SFCS.2005.35},\n\tisbn         = {0-7695-2468-0},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Arora, Hazan, Kale - 2005 - Fast Algorithms for Approximate Semidefinite Programming using the Multiplicative Weights Update Method.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight,Algorithms/Multiplicative Weight/SDP}\n}\n@article{AHK2012,\n\ttitle        = {{The Multiplicative Weights Update Method: a Meta-Algorithm and Applications.}},\n\tauthor       = {Arora, Sanjeev and Hazan, Elad and Kale, Satyen},\n\tyear         = 2012,\n\tjournal      = {Theory of Computing},\n\tvolume       = 8,\n\tpages        = {121--164},\n\tdoi          = {10.4086/toc.2012.v008a006},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Arora, Hazan, Kale - 2012 - The Multiplicative Weights Update Method a Meta-Algorithm and Applications.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight}\n}\n@article{ahlszz18,\n\ttitle        = {Towards provable control for unknown linear dynamical systems},\n\tauthor       = {Arora, Sanjeev and Hazan, Elad and Lee, Holden and Singh, Karan and Zhang, Cyril and Zhang, Yi},\n\tyear         = 2018\n}\n@article{AhlWin02,\n\ttitle        = {Strong converse for identification via quantum channels},\n\tauthor       = {R.~Ahlswede and A.~Winter},\n\tyear         = 2002,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 48,\n\tnumber       = 3,\n\tpages        = 
{569--579}\n}\n@article{ahmad2010soilmoisture,\n\ttitle        = {Estimating soil moisture using remote sensing data: A machine learning approach},\n\tauthor       = {Sajjad Ahmad and Ajay Kalra and Haroon Stephen},\n\tyear         = 2010,\n\tjournal      = {Advances in Water Resources},\n\tvolume       = 33,\n\tnumber       = 1,\n\tpages        = {69--80}\n}\n@article{ahmadi2017dsos,\n\ttitle        = {{DSOS} and {SDSOS} optimization: more tractable alternatives to sum of squares and semidefinite optimization},\n\tauthor       = {Amir Ali Ahmadi and Anirudha Majumdar},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.02586}\n}\n@inproceedings{ahmed2018compilation,\n\ttitle        = {Compilation error repair: for the student programs, from the student programs},\n\tauthor       = {Umair Z Ahmed and Pawan Kumar and Amey Karkare and Purushottam Kar and Sumit Gulwani},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)}\n}\n@inproceedings{ahmed2019understanding,\n\ttitle        = {Understanding the impact of entropy on policy optimization},\n\tauthor       = {Ahmed, Zafarali and Le Roux, Nicolas and Norouzi, Mohammad and Schuurmans, Dale},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {151--160},\n\torganization = {PMLR}\n}\n@inproceedings{Ahmedetal12,\n\ttitle        = {Scalable inference in latent variable models},\n\tauthor       = {A. Ahmed and M. Aly and J. Gonzalez and S. Narayanamurthy and A. J. 
Smola},\n\tyear         = 2012,\n\tbooktitle    = {WSDM '12: Proceedings of the fifth ACM international conference on Web search and data mining},\n\tlocation     = {Seattle, Washington, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tpages        = {123--132},\n\tdoi          = {http://doi.acm.org/10.1145/2124295.2124312},\n\turl          = {http://dl.acm.org/authorize?6666391}\n}\n@inproceedings{ahn2004labeling,\n\ttitle        = {Labeling images with a computer game},\n\tauthor       = {Luis von Ahn and Laura A. Dabbish},\n\tyear         = 2004,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@inproceedings{ahuja2021empirical,\n\ttitle        = {Empirical or Invariant Risk Minimization? A Sample Complexity Perspective},\n\tauthor       = {Kartik Ahuja and Jun Wang and Amit Dhurandhar and Karthikeyan Shanmugam and Kush R. Varshney},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=jrA5GAccy_}\n}\n@article{ai2019haim,\n\ttitle        = {{HAIM}: A Modest Step Towards Controllable Text Generation},\n\tauthor       = {AI21},\n\tyear         = 2019,\n\tjournal      = {AI21 Labs Blog}\n}\n@misc{ai2020wordtune,\n\ttitle        = {Wordtune (accessed 2020 {O}ct 30)},\n\tauthor       = {AI21},\n\tyear         = 2020,\n\thowpublished = {\\url{https://www.wordtune.com/}}\n}\n@article{aijo2014methods,\n\ttitle        = {Methods for time series analysis of {RNA}-seq data with application to human {Th17} cell differentiation},\n\tauthor       = {Tarmo {\\\"A}ij{\\\"o} and Vincent Butty and Zhi Chen and Verna Salo and Subhash Tripathi and Christopher B Burge and Riitta Lahesmaa and Harri L{\\\"a}hdesm{\\\"a}ki},\n\tyear         = 2014,\n\tjournal      = {Bioinformatics},\n\tvolume       = 30,\n\tnumber       = 12\n}\n@inproceedings{airoldi2009mixed,\n\ttitle        = {Mixed membership stochastic blockmodels},\n\tauthor       
= {Airoldi, Edoardo M and Blei, David M and Fienberg, Stephen E and Xing, Eric P},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {33--40}\n}\n@inproceedings{AK01,\n\ttitle        = {Learning mixtures of arbitrary {G}aussians},\n\tauthor       = {S. Arora and R. Kannan},\n\tyear         = 2001,\n\tbooktitle    = {STOC}\n}\n@article{akaike74aic,\n\ttitle        = {A new look at the statistical model identification},\n\tauthor       = {Hirotugu Akaike},\n\tyear         = 1974,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tvolume       = 19,\n\tpages        = {716--723}\n}\n@article{akgun2012keyframe,\n\ttitle        = {Keyframe-based learning from demonstration},\n\tauthor       = {B. Akgun and M. Cakmak and K. Jiang and A. Thomaz},\n\tyear         = 2012,\n\tjournal      = {International Journal of Social Robotics (IJSR)},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = {343--355}\n}\n@article{akkaya2019solving,\n\ttitle        = {Solving rubik's cube with a robot hand},\n\tauthor       = {Akkaya, Ilge and Andrychowicz, Marcin and Chociej, Maciek and Litwin, Mateusz and McGrew, Bob and Petron, Arthur and Paino, Alex and Plappert, Matthias and Powell, Glenn and Ribas, Raphael and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.07113}\n}\n@article{akram2018leveraging,\n\ttitle        = {Leveraging unlabeled whole-slide-images for mitosis detection},\n\tauthor       = {Saad Ullah Akram and Talha Qaiser and Simon Graham and Juho Kannala and Janne Heikkil{\\\"a} and Nasir Rajpoot},\n\tyear         = 2018,\n\tjournal      = {Computational Pathology and Ophthalmic Medical Image Analysis},\n\tvolume       = 1,\n\tpages        = {69--77}\n}\n@article{al192,\n\ttitle        = {Can {SGD} Learn Recurrent Neural Networks with Provable Generalization?},\n\tauthor       = {Zeyuan Allen{-}Zhu and Yuanzhi Li},\n\tyear         = 2019,\n\tjournal      = 
{CoRR},\n\tvolume       = {abs/1902.01028},\n\turl          = {http://arxiv.org/abs/1902.01028},\n\tarchiveprefix = {arXiv},\n\teprint       = {1902.01028},\n\ttimestamp    = {Fri, 01 Mar 2019 17:14:13 +0100},\n\tbiburl       = {https://dblp.org/rec/bib/journals/corr/abs-1902-01028},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n@article{AL2016-kCCA,\n\ttitle        = {{Doubly Accelerated Methods for Faster CCA and Generalized Eigendecomposition}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Li, Yuanzhi},\n\tyear         = 2016,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1607.06017}\n}\n@inproceedings{AL2016-kSVD,\n\ttitle        = {{Even Faster SVD Decomposition Yet Without Agonizing Pain}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Li, Yuanzhi},\n\tyear         = 2016,\n\tbooktitle    = {NIPS}\n}\n@article{AL2016-onlinePCA,\n\ttitle        = {{Fast Global Convergence of Online PCA}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Li, Yuanzhi},\n\tyear         = 2016,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1607.07837}\n}\n@article{AL2016-PCR,\n\ttitle        = {{Faster Principal Component Regression via Optimal Polynomial Approximation to sgn(x)}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Li, Yuanzhi},\n\tyear         = 2016,\n\tmonth        = aug,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1608.04773}\n}\n@inproceedings{Alamgir2010,\n\ttitle        = {Multi-agent Random Walks for Local Clustering on Graphs},\n\tauthor       = {Alamgir, Morteza and von Luxburg, Ulrike},\n\tyear         = 2010,\n\tseries       = {ICDM '10},\n\tpages        = {18--27}\n}\n@article{alaoui2014fast,\n\ttitle        = {Fast randomized kernel methods with statistical guarantees},\n\tauthor       = {Alaoui, Ahmed El and Mahoney, Michael W},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1411.0306}\n}\n@article{albadawy2018tumor,\n\ttitle        = 
{Deep learning for segmentation of brain tumors: Impact of cross-institutional training and testing},\n\tauthor       = {EA AlBadawy and A Saha and MA Mazurowski},\n\tyear         = 2018,\n\tjournal      = {Med Phys.},\n\tvolume       = 45\n}\n@article{albuquerque2019generalizing,\n\ttitle        = {Generalizing to unseen domains via distribution matching},\n\tauthor       = {Isabela Albuquerque and João Monteiro and Mohammad Darvishi and Tiago H. Falk and Ioannis Mitliagkas},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.00804}\n}\n@article{aldous85exch,\n\ttitle        = {Exchangeability and related topics},\n\tauthor       = {D. Aldous},\n\tyear         = 1985,\n\tjournal      = {Springer Lecture Notes in Math},\n\tvolume       = 1117,\n\tpages        = {1--198}\n}\n@inproceedings{alekhnovich,\n\ttitle        = {More on Average Case vs Approximation Complexity},\n\tauthor       = {Alekhnovich, Michael},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the 44th Annual IEEE Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {FOCS '03},\n\tpages        = {298--},\n\tisbn         = {0-7695-2040-5},\n\turl          = {http://dl.acm.org/citation.cfm?id=946243.946338},\n\tacmid        = 946338\n}\n@article{alemi2016deep,\n\ttitle        = {Deep variational information bottleneck},\n\tauthor       = {Alexander A Alemi and Ian Fischer and Joshua V Dillon and Kevin Murphy},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.00410}\n}\n@inproceedings{alexandari2020maximum,\n\ttitle        = {Maximum likelihood with bias-corrected calibration is hard-to-beat at label shift adaptation},\n\tauthor       = {Amr Alexandari and Anshul Kundaje and Avanti Shrikumar},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = 
{222--232}\n}\n@inproceedings{alexandrescu2009graph,\n\ttitle        = {Graph-based learning for statistical machine translation},\n\tauthor       = {Andrei Alexandrescu and Katrin Kirchhoff},\n\tyear         = 2009,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {119--127}\n}\n@techreport{alfakih98embeddability,\n\ttitle        = {On the embeddability of weighted graphs in Euclidean spaces},\n\tauthor       = {A. Alfakih and H. Wolkowicz},\n\tyear         = 1998,\n\tinstitution  = {University of Waterloo}\n}\n@inproceedings{alfonseca2012pattern,\n\ttitle        = {Pattern learning for relation extraction with a hierarchical topic model},\n\tauthor       = {Enrique Alfonseca and Katja Filippova and Jean-Yves Delort and Guillermo Garrido},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {54--59}\n}\n@article{ali1966general,\n\ttitle        = {A General Class of Coefficients of Divergence of One Distribution from Another},\n\tauthor       = {S M Ali and Samuel David Silvey},\n\tyear         = 1966,\n\tjournal      = {Journal of the Royal Statistical Society. 
Series B (Methodological)},\n\tvolume       = 28\n}\n@inproceedings{ali2010automation,\n\ttitle        = {Automation of question generation from sentences},\n\tauthor       = {Husam Ali and Yllias Chali and Sadid A Hasan},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of QG2010: The Third Workshop on Question Generation},\n\tpages        = {58--67}\n}\n@article{all18,\n\ttitle        = {{Learning and Generalization in Overparameterized Neural Networks, Going Beyond Two Layers}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Li, Yuanzhi and Liang, Yingyu},\n\tyear         = 2018,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1811.04918}\n}\n@inproceedings{allamanis2015bimodal,\n\ttitle        = {Bimodal modelling of source code and natural language},\n\tauthor       = {Miltos Allamanis and Daniel Tarlow and Andrew Gordon and Yi Wei},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2123--2132}\n}\n@inproceedings{allamanis2018varmisuse,\n\ttitle        = {Learning to Represent Programs with Graphs},\n\tauthor       = {Miltiadis Allamanis and Marc Brockschmidt and Mahmoud Khademi},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{allemand2001polynomial,\n\ttitle        = {A polynomial case of unconstrained zero-one quadratic optimization},\n\tauthor       = {Kim Allemand and Komei Fukuda and Thomas M Liebling and Erich Steiner},\n\tyear         = 2001,\n\tjournal      = {Mathematical programming},\n\tvolume       = 91,\n\tnumber       = 1,\n\tpages        = {49--52}\n}\n@article{allen1980analyzing,\n\ttitle        = {Analyzing intention in utterances},\n\tauthor       = {James F Allen and C Raymond Perrault},\n\tyear         = 1980,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 15,\n\tnumber       = 3,\n\tpages        = {143--178}\n}\n@article{allen2001toward,\n\ttitle        = {Toward 
conversational human-computer interaction},\n\tauthor       = {James F Allen and Donna K Byron and Myroslava Dzikovska and George Ferguson and Lucian Galescu and Amanda Stent},\n\tyear         = 2001,\n\tjournal      = {AI magazine},\n\tvolume       = 22,\n\tnumber       = 4\n}\n@inproceedings{allen2007plow,\n\ttitle        = {{PLOW}: A collaborative task learning agent},\n\tauthor       = {James Allen and Nathanael Chambers and George Ferguson and Lucian Galescu and Hyuckchul Jung and Mary Swift and William Taysom},\n\tyear         = 2007,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1514--1519}\n}\n@book{allen2014reasoning,\n\ttitle        = {Reasoning about plans},\n\tauthor       = {James Allen and Henry Kautz and Richard Pelavin and Josh Tenenberg},\n\tyear         = 2014,\n\tpublisher    = {Morgan Kaufmann}\n}\n@article{allen2016first,\n\ttitle        = {First Efficient Convergence for Streaming k-{PCA}: a Global, Gap-Free, and Near-Optimal Rate},\n\tauthor       = {Allen-Zhu, Zeyuan and Li, Yuanzhi},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1607.07837}\n}\n@article{allen2017natasha,\n\ttitle        = {Natasha 2: Faster non-convex optimization than {SGD}},\n\tauthor       = {Allen-Zhu, Zeyuan},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.08694}\n}\n@article{allen2018convergence,\n\ttitle        = {On the convergence rate of training recurrent neural networks},\n\tauthor       = {Allen-Zhu, Zeyuan and Li, Yuanzhi and Song, Zhao},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.12065}\n}\n@article{allen2018convergencetheory,\n\ttitle        = {A Convergence Theory for Deep Learning via Over-Parameterization},\n\tauthor       = {Allen-Zhu, Zeyuan and Li, Yuanzhi and Song, Zhao},\n\tyear         = 2018,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1811.03962}\n}\n@article{allen2019can,\n\ttitle        = {What can 
resnet learn efficiently, going beyond kernels?},\n\tauthor       = {Allen-Zhu, Zeyuan and Li, Yuanzhi},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.10337}\n}\n@article{Allenzhu2016Katyusha,\n\ttitle        = {{Katyusha: The First Direct Acceleration of Stochastic Gradient Methods}},\n\tauthor       = {{Allen-Zhu}, Zeyuan},\n\tyear         = 2016,\n\tmonth        = mar,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1603.05953}\n}\n@article{allman11identifiability,\n\ttitle        = {Identifiability of 2-tree mixtures for group-based models},\n\tauthor       = {Elizabeth S. Allman and Sonja Petrovi{\\'c} and John A. Rhodes and Seth Sullivant},\n\tyear         = 2011,\n\tjournal      = {Transactions on Computational Biology and Bioinformatics},\n\tvolume       = 8,\n\tpages        = {710--722}\n}\n@inproceedings{ALO-bss,\n\ttitle        = {{Spectral Sparsification and Regret Minimization Beyond Multiplicative Updates}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Liao, Zhenyu and Orecchia, Lorenzo},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 47th Annual ACM Symposium on Theory of Computing},\n\tseries       = {STOC~'15}\n}\n@inproceedings{ALO-sdp-parallel,\n\ttitle        = {Using Optimization to Obtain a Width-Independent, Parallel, Simpler, and Faster Positive {SDP} Solver},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Lee, Yin Tat and Orecchia, Lorenzo},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 27th ACM-SIAM Symposium on Discrete Algorithms},\n\tseries       = {SODA~'16}\n}\n@article{alon1999space,\n\ttitle        = {The space complexity of approximating the frequency moments},\n\tauthor       = {Alon, Noga and Matias, Yossi and Szegedy, Mario},\n\tyear         = 1999,\n\tjournal      = {Journal of Computer and system sciences},\n\tpublisher    = {Elsevier},\n\tvolume       = 58,\n\tnumber       = 1,\n\tpages        = {137--147}\n}\n@article{alon2006approximating,\n\ttitle        = {Approximating 
the cut-norm via {G}rothendieck's inequality},\n\tauthor       = {N. Alon and A. Naor},\n\tyear         = 2006,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 35,\n\tnumber       = 4,\n\tpages        = {787--803}\n}\n@article{Alon86,\n\ttitle        = {Eigenvalues and expanders},\n\tauthor       = {Noga Alon},\n\tyear         = 1986,\n\tjournal      = {Combinatorica},\n\tvolume       = 6,\n\tnumber       = 2,\n\tpages        = {83--96}\n}\n@article{alphago16,\n\ttitle        = {Mastering the game of {G}o with deep neural networks and tree search},\n\tauthor       = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},\n\tyear         = 2016,\n\tjournal      = {Nature},\n\tpublisher    = {Nature Research},\n\tvolume       = 529,\n\tnumber       = 7587,\n\tpages        = {484--489}\n}\n@article{alphago17,\n\ttitle        = {Mastering the game of {G}o without human knowledge},\n\tauthor       = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and Guez, Arthur and Hubert, Thomas and Baker, Lucas and Lai, Matthew and Bolton, Adrian and others},\n\tyear         = 2017,\n\tjournal      = {Nature},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 550,\n\tnumber       = 7676,\n\tpages        = 354\n}\n@inproceedings{alshawi11nlf,\n\ttitle        = {Deterministic Statistical Mapping of Sentences to Underspecified Semantics},\n\tauthor       = {Hiyan Alshawi and Pi-Chuan Chang and Michael Ringgaard},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Compositional Semantics (IWCS)},\n\tpages        = {15--24}\n}\n@inproceedings{alshiekh2018safe,\n\ttitle        = {Safe reinforcement learning via shielding},\n\tauthor       = {Alshiekh, Mohammed and Bloem, Roderick and Ehlers, R{\\\"u}diger and 
K{\\\"o}nighofer, Bettina and Niekum, Scott and Topcu, Ufuk},\n\tyear         = 2018,\n\tbooktitle    = {Thirty-Second AAAI Conference on Artificial Intelligence}\n}\n@inproceedings{alterovitz2011rapidly,\n\ttitle        = {Rapidly-exploring roadmaps: Weighing exploration vs. refinement in optimal motion planning},\n\tauthor       = {R. Alterovitz and S. Patil and A. Derbakova},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@article{altham1973,\n\ttitle        = {Rawls' Difference Principle},\n\tauthor       = {JE J. Altham},\n\tyear         = 1973,\n\tjournal      = {Philosophy},\n\tvolume       = 48,\n\tpages        = {75--78}\n}\n@book{altman1999constrained,\n\ttitle        = {Constrained Markov decision processes},\n\tauthor       = {Altman, Eitan},\n\tyear         = 1999,\n\tpublisher    = {CRC Press},\n\tvolume       = 7\n}\n@inproceedings{AltTensorDecomp:COLT2015,\n\ttitle        = {{Learning Overcomplete Latent Variable Models through Tensor Methods}},\n\tauthor       = {A. Anandkumar and R. Ge and M. 
Janzamin},\n\tyear         = 2015,\n\tmonth        = jul,\n\tbooktitle    = {Proceedings of the Conference on Learning Theory (COLT)},\n\taddress      = {Paris, France}\n}\n@article{AltTensorDecomp2014,\n\ttitle        = {{Guaranteed Non-Orthogonal Tensor Decomposition via Alternating Rank-$1$ Updates}},\n\tauthor       = {Anima Anandkumar and Rong Ge and Majid Janzamin},\n\tyear         = 2014,\n\tmonth        = feb,\n\tjournal      = {arXiv preprint arXiv:1402.5180}\n}\n@inproceedings{ALY2016-geometry,\n\ttitle        = {{Optimization Algorithms for Faster Computational Geometry}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Liao, Zhenyu and Yuan, Yang},\n\tyear         = 2016,\n\tbooktitle    = {ICALP}\n}\n@inproceedings{alzantot2018adversarial,\n\ttitle        = {Generating Natural Language Adversarial Examples},\n\tauthor       = {Moustafa Alzantot and Yash Sharma and Ahmed Elgohary and Bo-Jhang Ho and Mani Srivastava and Kai-Wei Chang},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{AM05,\n\ttitle        = {On Spectral Learning of Mixtures of Distributions},\n\tauthor       = {D. Achlioptas and F. 
McSherry},\n\tyear         = 2005,\n\tbooktitle    = {COLT}\n}\n@article{amari1998natural,\n\ttitle        = {Natural gradient works efficiently in learning},\n\tauthor       = {Amari, Shun-Ichi},\n\tyear         = 1998,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 10,\n\tnumber       = 2,\n\tpages        = {251--276}\n}\n@article{amari2002geometrical,\n\ttitle        = {Geometrical singularities in the neuromanifold of multilayer perceptrons},\n\tauthor       = {Amari, Shun-ichi and Park, Hyeyoung and Ozeki, Tomoko},\n\tyear         = 2002,\n\tjournal      = {Advances in neural information processing systems},\n\tvolume       = 1,\n\tpages        = {343--350}\n}\n@article{amari2006singularities,\n\ttitle        = {Singularities affect dynamics of learning in neuromanifolds},\n\tauthor       = {Amari, Shun-Ichi and Park, Hyeyoung and Ozeki, Tomoko},\n\tyear         = 2006,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 18,\n\tnumber       = 5,\n\tpages        = {1007--1065}\n}\n@inproceedings{amato2018decision,\n\ttitle        = {Decision-Making Under Uncertainty in Multi-Agent and Multi-Robot Systems: Planning and Learning.},\n\tauthor       = {Amato, Christopher},\n\tyear         = 2018,\n\tbooktitle    = {IJCAI},\n\tpages        = {5662--5666}\n}\n@inproceedings{ambainis2000quantum,\n\ttitle        = {Quantum lower bounds by quantum arguments},\n\tauthor       = {Ambainis, Andris},\n\tyear         = 2000,\n\tbooktitle    = {Proceedings of the thirty-second annual ACM symposium on Theory of computing},\n\tpages        = {636--643},\n\torganization = {ACM}\n}\n@article{amelunxen2014living,\n\ttitle        = {Living on the edge: Phase transitions in convex programs with random data},\n\tauthor       = {Amelunxen, Dennis and Lotz, Martin and McCoy, Michael B and Tropp, Joel A},\n\tyear         = 2014,\n\tjournal      = {Information and Inference: A Journal of the IMA},\n\tpublisher 
   = {OUP},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {224--294}\n}\n@inproceedings{amershi2015modeltracker,\n\ttitle        = {Modeltracker: Redesigning performance analysis tools for machine learning},\n\tauthor       = {Saleema Amershi and Max Chickering and Steven M Drucker and Bongshin Lee and Patrice Simard and Jina Suh},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)},\n\tpages        = {337--346}\n}\n@inproceedings{ames2019control,\n\ttitle        = {Control barrier functions: Theory and applications},\n\tauthor       = {Ames, Aaron D and Coogan, Samuel and Egerstedt, Magnus and Notomista, Gennaro and Sreenath, Koushil and Tabuada, Paulo},\n\tyear         = 2019,\n\tbooktitle    = {2019 18th European Control Conference (ECC)},\n\tpages        = {3420--3431},\n\torganization = {IEEE}\n}\n@inproceedings{amini2003semisupervised,\n\ttitle        = {Semi-Supervised Learning with Explicit Misclassification Modeling},\n\tauthor       = {Massih-Reza Amini and Patrick Gallinari},\n\tyear         = 2003,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{amit2007uncovering,\n\ttitle        = {Uncovering shared structures in multiclass classification},\n\tauthor       = {Amit, Yonatan and Fink, Michael and Srebro, Nathan and Ullman, Shimon},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 24th international conference on Machine learning},\n\tpages        = {17--24},\n\torganization = {ACM}\n}\n@inproceedings{amodei2016,\n\ttitle        = {Deep Speech 2 End to End Speech Recognition in {E}nglish and Mandarin},\n\tauthor       = {Dario Amodei and others},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {173--182}\n}\n@article{amodei2016concrete,\n\ttitle        = {Concrete problems in {AI} safety},\n\tauthor       = {Dario Amodei and Chris Olah and Jacob Steinhardt and 
Paul Christiano and John Schulman and Dan Mané},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.06565}\n}\n@inproceedings{amorim2018automated,\n\ttitle        = {Automated essay scoring in the presence of biased ratings},\n\tauthor       = {Evelin Amorim and Marcia Can{\\c{c}}ado and Adriano Veloso},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {229--237}\n}\n@inproceedings{amos2017input,\n\ttitle        = {Input convex neural networks},\n\tauthor       = {Amos, Brandon and Xu, Lei and Kolter, J Zico},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {146--155},\n\torganization = {PMLR}\n}\n@article{AMP2010,\n\ttitle        = {The dynamics of message passing on dense graphs, with applications to compressed sensing},\n\tauthor       = {Mohsen Bayati and Andrea Montanari},\n\tyear         = 2010,\n\tmonth        = jan,\n\tjournal      = {arXiv preprint arXiv:1001.3448}\n}\n@article{AMR09,\n\ttitle        = {{Identifiability of parameters in latent structure models with many observed variables}},\n\tauthor       = {E. S. Allman and C. Matias and J. A. Rhodes},\n\tyear         = 2009,\n\tjournal      = {The Annals of Statistics},\n\tvolume       = 37,\n\tnumber       = {6A},\n\tpages        = {3099--3132}\n}\n@article{anand2012semantic,\n\ttitle        = {Contextually Guided Semantic Labeling and Search for 3{D} Point Clouds},\n\tauthor       = {A. Anand and H. Koppula and T. Joachims and A. Saxena},\n\tyear         = 2012,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 32\n}\n@inproceedings{anandkumar11tree,\n\ttitle        = {Spectral Methods for Learning Multivariate Latent Tree Structure},\n\tauthor       = {Animashree Anandkumar and Kamalika Chaudhuri and Daniel Hsu and Sham M. 
Kakade and Le Song and Tong Zhang},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{anandkumar12lda,\n\ttitle        = {Two {SVD}s Suffice: Spectral decompositions for probabilistic topic modeling and latent {D}irichlet allocation},\n\tauthor       = {Animashree Anandkumar and Dean P. Foster and Daniel Hsu and Sham M. Kakade and Yi-Kai Liu},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{anandkumar12moments,\n\ttitle        = {A Method of Moments for Mixture Models and Hidden {M}arkov Models},\n\tauthor       = {Animashree Anandkumar and Daniel Hsu and Sham M. Kakade},\n\tyear         = 2012,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{anandkumar13linear,\n\ttitle        = {Learning Linear {B}ayesian Networks with Latent Variables},\n\tauthor       = {Animashree Anandkumar and Daniel Hsu and Adel Javanmard and Sham M. Kakade},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{anandkumar13tensor,\n\ttitle        = {Tensor decompositions for learning latent variable models},\n\tauthor       = {Anima Anandkumar and Rong Ge and Daniel Hsu and Sham M. Kakade and Matus Telgarsky},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@inproceedings{anandkumar2013community,\n\ttitle        = {A Tensor Spectral Approach to Learning Mixed Membership Community Models},\n\tauthor       = {Animashree Anandkumar and Rong Ge and Daniel Hsu and Sham Kakade},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {867--881}\n}\n@article{anandkumar2013overcomplete,\n\ttitle        = {When are Overcomplete Representations Identifiable? 
Uniqueness of Tensor Decompositions Under Expansion Constraints},\n\tauthor       = {Animashree Anandkumar and Daniel Hsu and Majid Janzamin and Sham Kakade},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@inproceedings{anandkumar2015learning,\n\ttitle        = {Learning overcomplete latent variable models through tensor methods},\n\tauthor       = {Anandkumar, Animashree and Ge, Rong and Janzamin, Majid},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Conference on Learning Theory (COLT), Paris, France}\n}\n@article{anandkumar2016analyzing,\n\ttitle        = {Analyzing tensor power method dynamics in overcomplete regime},\n\tauthor       = {Anandkumar, Anima and Ge, Rong and Janzamin, Majid},\n\tyear         = 2016,\n\tjournal      = {JMLR}\n}\n@inproceedings{anandkumar2016efficient,\n\ttitle        = {Efficient approaches for escaping higher order saddle points in non-convex optimization},\n\tauthor       = {Anandkumar, Animashree and Ge, Rong},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.05908},\n\tbooktitle    = {Conference on learning theory},\n\tpages        = {81--102},\n\torganization = {PMLR}\n}\n@inproceedings{AnandkumarEtal:community12,\n\ttitle        = {{A Tensor Spectral Approach to Learning Mixed Membership Community Models}},\n\tauthor       = {A. Anandkumar and R. Ge and D. Hsu and S. M. Kakade},\n\tyear         = 2013,\n\tmonth        = jun,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{AnandkumarEtal:communityimplementation13,\n\ttitle        = {{Fast Detection of Overlapping Communities via Online Tensor Methods}},\n\tauthor       = {F. Huang and U. N. Niranjan and M. Hakeem and A. Anandkumar},\n\tyear         = 2013,\n\tmonth        = sep,\n\tjournal      = {ArXiv 1309.0787}\n}\n@article{AnandkumarEtal:lda12,\n\ttitle        = {{Two SVDs Suffice: Spectral Decompositions for Probabilistic Topic Modeling and Latent Dirichlet Allocation}},\n\tauthor       = {A. 
Anandkumar and D. P. Foster and D. Hsu and S. M. Kakade and Y. K. Liu},\n\tyear         = 2013,\n\tmonth        = jul,\n\tjournal      = {to appear in the special issue of Algorithmica on New Theoretical Challenges in Machine Learning},\n\tnote         = {arXiv:1204.6703},\n\teprint       = {arXiv:1204.6703}\n}\n@inproceedings{AnandkumarEtal:NIPS13,\n\ttitle        = {{When are Overcomplete Topic Models Identifiable? Uniqueness of Tensor Tucker Decompositions with Structured Sparsity}},\n\tauthor       = {A. Anandkumar and D. Hsu and M. Janzamin and S. M. Kakade},\n\tyear         = 2013,\n\tmonth        = dec,\n\tbooktitle    = {Neural Information Processing (NIPS)}\n}\n@article{AnandkumarEtal:tensor12,\n\ttitle        = {{Tensor Methods for Learning Latent Variable Models}},\n\tauthor       = {A. Anandkumar and R. Ge and D. Hsu and S. M. Kakade and M. Telgarsky},\n\tyear         = 2012,\n\tmonth        = oct,\n\tjournal      = {Available at arXiv:1210.7559}\n}\n@inproceedings{AnandkumarHsuKakade:graphmixturesNIPS12,\n\ttitle        = {Learning Mixtures of Tree Graphical Models},\n\tauthor       = {A. Anandkumar and D. Hsu and F. Huang and S.M. Kakade},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems 25}\n}\n@mastersthesis{anca2009math,\n\ttitle        = {Natural Language and Mathematics Processing for Applicable Theorem Search},\n\tauthor       = {Stefan Anca},\n\tyear         = 2009,\n\tschool       = {Jacobs University Bremen}\n}\n@book{andersen1995linear,\n\ttitle        = {Linear and graphical models for the multivariate complex normal distribution},\n\tauthor       = {\n\t\tHeidi H. 
Andersen and Malene Hojbjerre and Dorte Sorensen and Poul\n\n\t\tSvante Eriksen\n\t},\n\tyear         = 1995,\n\tpublisher    = {Springer-Verlag},\n\tseries       = {Lecture notes in statistics},\n\tisbn         = 9780387945217,\n\tlccn         = 95019290,\n\towner        = {leili},\n\ttimestamp    = {2010.11.13}\n}\n@inproceedings{AndersenLang06WWW,\n\ttitle        = {Communities from seed sets},\n\tauthor       = {Andersen, Reid and Lang, Kevin J.},\n\tyear         = 2006,\n\tseries       = {WWW '06},\n\tpages        = {223--232}\n}\n@inproceedings{AndersenLang2008,\n\ttitle        = {An algorithm for improving graph partitions},\n\tauthor       = {Andersen, Reid and Lang, Kevin J.},\n\tyear         = 2008,\n\tseries       = {SODA},\n\tpages        = {651--660}\n}\n@inproceedings{AndersenPeres09,\n\ttitle        = {Finding sparse cuts locally using evolving sets},\n\tauthor       = {Reid Andersen and Yuval Peres},\n\tyear         = 2009,\n\tseries       = {STOC}\n}\n@article{anderson1949estimation,\n\ttitle        = {Estimation of the parameters of a single equation in a complete system of stochastic equations},\n\tauthor       = {Theodore W. Anderson and Herman Rubin},\n\tyear         = 1949,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpages        = {46--63}\n}\n@article{anderson1950asymptotic,\n\ttitle        = {The asymptotic properties of estimates of the parameters of a single equation in a complete system of stochastic equations},\n\tauthor       = {Theodore W. Anderson and Herman Rubin},\n\tyear         = 1950,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpages        = {570--582}\n}\n@book{anderson1979optimal,\n\ttitle        = {Optimal Filtering},\n\tauthor       = {Brian D. O. Anderson and John B. 
Moore},\n\tyear         = 1979,\n\tpublisher    = {Prentice Hall},\n\taddress      = {New York}\n}\n@article{Anderson2014,\n\ttitle        = {{An Efficient Algorithm for Unweighted Spectral Graph Sparsification}},\n\tauthor       = {Anderson, David G. and Gu, Ming and Melgaard, Christopher},\n\tyear         = 2014,\n\tmonth        = oct,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1410.4273},\n\turl          = {http://arxiv.org/abs/1410.4273v1},\n\teprint       = {1410.4273}\n}\n@inproceedings{anderson2014blessing,\n\ttitle        = {The more, the merrier: the blessing of dimensionality for learning large {G}aussian mixtures},\n\tauthor       = {Joseph Anderson and Mikhail Belkin and Navin Goyal and Luis Rademacher and James R. Voss},\n\tyear         = 2014,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{anderson2015spectral,\n\ttitle        = {{Spectral Gap Error Bounds for Improving CUR Matrix Decomposition and the Nystr\\\"{o}m Method}},\n\tauthor       = {David Anderson and Simon Du and Michael Mahoney and Christopher Melgaard and Kunming Wu and Ming Gu},\n\tyear         = 2015,\n\tmonth        = {09--12 May},\n\tbooktitle    = {Proceedings of the Eighteenth International Conference on Artificial Intelligence and Statistics},\n\tpublisher    = {PMLR},\n\taddress      = {San Diego, California, USA},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 38,\n\tpages        = {19--27},\n\turl          = {http://proceedings.mlr.press/v38/anderson15.html},\n\teditor       = {Guy Lebanon and S. V. N. Vishwanathan},\n\tpdf          = {http://proceedings.mlr.press/v38/anderson15.pdf},\n\tabstract     = {The CUR matrix decomposition and the related Nyström method build low-rank approximations of data matrices by selecting a small number of representative rows and columns of the data. 
Here, we introduce novel \\emph{spectral gap} error bounds that judiciously exploit the potentially rapid spectrum decay in the input matrix, a most common occurrence in machine learning and data analysis. Our error bounds are much tighter than existing ones for matrices with rapid spectrum decay, and they justify the use of a constant amount of oversampling relative to the rank parameter k, i.e., when the number of columns/rows is \\ell=k+ O(1). We demonstrate our analysis on a novel deterministic algorithm, \\emph{StableCUR}, which additionally eliminates a previously unrecognized source of potential instability in CUR decompositions. While our algorithm accepts any method of row and column selection, we implement it with a recent column selection scheme with strong singular value bounds. Empirical results on various classes of real world data matrices demonstrate that our algorithm is as efficient as and often outperforms competing algorithms.}\n}\n@inproceedings{anderson2018butd,\n\ttitle        = {Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering},\n\tauthor       = {Peter Anderson and X. He and C. 
Buehler and Damien Teney and Mark Johnson and Stephen Gould and Lei Zhang},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {6077--6086}\n}\n@inproceedings{anderson2018vision,\n\ttitle        = {Vision-and-language navigation: Interpreting visually-grounded navigation instructions in real environments},\n\tauthor       = {Peter Anderson and Qi Wu and Damien Teney and Jake Bruce and Mark Johnson and Niko S{\\\"u}nderhauf and Ian Reid and Stephen Gould and Anton van den Hengel},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{anderson2020neurosymbolic,\n\ttitle        = {Neurosymbolic reinforcement learning with formally verified exploration},\n\tauthor       = {Anderson, Greg and Verma, Abhinav and Dillig, Isil and Chaudhuri, Swarat},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.12612}\n}\n@inproceedings{ando07,\n\ttitle        = {Two-view feature generation model for semi-supervised learning},\n\tauthor       = {R. Ando and T. 
Zhang},\n\tyear         = 2007,\n\tbooktitle    = {ICML}\n}\n@inproceedings{ando2007two,\n\ttitle        = {Two-view feature generation model for semi-supervised learning},\n\tauthor       = {Rie Kubota Ando and Tong Zhang},\n\tyear         = 2007,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {25--32}\n}\n@inproceedings{ando2017deep,\n\ttitle        = {Deep over-sampling framework for classifying imbalanced data},\n\tauthor       = {Ando, Shin and Huang, Chun Yuan},\n\tyear         = 2017,\n\tbooktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},\n\tpages        = {770--785}\n}\n@phdthesis{Andoni2009thesis,\n\ttitle        = {Nearest Neighbor Search: the Old, the New, and the Impossible},\n\tauthor       = {Andoni, Alexandr},\n\tyear         = 2009,\n\tschool       = {MIT}\n}\n@inproceedings{andoni2014learning,\n\ttitle        = {Learning sparse polynomial functions},\n\tauthor       = {Andoni, Alexandr and Panigrahy, Rina and Valiant, Gregory and Zhang, Li},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the Twenty-Fifth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {500--510},\n\torganization = {Society for Industrial and Applied Mathematics}\n}\n@article{andor2016globally,\n\ttitle        = {Globally normalized transition-based neural networks},\n\tauthor       = {Daniel Andor and Chris Alberti and David Weiss and Aliaksei Severyn and Alessandro Presta and Kuzman Ganchev and Slav Petrov and Michael Collins},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1603.06042}\n}\n@inproceedings{andre2002state,\n\ttitle        = {State abstraction for programmable reinforcement learning agents},\n\tauthor       = {David Andre and Stuart J Russell},\n\tyear         = 2002,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {119--125}\n}\n@phdthesis{andre2003programmable,\n\ttitle        = 
{Programmable reinforcement learning agents},\n\tauthor       = {D. Andre},\n\tyear         = 2003,\n\tschool       = {University of California, Berkeley}\n}\n@article{andreas2013generative,\n\ttitle        = {A Generative Model of Vector Space Semantics},\n\tauthor       = {Andreas, Jacob and Ghahramani, Zoubin},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics}\n}\n@inproceedings{andreas2014grounding,\n\ttitle        = {Grounding Language with Points and Paths in Continuous Spaces},\n\tauthor       = {Jacob Andreas and Dan Klein},\n\tyear         = 2014,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {58--67}\n}\n@inproceedings{andreas2014when,\n\ttitle        = {When and why are log-linear models self-normalizing?},\n\tauthor       = {Jacob Andreas and Dan Klein},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the Annual Meeting of the North American Chapter of the Association for Computational Linguistics}\n}\n@inproceedings{andreas2015alignment,\n\ttitle        = {Alignment-Based Compositional Semantics for Instruction Following},\n\tauthor       = {Jacob Andreas and Dan Klein},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{andreas2016learning,\n\ttitle        = {Learning to Compose Neural Networks for Question Answering},\n\tauthor       = {Jacob  Andreas and Marcus   Rohrbach and Trevor   Darrell and Dan  Klein},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1545--1554}\n}\n@inproceedings{andreas2016neural,\n\ttitle        = {Neural module networks},\n\tauthor       = {Jacob Andreas and Marcus Rohrbach and Trevor Darrell and Dan Klein},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{andreas2016reasoning,\n\ttitle        = {Reasoning about Pragmatics with 
Neural Listeners and Speakers},\n\tauthor       = {Jacob  Andreas and Dan  Klein},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1173--1182}\n}\n@article{andreas2017learning,\n\ttitle        = {Learning with Latent Language},\n\tauthor       = {Jacob Andreas and Dan Klein and Sergey Levine},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.00482}\n}\n@inproceedings{andreas2017sketches,\n\ttitle        = {Modular Multitask Reinforcement Learning with Policy Sketches},\n\tauthor       = {Jacob Andreas and Dan Klein and Sergey Levine},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{andreas2020geca,\n\ttitle        = {Good-Enough Compositional Data Augmentation},\n\tauthor       = {Jacob Andreas},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{andreassen2021evolution,\n\ttitle        = {The Evolution of Out-of-Distribution Robustness Throughout Fine-Tuning},\n\tauthor       = {Anders Andreassen and Yasaman Bahri and Behnam Neyshabur and Rebecca Roelofs},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@inproceedings{andrew2013deep,\n\ttitle        = {Deep canonical correlation analysis},\n\tauthor       = {Galen Andrew and Raman Arora and Jeff Bilmes and Karen Livescu},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1247--1255}\n}\n@inproceedings{andrews2012name,\n\ttitle        = {Name phylogeny: A generative model of string variation},\n\tauthor       = {Nicholas Andrews and Jason Eisner and Mark Dredze},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {344--355}\n}\n@inproceedings{andrieu2005line,\n\ttitle        = {On-line Parameter Estimation in General State-Space Models},\n\tauthor       = {Andrieu, 
C. and Doucet, A. and Tadic, V.},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 44th Conference on Decision and Control},\n\tpages        = {332--337}\n}\n@article{andrieu2008tutorial,\n\ttitle        = {A tutorial on adaptive {MCMC}},\n\tauthor       = {Christophe Andrieu and Johannes Thoms},\n\tyear         = 2008,\n\tjournal      = {Statistics and Computing},\n\tvolume       = 18,\n\tnumber       = 4,\n\tpages        = {343--373}\n}\n@article{andrieu2010particle,\n\ttitle        = {Particle {M}arkov chain {M}onte {C}arlo methods},\n\tauthor       = {Christophe Andrieu and Arnaud Doucet and Roman Holenstein},\n\tyear         = 2010,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tvolume       = 72,\n\tnumber       = 3,\n\tpages        = {269--342}\n}\n@article{androutsopoulos2010survey,\n\ttitle        = {A survey of paraphrasing and textual entailment methods},\n\tauthor       = {Ion Androutsopoulos and Prodromos Malakasiotis},\n\tyear         = 2010,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 38,\n\tpages        = {135--187}\n}\n@article{androutsopoulos95nlidb,\n\ttitle        = {Natural Language Interfaces to Databases -- An Introduction},\n\tauthor       = {I. Androutsopoulos and G. D. Ritchie and P. 
Thanisch},\n\tyear         = 1995,\n\tjournal      = {Journal of Natural Language Engineering},\n\tvolume       = 1,\n\tpages        = {29--81}\n}\n@inproceedings{andrychowicz2016learning,\n\ttitle        = {Learning to learn by gradient descent by gradient descent},\n\tauthor       = {Marcin Andrychowicz and Misha Denil and Sergio Gomez and Matthew W Hoffman and David Pfau and Tom Schaul and Brendan Shillingford and Nando De Freitas},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3981--3989}\n}\n@article{andrychowicz2017hindsight,\n\ttitle        = {Hindsight Experience Replay},\n\tauthor       = {Marcin Andrychowicz and Filip Wolski and Alex Ray and Jonas Schneider and Rachel Fong and Peter Welinder and Bob McGrew and Josh Tobin and Pieter Abbeel and Wojciech Zaremba},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.01495}\n}\n@inproceedings{angeli10generation,\n\ttitle        = {A Simple Domain-Independent Probabilistic Approach to Generation},\n\tauthor       = {Gabor Angeli and Percy Liang and Dan Klein},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{angeli2014combining,\n\ttitle        = {Combining distant and partial supervision for relation extraction},\n\tauthor       = {Gabor Angeli and Julie Tibshirani and Jean Y Wu and Christopher D Manning},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{angeli2014naturalli,\n\ttitle        = {NaturalLI: Natural Logic Inference for Common Sense Reasoning},\n\tauthor       = {Gabor Angeli and Christopher D. 
Manning},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{angeli2015openie,\n\ttitle        = {Leveraging Linguistic Structure for Open Domain Information Extraction},\n\tauthor       = {Gabor Angeli and Melvin Johnson Premkumar and Christopher D. Manning},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{angeli2016naturalli,\n\ttitle        = {Combining Natural Logic and Shallow Reasoning for Question Answering},\n\tauthor       = {Gabor Angeli and Neha Nayak and Christopher D. Manning},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{angluin88queries,\n\ttitle        = {Queries and concept learning},\n\tauthor       = {D. Angluin},\n\tyear         = 1988,\n\tjournal      = {Machine Learning},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {319--342}\n}\n@book{angrist2009econometrics,\n\ttitle        = {Mostly Harmless Econometrics: An Empiricist's Companion},\n\tauthor       = {Joshua D. Angrist and J{\\\"o}rn-Steffen Pischke},\n\tyear         = 2009,\n\tpublisher    = {Princeton University Press}\n}\n@inproceedings{anguita2013har,\n\ttitle        = {A Public Domain Dataset for Human Activity Recognition Using Smartphones},\n\tauthor       = {Davide Anguita and Alessandro Ghio and Luca Oneto and Xavier Parra and Jorge L. Reyes-Ortiz},\n\tyear         = 2013,\n\tbooktitle    = {21st European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN)}\n}\n@article{angwin2016machine,\n\ttitle        = {Machine bias: There’s software used across the country to predict future criminals. 
and it’s biased against blacks},\n\tauthor       = {Julia Angwin and Jeff Larson and Surya Mattu and Lauren Kirchner},\n\tyear         = 2016,\n\tjournal      = {ProPublica},\n\tvolume       = 23\n}\n@article{anstreicher2002improved,\n\ttitle        = {Improved complexity for maximum volume inscribed ellipsoids},\n\tauthor       = {Anstreicher, Kurt M.},\n\tyear         = 2002,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 13,\n\tnumber       = 2,\n\tpages        = {309--320}\n}\n@article{antoniak74dpmix,\n\ttitle        = {Mixtures of {D}irichlet Processes with Applications to {B}ayesian Nonparametric Problems},\n\tauthor       = {C. E. Antoniak},\n\tyear         = 1974,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 2,\n\tpages        = {1152--1174}\n}\n@article{antoniou2017data,\n\ttitle        = {Data augmentation generative adversarial networks},\n\tauthor       = {Antreas Antoniou and Amos Storkey and Harrison Edwards},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.04340}\n}\n@inproceedings{AO-lp-coordinate,\n\ttitle        = {{Nearly-Linear Time Positive LP Solver with Faster Convergence Rate}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Orecchia, Lorenzo},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 47th Annual ACM Symposium on Theory of Computing},\n\tseries       = {STOC~'15}\n}\n@inproceedings{AO-lp-parallel,\n\ttitle        = {Using Optimization to Break the Epsilon Barrier: A Faster and Simpler Width-Independent Algorithm for Solving Positive Linear Programs in Parallel},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Orecchia, Lorenzo},\n\tyear         = 2015,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {Proceedings of the 26th ACM-SIAM Symposium on Discrete Algorithms},\n\tseries       = {SODA~'15},\n\tvolume       = {abs/1407.1925},\n\tbibsource    = {DBLP, 
http://dblp.uni-trier.de}\n}\n@article{AO-survey-nesterov,\n\ttitle        = {Linear Coupling: An Ultimate Unification of Gradient and Mirror Descent},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Orecchia, Lorenzo},\n\tyear         = 2014,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1407.1537},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{apvz14,\n\ttitle        = {Learning polynomials with neural networks},\n\tauthor       = {Andoni, Alexandr and Panigrahy, Rina and Valiant, Gregory and Zhang, Li},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1908--1916}\n}\n@inproceedings{arasu2003extracting,\n\ttitle        = {Extracting structured data from web pages},\n\tauthor       = {Arvind Arasu and Hector Garcia-Molina},\n\tyear         = 2003,\n\tbooktitle    = {ACM SIGMOD international conference on Management of data},\n\tpages        = {337--348}\n}\n@inproceedings{ardila2020common,\n\ttitle        = {Common Voice: A Massively-Multilingual Speech Corpus},\n\tauthor       = {Rosana Ardila and Megan Branson and Kelly Davis and Michael Kohler and Josh Meyer and Michael Henretty and Reuben Morais and Lindsay Saunders and Francis Tyers and Gregor Weber},\n\tyear         = 2020,\n\tbooktitle    = {Language Resources and Evaluation Conference (LREC)},\n\tpages        = {4218--4222}\n}\n@article{arefyev2020lssurvey,\n\ttitle        = {A Comparative Study of Lexical Substitution Approaches based on Neural Language Models},\n\tauthor       = {Nikolay Arefyev and Boris Sheludko and Alexander Podolskiy and Alexander Panchenko},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@article{argall2009survey,\n\ttitle        = {A survey of robot learning from demonstration},\n\tauthor       = {B. Argall and S. Chernova and M. Veloso and B. 
Browning},\n\tyear         = 2009,\n\tjournal      = {RAS},\n\tvolume       = 57\n}\n@article{argall2018autonomy,\n\ttitle        = {Autonomy in rehabilitation robotics: an intersection},\n\tauthor       = {Brenna D Argall},\n\tyear         = 2018,\n\tjournal      = {Annual Review of Control, Robotics, and Autonomous Systems},\n\tvolume       = 1,\n\tpages        = {441--463}\n}\n@inproceedings{argyriou07feature,\n\ttitle        = {Multi-task feature learning},\n\tauthor       = {A. Argyriou and T. Evgeniou and M. Pontil},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {41--48}\n}\n@inproceedings{arikan2002interactive,\n\ttitle        = {Interactive motion generation from examples},\n\tauthor       = {Okan Arikan and D. A. Forsyth},\n\tyear         = 2002,\n\tbooktitle    = {\n\t\tSIGGRAPH '02: Proceedings of the 29th annual conference on Computer\n\n\t\tgraphics and interactive techniques\n\t},\n\tlocation     = {San Antonio, Texas},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, NY, USA},\n\tpages        = {483--490},\n\tdoi          = {http://doi.acm.org/10.1145/566570.566606},\n\tisbn         = {1-58113-521-1}\n}\n@inproceedings{ariola97cyclic,\n\ttitle        = {Cyclic lambda calculi},\n\tauthor       = {Zena M. 
Ariola and Stefan Blom},\n\tyear         = 1997,\n\tbooktitle    = {Theoretical Aspects of Computer Software},\n\tpages        = {77--106}\n}\n@inproceedings{aristidou2008predicting,\n\ttitle        = {\n\t\tPredicting Missing Markers to Drive Real-Time Centre of Rotation\n\n\t\tEstimation\n\t},\n\tauthor       = {Aristidou, Andreas and Cameron, Jonathan and Lasenby, Joan},\n\tyear         = 2008,\n\tbooktitle    = {\n\t\tAMDO '08: Proceedings of the 5th international conference on Articulated\n\n\t\tMotion and Deformable Objects\n\t},\n\tlocation     = {Port d'Andratx, Mallorca, Spain},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {Berlin, Heidelberg},\n\tpages        = {238--247},\n\tdoi          = {http://dx.doi.org/10.1007/978-3-540-70517-8_23},\n\tisbn         = {978-3-540-70516-1}\n}\n@inproceedings{arjovsky2017gan,\n\ttitle        = {Towards Principled Methods for Training Generative Adversarial Networks},\n\tauthor       = {Martin Arjovsky and Leon Bottou},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{arjovsky2019invariant,\n\ttitle        = {Invariant risk minimization},\n\tauthor       = {Arjovsky, Martin and Bottou, L{\\'e}on and Gulrajani, Ishaan and Lopez-Paz, David},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.02893}\n}\n@inproceedings{arlot10penalty,\n\ttitle        = {Data-driven calibration of linear estimators with minimal penalties},\n\tauthor       = {Sylvain Arlot and Francis Bach},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {46--54}\n}\n@phdthesis{armando2008sketch,\n\ttitle        = {Program Synthesis by Sketching},\n\tauthor       = {Armando Solar-Lezama},\n\tyear         = 2008,\n\tschool       = {University of California at Berkeley}\n}\n@techreport{AroLiLiaMaetal15,\n\ttitle        = {A Latent Variable Model Approach to {PMI}-based Word 
Embeddings},\n\tauthor       = {Sanjeev Arora and Yuanzhi Li and Yingyu Liang and Tengyu Ma and Andrej Risteski},\n\tyear         = 2015,\n\tnote         = {\\url{http://arxiv.org/abs/1502.03520}},\n\tinstitution  = {ArXiV}\n}\n@inproceedings{aronson2018eye,\n\ttitle        = {Eye-hand behavior in human-robot shared manipulation},\n\tauthor       = {Reuben M Aronson and Thiago Santini and Thomas C K{\\\"u}bler and Enkelejda Kasneci and Siddhartha Srinivasa and Henny Admoni},\n\tyear         = 2018,\n\tbooktitle    = {ACM/IEEE International Conference on Human Robot Interaction (HRI)},\n\tpages        = {4--13}\n}\n@inproceedings{arora15simple,\n\ttitle        = {Simple, Efficient, and Neural Algorithms for Sparse Coding},\n\tauthor       = {Sanjeev Arora and Rong Ge and Tengyu Ma and Ankur Moitra},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of The 28th Conference on Learning Theory, {COLT} 2015, Paris, France, July 3-6, 2015},\n\tpages        = {113--149},\n\turl          = {http://jmlr.org/proceedings/papers/v40/Arora15.html},\n\tcrossref     = {DBLP:conf/colt/2015},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:13 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/colt/AroraGMM15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tpp           = {113–149}\n}\n@inproceedings{arora16inferencetopic,\n\ttitle        = {Provable Algorithms for Inference in Topic Models},\n\tauthor       = {Sanjeev Arora and Rong Ge and Frederic Koehler and Tengyu Ma and Ankur Moitra},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 33nd International Conference on Machine Learning, {ICML} 2016, New York City, NY, USA, June 19-24, 2016},\n\tpages        = {2859--2867},\n\turl          = {http://jmlr.org/proceedings/papers/v48/arorab16.html},\n\tcrossref     = {DBLP:conf/icml/2016},\n\ttimestamp    = {Tue, 03 Jan 2017 13:40:36 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/AroraGKMM16},\n\tbibsource    
= {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{arora2009interactive,\n\ttitle        = {Interactive annotation learning with indirect feature voting},\n\tauthor       = {Shilpa Arora and Eric Nyberg},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {55--60}\n}\n@inproceedings{arora2012learning,\n\ttitle        = {Learning topic models--going beyond {SVD}},\n\tauthor       = {Sanjeev Arora and Rong Ge and Ankur Moitra},\n\tyear         = 2012,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{Arora2013,\n\ttitle        = {{New Algorithms for Learning Incoherent and Overcomplete Dictionaries}},\n\tauthor       = {{Arora}, S. and {Ge}, R. and {Moitra}, A.},\n\tyear         = 2013,\n\tmonth        = aug,\n\tjournal      = {ArXiv e-prints}\n}\n@inproceedings{arora2013practical,\n\ttitle        = {A practical algorithm for topic modeling with provable guarantees},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Halpern, Yonatan and Mimno, David and Moitra, Ankur and Sontag, David and Wu, Yichen and Zhu, Michael},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {280--288},\n\torganization = {PMLR}\n}\n@article{arora2014more,\n\ttitle        = {More algorithms for provable dictionary learning},\n\tauthor       = {Arora, Sanjeev and Bhaskara, Aditya and Ge, Rong and Ma, Tengyu},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1401.0579}\n}\n@inproceedings{arora2014provable,\n\ttitle        = {Provable bounds for learning some deep representations},\n\tauthor       = {Arora, Sanjeev and Bhaskara, Aditya and Ge, Rong and Ma, Tengyu},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {584--592},\n\turl          = {http://jmlr.org/proceedings/papers/v32/arora14.html},\n\tcrossref     = {DBLP:conf/icml/2014},\n\ttimestamp    = {Sun, 
26 Oct 2014 02:38:30 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/AroraBGM14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{arora2015deep,\n\ttitle        = {Why are deep nets reversible: A simple theory, with implications for training},\n\tauthor       = {Arora, Sanjeev and Liang, Yingyu and Ma, Tengyu},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.05653}\n}\n@article{arora2015rand,\n\ttitle        = {Rand-walk: A latent variable model approach to word embeddings},\n\tauthor       = {Arora, Sanjeev and Li, Yuanzhi and Liang, Yingyu and Ma, Tengyu and Risteski, Andrej},\n\tyear         = 2015,\n\tjournal      = {Transactions of the Association for Computational Linguistics}\n}\n@article{arora2015simple,\n\ttitle        = {Simple, efficient, and neural algorithms for sparse coding},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Ma, Tengyu and Moitra, Ankur},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory},\n\tpublisher    = {Proceedings of Machine Learning Research},\n\tpages        = {113--149}\n}\n@article{arora2016latent,\n\ttitle        = {A latent variable model approach to pmi-based word embeddings},\n\tauthor       = {Arora, Sanjeev and Li, Yuanzhi and Liang, Yingyu and Ma, Tengyu and Risteski, Andrej},\n\tyear         = 2016,\n\tjournal      = {Transactions of the Association for Computational Linguistics},\n\tpublisher    = {MIT Press},\n\tvolume       = 4,\n\tpages        = {385--399}\n}\n@article{arora2016linear,\n\ttitle        = {Linear algebraic structure of word senses, with applications to polysemy},\n\tauthor       = {Arora, Sanjeev and Li, Yuanzhi and Liang, Yingyu and Ma, Tengyu and Risteski, Andrej},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1601.03764}\n}\n@inproceedings{arora2016provable,\n\ttitle        = {Provable Algorithms for Inference in Topic Models},\n\tauthor       = {Arora, Sanjeev and Ge, Rong 
and Koehler, Frederic and Ma, Tengyu and Moitra, Ankur},\n\tyear         = 2016,\n\tbooktitle    = {The 33rd International Conference on Machine Learning (ICML 2016). arXiv preprint arXiv:1605.08491}\n}\n@article{arora2016simple,\n\ttitle        = {A simple but tough-to-beat baseline for sentence embeddings},\n\tauthor       = {Arora, Sanjeev and Liang, Yingyu and Ma, Tengyu},\n\tyear         = 2016,\n\tbooktitle    = {5th International Conference on Learning Representations (ICLR 2017)}\n}\n@article{arora2016understanding,\n\ttitle        = {Understanding deep neural networks with rectified linear units},\n\tauthor       = {Arora, Raman and Basu, Amitabh and Mianjy, Poorya and Mukherjee, Anirbit},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.01491}\n}\n@article{arora2017gan,\n\ttitle        = {Generalization and Equilibrium in Generative Adversarial Nets (GANs)},\n\tauthor       = {Sanjeev Arora and Rong Ge and Yingyu Liang and Tengyu Ma and Yi Zhang},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{arora2017generalization,\n\ttitle        = {Generalization and equilibrium in generative adversarial nets ({GANs})},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Liang, Yingyu and Ma, Tengyu and Zhang, Yi},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{arora2017provable,\n\ttitle        = {Provable learning of noisy-OR networks},\n\tauthor       = {Sanjeev Arora and Rong Ge and Tengyu Ma and Andrej Risteski},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 49th Annual {ACM} {SIGACT} Symposium on Theory of Computing, {STOC} 2017, Montreal, QC, Canada, June 19-23, 2017},\n\tpages        = {1057--1066},\n\tdoi          = {10.1145/3055399.3055482},\n\turl          = {http://doi.acm.org/10.1145/3055399.3055482},\n\tcrossref     = {DBLP:conf/stoc/2017},\n\ttimestamp    = {Sat, 17 Jun 2017 18:46:57 +0200},\n\tbiburl       = 
{http://dblp.uni-trier.de/rec/bib/conf/stoc/Arora0MR17},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{arora2018convergence,\n\ttitle        = {A Convergence Analysis of Gradient Descent for Deep Linear Neural Networks},\n\tauthor       = {Arora, Sanjeev and Cohen, Nadav and Golowich, Noah and Hu, Wei},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.02281}\n}\n@article{arora2018linear,\n\ttitle        = {Linear Algebraic Structure of Word Senses, with Applications to Polysemy},\n\tauthor       = {Sanjeev Arora and Yuanzhi Li and Yingyu Liang and Tengyu Ma and Andrej Risteski},\n\tyear         = 2018,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 6\n}\n@article{arora2018optimization,\n\ttitle        = {On the optimization of deep networks: Implicit acceleration by overparameterization},\n\tauthor       = {Arora, Sanjeev and Cohen, Nadav and Hazan, Elad},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.06509},\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {244--253}\n}\n@article{arora2018stronger,\n\ttitle        = {Stronger generalization bounds for deep nets via a compression approach},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Neyshabur, Behnam and Zhang, Yi},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.05296}\n}\n@article{arora2018theoretical,\n\ttitle        = {Theoretical analysis of auto rate-tuning by batch normalization},\n\tauthor       = {Arora, Sanjeev and Li, Zhiyuan and Lyu, Kaifeng},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.03981}\n}\n@article{arora2019exact,\n\ttitle        = {On exact computation with an infinitely wide neural net},\n\tauthor       = {Arora, Sanjeev and Du, Simon S and Hu, Wei and Li, Zhiyuan and Salakhutdinov, Ruslan and Wang, Ruosong},\n\tyear         = 2019,\n\tjournal      = {arXiv 
preprint arXiv:1904.11955},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 32,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2019/file/dbc4d84bfcfe2284ba11beffb853a8c4-Paper.pdf},\n\teditor       = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett}\n}\n@article{arora2019fine,\n\ttitle        = {Fine-grained analysis of optimization and generalization for overparameterized two-layer neural networks},\n\tauthor       = {Arora, Sanjeev and Du, Simon S and Hu, Wei and Li, Zhiyuan and Wang, Ruosong},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.08584},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {322--332},\n\torganization = {PMLR}\n}\n@inproceedings{arora2019implicit,\n\ttitle        = {Implicit regularization in deep matrix factorization},\n\tauthor       = {Arora, Sanjeev and Cohen, Nadav and Hu, Wei and Luo, Yuping},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {7411--7422}\n}\n@inproceedings{arora2019theoretical,\n\ttitle        = {A theoretical analysis of contrastive unsupervised representation learning},\n\tauthor       = {Arora, Sanjeev and Khandeparkar, Hrishikesh and Khodak, Mikhail and Plevrakis, Orestis and Saunshi, Nikunj},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.09229},\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@article{arora2020dropout,\n\ttitle        = {Dropout: Explicit Forms and Capacity Control},\n\tauthor       = {Arora, Raman and Bartlett, Peter and Mianjy, Poorya and Srebro, Nathan},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.03397}\n}\n@inproceedings{arora2020harnessing,\n\ttitle        = {Harnessing the Power of Infinitely Wide Deep Nets on Small-data 
Tasks},\n\tauthor       = {Sanjeev Arora and Simon S. Du and Zhiyuan Li and Ruslan Salakhutdinov and Ruosong Wang and Dingli Yu},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=rkl8sJBYvH}\n}\n@inproceedings{arora2020provable,\n\ttitle        = {Provable representation learning for imitation learning via bi-level optimization},\n\tauthor       = {Arora, Sanjeev and Du, Simon and Kakade, Sham and Luo, Yuping and Saunshi, Nikunj},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {367--376},\n\torganization = {PMLR}\n}\n@book{AroraBarak,\n\ttitle        = {Computational Complexity - {A} Modern Approach},\n\tauthor       = {Sanjeev Arora and Boaz Barak},\n\tyear         = 2009,\n\tpublisher    = {Cambridge University Press},\n\tisbn         = {978-0-521-42426-4},\n\turl          = {http://www.cambridge.org/catalogue/catalogue.asp?isbn=9780521424264},\n\ttimestamp    = {Mon, 29 Sep 2014 03:39:22 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/books/daglib/0023084},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{AroraGHMMSWZ13,\n\ttitle        = {A Practical Algorithm for Topic Modeling with Provable Guarantees},\n\tauthor       = {Sanjeev Arora and Rong Ge and Yonatan Halpern and David M. 
Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 30th International Conference on Machine Learning, {ICML} 2013, Atlanta, GA, USA, 16-21 June 2013},\n\tpages        = {280--288}\n}\n@inproceedings{AroraGM14,\n\ttitle        = {New Algorithms for Learning Incoherent and Overcomplete Dictionaries},\n\tauthor       = {Sanjeev Arora and Rong Ge and Ankur Moitra},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tbooktitle    = {Proceedings of The 27th Conference on Learning Theory, {COLT} 2014, Barcelona, Spain, June 13-15, 2014},\n\tvolume       = {abs/1308.6273},\n\tpages        = {779--806},\n\turl          = {http://jmlr.org/proceedings/papers/v35/arora14.html},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://arxiv.org/abs/1308.6273},\n\tcrossref     = {DBLP:conf/colt/2014},\n\ttimestamp    = {Sun, 26 Oct 2014 02:37:38 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/colt/AroraGM14}\n}\n@inproceedings{AroraKale2007,\n\ttitle        = {{A combinatorial, primal-dual approach to semidefinite programs}},\n\tauthor       = {Arora, Sanjeev and Kale, Satyen},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the thirty-ninth annual ACM symposium on Theory of computing - STOC '07},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 227,\n\tdoi          = {10.1145/1250790.1250823},\n\tisbn         = 9781595936318,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Arora, Kale - 2007 - A combinatorial, primal-dual approach to semidefinite programs.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight,Algorithms/Multiplicative Weight/SDP}\n}\n@article{AroraKannan:Mixtures,\n\ttitle        = {LEARNING MIXTURES OF SEPARATED NONSPHERICAL GAUSSIANS},\n\tauthor       = {Sanjeev Arora and Ravi Kannan},\n\tyear         = 2005,\n\tjournal      = {The Annals of 
Applied Probability},\n\tvolume       = 15,\n\tnumber       = {1A},\n\tpages        = {69--92}\n}\n@inproceedings{arpit2017memorization,\n\ttitle        = {A Closer Look at Memorization in Deep Networks},\n\tauthor       = {Devansh Arpit and Stanislaw Jastrzebski and Nicolas Ballas and David Krueger and Emmanuel Bengio and Maxinder S. Kanwal and Tegan Maharaj and Asja Fischer and Aaron Courville and Yoshua Bengio and Simon Lacoste-Julien},\n\tyear         = 2017,\n\tmonth        = {06--11 Aug},\n\tbooktitle    = {Proceedings of the 34th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 70,\n\tpages        = {233--242},\n\turl          = {https://proceedings.mlr.press/v70/arpit17a.html},\n\teditor       = {Precup, Doina and Teh, Yee Whye},\n\tpdf          = {http://proceedings.mlr.press/v70/arpit17a/arpit17a.pdf}\n}\n@article{arpit2019benefits,\n\ttitle        = {The Benefits of Over-parameterization at Initialization in Deep ReLU Networks},\n\tauthor       = {Arpit, Devansh and Bengio, Yoshua},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.03611}\n}\n@inproceedings{arrieta2018should,\n\ttitle        = {Should We Treat Data as Labor? 
{Moving} beyond ``Free''},\n\tauthor       = {Imanol Arrieta-Ibarra and Leonard Goff and Diego Jim{\\'e}nez-Hern{\\'a}ndez and Jaron Lanier and E Glen Weyl},\n\tyear         = 2018,\n\tbooktitle    = {American Economic Association Papers and Proceedings},\n\tvolume       = 108,\n\tpages        = {38--42}\n}\n@article{arrow1973theory,\n\ttitle        = {The theory of discrimination},\n\tauthor       = {Kenneth Arrow},\n\tyear         = 1973,\n\tjournal      = {Discrimination in labor markets},\n\tvolume       = 3,\n\tnumber       = 10,\n\tpages        = {3--33}\n}\n@article{arslan2017decentralized,\n\ttitle        = {Decentralized {Q}-learning for stochastic teams and games},\n\tauthor       = {Arslan, G{\\\"u}rdal and Y{\\\"u}ksel, Serdar},\n\tyear         = 2017,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tpublisher    = {IEEE},\n\tvolume       = 62,\n\tnumber       = 4,\n\tpages        = {1545--1558}\n}\n@article{artemiadis2010emg,\n\ttitle        = {{EMG}-based control of a robot arm using low-dimensional embeddings},\n\tauthor       = {Panagiotis K Artemiadis and Kostas J Kyriakopoulos},\n\tyear         = 2010,\n\tjournal      = {IEEE Transactions on Robotics (T-RO)},\n\tvolume       = 26,\n\tpages        = {393--398}\n}\n@article{artetxe2017nmt,\n\ttitle        = {Unsupervised Neural Machine Translation},\n\tauthor       = {Mikel Artetxe and Gorka Labaka and Eneko Agirre and Kyunghyun Cho},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.11041}\n}\n@article{artetxe2018unsupervised,\n\ttitle        = {Unsupervised statistical machine translation},\n\tauthor       = {Mikel Artetxe and Gorka Labaka and Eneko Agirre},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.01272}\n}\n@inproceedings{artzi11conversations,\n\ttitle        = {Bootstrapping Semantic Parsers from Conversations},\n\tauthor       = {Yoav Artzi and Luke Zettlemoyer},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural 
Language Processing (EMNLP)},\n\tpages        = {421--432}\n}\n@article{artzi2013uw,\n\ttitle        = {{UW} {SPF}: The {U}niversity of {W}ashington Semantic Parsing Framework},\n\tauthor       = {Yoav Artzi and Luke Zettlemoyer},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1311.3011}\n}\n@article{artzi2013weakly,\n\ttitle        = {Weakly supervised learning of semantic parsers for mapping instructions to actions},\n\tauthor       = {Yoav Artzi and Luke Zettlemoyer},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1,\n\tpages        = {49--62}\n}\n@inproceedings{artzi2015broad,\n\ttitle        = {Broad-coverage {CCG} Semantic Parsing with {AMR}},\n\tauthor       = {Yoav Artzi and Kenton Lee and Luke Zettlemoyer},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{arulampalam2002tutorial,\n\ttitle        = {A tutorial on particle filters for on-line non-linear/non-{G}aussian {B}ayesian tracking},\n\tauthor       = {Sanjeev Arulampalam and Simon Maskell and Neil Gordon and Tim Clapp},\n\tyear         = 2002,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 50,\n\tnumber       = 2,\n\tpages        = {174--188}\n}\n@inproceedings{arumugam2017accurately,\n\ttitle        = {Accurately and Efficiently Interpreting Human-Robot Instructions of Varying Granularities},\n\tauthor       = {Dilip Arumugam and Siddharth Karamcheti and Nakul Gopalan and Lawson L. S. 
Wong and Stefanie Tellex},\n\tyear         = 2017,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{ARV,\n\ttitle        = {Expander flows, geometric embeddings and graph partitioning},\n\tauthor       = {Arora, Sanjeev and Rao, Satish and Vazirani, Umesh},\n\tyear         = 2009,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 56,\n\tnumber       = 2,\n\tpages        = 5\n}\n@article{ARV09,\n\ttitle        = {Expander flows, geometric embeddings and graph partitioning},\n\tauthor       = {Sanjeev Arora and Satish Rao and Umesh V. Vazirani},\n\tyear         = 2009,\n\tjournal      = {Journal of the ACM},\n\tvolume       = 56,\n\tnumber       = 2,\n\tee           = {http://doi.acm.org/10.1145/1502793.1502794},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{arxivCohenKKPPRS18,\n\ttitle        = {Solving Directed Laplacian Systems in Nearly-Linear Time through Sparse {LU} Factorizations},\n\tauthor       = {Michael B. Cohen and Jonathan A. Kelner and Rasmus Kyng and John Peebles and Richard Peng and Anup B. Rao and Aaron Sidford},\n\tyear         = 2018,\n\tjournal      = {CoRR},\n\tbooktitle    = {59th {IEEE} Annual Symposium on Foundations of Computer Science, {FOCS} 2018, Paris, France, October 7-9, 2018},\n\tvolume       = {abs/1811.10722},\n\tpages        = {898--909}\n}\n@article{arxivCohenKPPSV16,\n\ttitle        = {Faster Algorithms for Computing the Stationary Distribution, Simulating Random Walks, and More},\n\tauthor       = {Michael B. Cohen and Jonathan A. 
Kelner and John Peebles and Richard Peng and Aaron Sidford and Adrian Vladu},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tbooktitle    = {{IEEE} 57th Annual Symposium on Foundations of Computer Science, {FOCS} 2016, 9-11 October 2016, Hyatt Regency, New Brunswick, New Jersey, {USA}},\n\tvolume       = {abs/1608.03270},\n\tpages        = {583--592}\n}\n@inproceedings{arzate2020survey,\n\ttitle        = {A survey on interactive reinforcement learning: Design principles and open challenges},\n\tauthor       = {Christian Arzate Cruz and Takeo Igarashi},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the 2020 ACM Designing Interactive Systems Conference},\n\tpages        = {1195--1209}\n}\n@inproceedings{Asadpour2010,\n\ttitle        = {{An $O(\\log n / \\log \\log n )$-approximation Algorithm for the Asymmetric Traveling Salesman Problem}},\n\tauthor       = {Asadpour, Arash and Goemans, Michel X. and Mądry, Aleksander and Gharan, Shayan Oveis and Saberi, Amin},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms - SODA '10},\n\tpages        = {379--389},\n\tisbn         = {0001405101},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Asadpour et al. 
- 2010 - An O ( log n log log n ) -approximation Algorithm for the Asymmetric Traveling Salesman Problem.pdf:pdf},\n\tmendeley-groups = {Algorithms/Traveling Salesman}\n}\n@inproceedings{asher2016catan,\n\ttitle        = {Discourse Structure and Dialogue Acts in Multiparty Dialogue: the {STAC} Corpus},\n\tauthor       = {Nicholas Asher and Julie Hunter and Mathieu Morey and Farah Benamara and Stergos Afantenos},\n\tyear         = 2016,\n\tbooktitle    = {Language Resources and Evaluation Conference (LREC)}\n}\n@inproceedings{ashok2014wizard,\n\ttitle        = {Wizard-of-{O}z evaluation of speech-driven web browsing interface for people with vision impairments},\n\tauthor       = {Vikas Ashok and Yevgen Borodin and Svetlana Stoyanchev and Yury Puzis and I. V. Ramakrishnan},\n\tyear         = 2014,\n\tbooktitle    = {Web for All Conference}\n}\n@article{ashtiani2017sample,\n\ttitle        = {Sample-Efficient Learning of Mixtures},\n\tauthor       = {Hassan Ashtiani and Shai Ben-David and Abbas Mehrabian},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{aslam2006statistical,\n\ttitle        = {A statistical method for system evaluation using incomplete judgments},\n\tauthor       = {Javed A. 
Aslam and Virgil Pavlu and Emine Yilmaz},\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)},\n\tpages        = {541--548}\n}\n@inproceedings{asm08,\n\ttitle        = {Fitted {Q}-iteration in continuous action-space MDPs},\n\tauthor       = {Antos, Andr{\\'a}s and Szepesv{\\'a}ri, Csaba and Munos, R{\\'e}mi},\n\tyear         = 2008,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {9--16}\n}\n@article{asm08a,\n\ttitle        = {Learning near-optimal policies with Bellman-residual minimization based fitted policy iteration and a single sample path},\n\tauthor       = {Antos, Andr{\\'a}s and Szepesv{\\'a}ri, Csaba and Munos, R{\\'e}mi},\n\tyear         = 2008,\n\tjournal      = {Machine Learning},\n\tpublisher    = {Springer},\n\tvolume       = 71,\n\tnumber       = 1,\n\tpages        = {89--129}\n}\n@article{assouad1983deux,\n\ttitle        = {Deux remarques sur l'estimation},\n\tauthor       = {Patrice Assouad},\n\tyear         = 1983,\n\tjournal      = {Comptes rendus des s{\\'e}ances de l'Acad{\\'e}mie des sciences. S{\\'e}rie 1, Math{\\'e}matique},\n\tvolume       = 296,\n\tnumber       = 23,\n\tpages        = {1021--1024}\n}\n@incollection{asuncion2011distributed,\n\ttitle        = {Distributed Gibbs Sampling for Latent Variable Models},\n\tauthor       = {Asuncion, A. and Smyth, P. and Welling, M. and Newman, D. and Porteous, I. 
and Triglia, S.},\n\tyear         = 2011,\n\tbooktitle    = {Scaling Up Machine Learning: Parallel and Distributed Approaches},\n\tpublisher    = {Cambridge Univ Pr}\n}\n@article{athalye2017synthesizing,\n\ttitle        = {Synthesizing robust adversarial examples},\n\tauthor       = {Anish Athalye and Ilya Sutskever},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.07397}\n}\n@inproceedings{athalye2018obfuscated,\n\ttitle        = {Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples},\n\tauthor       = {Anish Athalye and Nicholas Carlini and David Wagner},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{athanasopoulou2014low,\n\ttitle        = {Low-Dimensional Manifold Distributional Semantic Models},\n\tauthor       = {Georgia Athanasopoulou and Elias Iosif and Alexandros Potamianos},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {731--740}\n}\n@article{athey2015measure,\n\ttitle        = {A measure of robustness to misspecification},\n\tauthor       = {Susan Athey and Guido Imbens},\n\tyear         = 2015,\n\tjournal      = {The American Economic Review},\n\tvolume       = 105,\n\tnumber       = 5,\n\tpages        = {476--480}\n}\n@article{athreya1978new,\n\ttitle        = {A new approach to the limit theory of recurrent {M}arkov chains},\n\tauthor       = {Krishna B Athreya and P Ney},\n\tyear         = 1978,\n\tjournal      = {Transactions of the American Mathematical Society},\n\tvolume       = 245,\n\tpages        = {493--501}\n}\n@inproceedings{atkeson1997robot,\n\ttitle        = {Robot learning from demonstration},\n\tauthor       = {Christopher G Atkeson and Stefan Schaal},\n\tyear         = 1997,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tvolume       = 97,\n\tpages        = 
{12--20}\n}\n@inproceedings{attenberg2010why,\n\ttitle        = {Why Label when you can Search? Alternatives to Active Learning for Applying Human Resources to Build Classification Models Under Extreme Class Imbalance},\n\tauthor       = {Josh Attenberg and Foster Provost},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@article{attene2013tox21,\n\ttitle        = {The Tox21 robotic platform for the assessment of environmental chemicals--from vision to reality},\n\tauthor       = {Matias S Attene-Ramos and Nicole Miller and Ruili Huang and Sam Michael and Misha Itkin and Robert J Kavlock and Christopher P Austin and Paul Shinn and Anton Simeonov and Raymond R Tice and others},\n\tyear         = 2013,\n\tjournal      = {Drug discovery today},\n\tvolume       = 18,\n\tnumber       = 15,\n\tpages        = {716--723}\n}\n@inproceedings{attias2019improved,\n\ttitle        = {Improved Generalization Bounds for Robust Learning},\n\tauthor       = {Idan Attias and Aryeh Kontorovich and Yishay Mansour},\n\tyear         = 2019,\n\tbooktitle    = {Algorithmic Learning Theory},\n\tpages        = {162--183}\n}\n@article{attouch2010proximal,\n\ttitle        = {Proximal alternating minimization and projection methods for nonconvex problems: An approach based on the Kurdyka-{\\L}ojasiewicz inequality},\n\tauthor       = {Attouch, H{\\'e}dy and Bolte, J{\\'e}r{\\^o}me and Redont, Patrick and Soubeyran, Antoine},\n\tyear         = 2010,\n\tjournal      = {Mathematics of operations research},\n\tpublisher    = {INFORMS},\n\tvolume       = 35,\n\tnumber       = 2,\n\tpages        = {438--457}\n}\n@article{atzmon2016compositions,\n\ttitle        = {Learning to generalize to new compositions in image understanding},\n\tauthor       = {Yuval Atzmon and Jonathan Berant and Vahid Kezami and Amir Globerson and Gal Chechik},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 
arXiv:1608.07639}\n}\n@article{aubin2021linear,\n\ttitle        = {Linear unit-tests for invariance discovery},\n\tauthor       = {Aubin, Benjamin and S{\\l}owik, Agnieszka and Arjovsky, Martin and Bottou, Leon and Lopez-Paz, David},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.10867}\n}\n@article{audibert2009exploration,\n\ttitle        = {Exploration--exploitation tradeoff using variance estimates in multi-armed bandits},\n\tauthor       = {Jean-Yves Audibert and R{\\'e}mi Munos and Csaba Szepesv{\\'a}ri},\n\tyear         = 2009,\n\tjournal      = {Theoretical Computer Science},\n\tvolume       = 410,\n\tnumber       = 19,\n\tpages        = {1876--1902}\n}\n@article{audibert2011minimax,\n\ttitle        = {Minimax Policies for Combinatorial Prediction Games},\n\tauthor       = {Audibert, Jean-Yves and Bubeck, S{\\'e}bastien and Lugosi, G{\\'a}bor},\n\tyear         = 2011,\n\tjournal      = {Proceedings of COLT 2011}\n}\n@article{auer02nonstochastic,\n\ttitle        = {The Nonstochastic Multiarmed Bandit Problem},\n\tauthor       = {Peter Auer and Nicol\\`{o} {Cesa-Bianchi} and Yoav Freund and Robert E. Schapire},\n\tyear         = 2002,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 32,\n\tnumber       = 1,\n\tpages        = {48--77}\n}\n@inproceedings{Auer1995,\n\ttitle        = {{Gambling in a rigged casino: The adversarial multi-armed bandit problem}},\n\tauthor       = {Auer, Peter and {Cesa-Bianchi}, Nicol\\`{o} and Freund, Yoav and Schapire, Robert E.},\n\tyear         = 1995,\n\tbooktitle    = {Proceedings of IEEE 36th Annual Foundations of Computer Science},\n\tpublisher    = {IEEE Comput. Soc. 
Press},\n\tpages        = {322--331},\n\tdoi          = {10.1109/SFCS.1995.492488},\n\tisbn         = {0-8186-7183-1},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Unknown - Unknown - Gambling in a rigged casino The adversarial multi-armed bandit problem.pdf:pdf},\n\tmendeley-groups = {Optimization/Bandit}\n}\n@inproceedings{auer1995gambling,\n\ttitle        = {Gambling in a rigged casino: The adversarial multi-armed bandit problem},\n\tauthor       = {P. Auer and N. Cesa-Bianchi and Y. Freund and R. E. Schapire},\n\tyear         = 1995,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {322--331}\n}\n@article{auer1996exponentially,\n\ttitle        = {Exponentially many local minima for single neurons},\n\tauthor       = {Auer, Peter and Herbster, Mark and Warmuth, Manfred K and others},\n\tyear         = 1996,\n\tjournal      = {Advances in neural information processing systems},\n\tpublisher    = {Citeseer},\n\tpages        = {316--322}\n}\n@inproceedings{auer1997multiple,\n\ttitle        = {On Learning From Multi-Instance Examples: Empirical Evaluation of a Theoretical Approach},\n\tauthor       = {Peter Auer},\n\tyear         = 1997,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {21--29}\n}\n@article{auer2002finite,\n\ttitle        = {Finite-time analysis of the multiarmed bandit problem},\n\tauthor       = {Peter Auer and Nicolo Cesa-Bianchi and Paul Fischer},\n\tyear         = 2002,\n\tjournal      = {Machine learning},\n\tvolume       = 47,\n\tnumber       = 2,\n\tpages        = {235--256}\n}\n@article{Auer2002nonstochastic,\n\ttitle        = {The nonstochastic multiarmed bandit problem},\n\tauthor       = {Auer, Peter and Cesa-Bianchi, Nicolo and Freund, Yoav and Schapire, Robert E},\n\tyear         = 2002,\n\tjournal      = {SIAM journal on computing},\n\tpublisher    = {SIAM},\n\tvolume       = 
32,\n\tnumber       = 1,\n\tpages        = {48--77}\n}\n@article{Auer2002stochastic,\n\ttitle        = {{Finite-time analysis of the multiarmed bandit problem}},\n\tauthor       = {Auer, Peter and {Cesa-Bianchi}, Nicol\\`{o} and Fischer, Paul},\n\tyear         = 2002,\n\tjournal      = {Machine Learning},\n\tvolume       = 47,\n\tnumber       = {2-3},\n\tpages        = {235--256},\n\tdoi          = {10.1023/A:1013689704352},\n\tannote       = {\n\t\tThis is for the case when there is a fixed (but unknown) distribution where the feedbacks are generated.\n\n\t\tIt is different from the other type of bandit work where there is no distribution.\n\t},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Auer, Cesa-Bianchi, Fischer - 2002 - Finite-time analysis of the multiarmed bandit problem.pdf:pdf},\n\tmendeley-groups = {Optimization/Bandit}\n}\n@inproceedings{auer2007dbpedia,\n\ttitle        = {{DB}pedia: A nucleus for a web of open data},\n\tauthor       = {Sören Auer and Christian Bizer and Georgi Kobilarov and Jens Lehmann and Richard Cyganiak and Zachary G. 
Ives},\n\tyear         = 2007,\n\tbooktitle    = {International semantic web conference and Asian semantic web conference (ISWC/ASWC)},\n\tpages        = {722--735}\n}\n@inproceedings{auer2007logarithmic,\n\ttitle        = {Logarithmic online regret bounds for undiscounted reinforcement learning},\n\tauthor       = {Auer, Peter and Ortner, Ronald},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {49--56}\n}\n@article{auffinger2013complexity,\n\ttitle        = {Complexity of random smooth functions on the high-dimensional sphere},\n\tauthor       = {Auffinger, Antonio and Arous, Gerard Ben and others},\n\tyear         = 2013,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 41,\n\tnumber       = 6,\n\tpages        = {4214--4247}\n}\n@article{auffinger2013random,\n\ttitle        = {Random matrices and complexity of spin glasses},\n\tauthor       = {Auffinger, Antonio and Arous, G{\\'e}rard Ben and {\\v{C}}ern{\\`y}, Ji{\\v{r}}{\\'\\i}},\n\tyear         = 2013,\n\tjournal      = {Communications on Pure and Applied Mathematics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 66,\n\tnumber       = 2,\n\tpages        = {165--201}\n}\n@inproceedings{auli2011efficient,\n\ttitle        = {Efficient {CCG} parsing: A* versus adaptive supertagging},\n\tauthor       = {Michael Auli and Adam Lopez},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{aumann1995backward,\n\ttitle        = {Backward induction and common knowledge of rationality},\n\tauthor       = {Robert J Aumann},\n\tyear         = 1995,\n\tjournal      = {Games and Economic Behavior},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {6--19}\n}\n@article{austin2008exchangeable,\n\ttitle        = {On exchangeable random variables and the statistics of large graphs and hypergraphs},\n\tauthor       = 
{T. Austin},\n\tyear         = 2008,\n\tjournal      = {Probab. Survey},\n\tvolume       = 5,\n\tpages        = {80--145}\n}\n@book{austin62do,\n\ttitle        = {How to do Things with Words: The {W}illiam {J}ames Lectures delivered at {H}arvard University in 1955},\n\tauthor       = {John Langshaw Austin},\n\tyear         = 1962,\n\tpublisher    = {Oxford}\n}\n@article{aviv2017human,\n\ttitle        = {The human cell atlas},\n\tauthor       = {Regev Aviv and Sarah A Teichmann and Eric S Lander and Amit Ido and Benoist Christophe and Birney Ewan and Bodenmiller Bernd and Peter Campbell and Carninci Piero and Clatworthy Menna and others},\n\tyear         = 2017,\n\tjournal      = {Elife},\n\tvolume       = 6\n}\n@article{avsec2019deep,\n\ttitle        = {Deep learning at base-resolution reveals motif syntax of the cis-regulatory code},\n\tauthor       = {{\\v{Z}}iga Avsec and Melanie Weilert and Avanti Shrikumar and Amr Alexandari and Sabrina Krueger and Khyati Dalal and Robin Fropf and Charles McAnany and Julien Gagneur and Anshul Kundaje and Julia Zeitlinger},\n\tyear         = 2019,\n\tjournal      = {bioRxiv}\n}\n@article{awasthi2012improved,\n\ttitle        = {Improved spectral-norm bounds for clustering},\n\tauthor       = {Pranjal Awasthi and Or Sheffet},\n\tyear         = 2012,\n\tjournal      = {Approximation, Randomization, and Combinatorial Optimization},\n\tpages        = {37--49}\n}\n@inproceedings{awasthi2013learning,\n\ttitle        = {Learning Using Local Membership Queries},\n\tauthor       = {Pranjal Awasthi and Vitaly Feldman and Varun Kanade},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {398--431}\n}\n@inproceedings{awasthi2014learning,\n\ttitle        = {Learning mixtures of ranking models},\n\tauthor       = {Awasthi, Pranjal and Blum, Avrim and Sheffet, Or and Vijayaraghavan, Aravindan},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing 
Systems},\n\tpages        = {2609--2617}\n}\n@inproceedings{awasthi2014power,\n\ttitle        = {The power of localization for efficiently learning linear separators with noise},\n\tauthor       = {Pranjal Awasthi and Maria Florina Balcan and Philip M. Long},\n\tyear         = 2014,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {449--458}\n}\n@inproceedings{awerbuch2004adaptive,\n\ttitle        = {Adaptive routing with end-to-end feedback: Distributed learning and geometric approaches},\n\tauthor       = {Awerbuch, Baruch and Kleinberg, Robert D},\n\tyear         = 2004,\n\tbooktitle    = {Proceedings of the thirty-sixth annual ACM symposium on Theory of computing},\n\tpages        = {45--53},\n\torganization = {ACM}\n}\n@article{Awerbuch2008,\n\ttitle        = {{Stateless distributed gradient descent for positive linear programs}},\n\tauthor       = {Awerbuch, Baruch and Khandekar, Rohit},\n\tyear         = 2008,\n\tjournal      = {Proceedings of the fortieth annual ACM symposium on Theory of computing - STOC 08},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 691,\n\tdoi          = {10.1145/1374376.1374476},\n\tisbn         = 9781605580470,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Awerbuch, Khandekar - 2008 - Stateless distributed gradient descent for positive linear programs.pdf:pdf},\n\tkeywords     = {convergence,distributed and stateless algorithms,fast,gradient descent,linear programming},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@inproceedings{AwerbuchAzarKhandekar2008soda,\n\ttitle        = {Fast Load Balancing via Bounded Best Response},\n\tauthor       = {Awerbuch, Baruch and Azar, Yossi and Khandekar, Rohit},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the Nineteenth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tlocation     = {San Francisco, California},\n\tpublisher    = {Society for Industrial 
and Applied Mathematics},\n\taddress      = {Philadelphia, PA, USA},\n\tseries       = {SODA '08},\n\tpages        = {314--322},\n\tnumpages     = 9,\n\tacmid        = 1347117\n}\n@incollection{AwerbuchKhandekar2008latin,\n\ttitle        = {Stateless near optimal flow control with poly-logarithmic convergence},\n\tauthor       = {Awerbuch, Baruch and Khandekar, Rohit},\n\tyear         = 2008,\n\tbooktitle    = {LATIN 2008: Theoretical Informatics},\n\tpublisher    = {Springer},\n\tpages        = {580--592}\n}\n@article{AwerbuchKhandekar2009DistributedComputing,\n\ttitle        = {Greedy distributed optimization of multi-commodity flows},\n\tauthor       = {Awerbuch, Baruch and Khandekar, Rohit},\n\tyear         = 2009,\n\tjournal      = {Distributed Computing},\n\tpublisher    = {Springer-Verlag},\n\tvolume       = 21,\n\tnumber       = 5,\n\tpages        = {317--329},\n\tdoi          = {10.1007/s00446-008-0074-0},\n\tissn         = {0178-2770},\n\tkeywords     = {Multi-commodity flows; Distributed algorithms; Statelessness; Self-stabilization}\n}\n@article{AwerbuchKR2012,\n\ttitle        = {{Distributed algorithms for multicommodity flow problems via approximate steepest descent framework}},\n\tauthor       = {Awerbuch, Baruch and Khandekar, Rohit and Rao, Satish},\n\tyear         = 2012,\n\tmonth        = dec,\n\tjournal      = {ACM Transactions on Algorithms},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {1--14},\n\tdoi          = {10.1145/2390176.2390179},\n\tissn         = 15496325,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Awerbuch, Khandekar, Rao - 2012 - Distributed algorithms for multicommodity flow problems via approximate steepest descent framework.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/Flow}\n}\n@inproceedings{AY2015-coord,\n\ttitle        = {Even Faster Accelerated Coordinate Descent Using Non-Uniform Sampling},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Richt\\'arik, Peter and 
Qu, Zheng and Yuan, Yang},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{AY2015-univr,\n\ttitle        = {{Improved SVRG for Non-Strongly-Convex or Sum-of-Non-Convex Objectives}},\n\tauthor       = {{Allen-Zhu}, Zeyuan and Yuan, Yang},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{aydemir2011search,\n\ttitle        = {Search in the real world: Active visual object search based on spatial relations},\n\tauthor       = {A. Aydemir and K. Sjoo and J. Folkesson and A. Pronobis and P. Jensfelt},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {2818--2824}\n}\n@inproceedings{ayoub2020model,\n\ttitle        = {Model-Based Reinforcement Learning with Value-Targeted Regression},\n\tauthor       = {Ayoub, Alex and Jia, Zeyu and Szepesvari, Csaba and Wang, Mengdi and Yang, Lin F},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the 37th International Conference on Machine Learning}\n}\n@article{aytar2018playing,\n\ttitle        = {Playing hard exploration games by watching YouTube},\n\tauthor       = {Y. Aytar and T. Pfaff and D Budden and T. L. Paine and Z. Wang and N. 
de Freitas},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.11592}\n}\n@article{azar2011reinforcement,\n\ttitle        = {Reinforcement learning with a near optimal rate of convergence},\n\tauthor       = {Azar, Mohammad Gheshlaghi and Munos, R{\\'e}mi and Ghavamzadeh, Mohammad and Kappen, Hilbert},\n\tyear         = 2011\n}\n@inproceedings{azar2011speedy,\n\ttitle        = {Speedy Q-learning},\n\tauthor       = {Azar, Mohammad Gheshlaghi and Munos, Remi and Ghavamzadeh, Mohammad and Kappen, Hilbert},\n\tyear         = 2011,\n\tbooktitle    = {Advances in neural information processing systems}\n}\n@article{azar2012sample,\n\ttitle        = {On the sample complexity of reinforcement learning with a generative model},\n\tauthor       = {Azar, Mohammad Gheshlaghi and Munos, R{\\'e}mi and Kappen, Bert},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.6461}\n}\n@book{azar2012theory,\n\ttitle        = {On the theory of reinforcement learning: methods, convergence analysis and sample complexity},\n\tauthor       = {Azar, Mohammad Gheshlaghi},\n\tyear         = 2012,\n\tpublisher    = {UB Nijmegen [host]}\n}\n@article{azar2013minimax,\n\ttitle        = {Minimax {PAC} bounds on the sample complexity of reinforcement learning with a generative model},\n\tauthor       = {Azar, Mohammad Gheshlaghi and Munos, R{\\'e}mi and Kappen, Hilbert J},\n\tyear         = 2013,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 91,\n\tnumber       = 3,\n\tpages        = {325--349}\n}\n@inproceedings{azar2017minimax,\n\ttitle        = {Minimax regret bounds for reinforcement learning},\n\tauthor       = {Azar, Mohammad Gheshlaghi and Osband, Ian and Munos, R{\\'e}mi},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 34th International Conference on Machine Learning},\n\tpages        = {263--272}\n}\n@inproceedings{azaria2016instructable,\n\ttitle        = {Instructable Intelligent Personal 
Agent},\n\tauthor       = {Amos Azaria and Jayant Krishnamurthy and Tom M. Mitchell},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {2681--2689}\n}\n@article{azizi2021big,\n\ttitle        = {Big self-supervised models advance medical image classification},\n\tauthor       = {Shekoofeh Azizi and Basil Mustafa and Fiona Ryan and Zachary Beaver and Jan Freyberg and Jonathan Deaton and Aaron Loh and Alan Karthikesalingam and Simon Kornblith and Ting Chen and others},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.05224}\n}\n@article{azizyan2013density,\n\ttitle        = {Density-sensitive semisupervised inference},\n\tauthor       = {Azizyan, Martin and Singh, Aarti and Wasserman, Larry and others},\n\tyear         = 2013,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 41,\n\tnumber       = 2,\n\tpages        = {751--771}\n}\n@article{azizzadenesheli2016contextual,\n\ttitle        = {Reinforcement Learning in Rich-Observation MDPs using Spectral Methods},\n\tauthor       = {Azizzadenesheli, Kamyar and Lazaric, Alessandro and Anandkumar, Animashree},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.03907}\n}\n@article{azizzadenesheli2016reinforcement,\n\ttitle        = {Reinforcement learning of POMDPs using spectral methods},\n\tauthor       = {Azizzadenesheli, Kamyar and Lazaric, Alessandro and Anandkumar, Animashree},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.07764}\n}\n@inproceedings{azizzadenesheli2019reglabel,\n\ttitle        = {Regularized Learning for Domain Adaptation under Label Shifts},\n\tauthor       = {Kamyar Azizzadenesheli and Anqi Liu and Fanny Yang and Animashree Anandkumar},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{azzalini2012some,\n\ttitle        = 
{Some properties of skew-symmetric distributions},\n\tauthor       = {Adelchi Azzalini and Giuliana Regoli},\n\tyear         = 2012,\n\tjournal      = {Annals of the Institute of Statistical Mathematics},\n\tvolume       = 64,\n\tnumber       = 4,\n\tpages        = {857--879}\n}\n@article{b94,\n\ttitle        = {Approximation and estimation bounds for artificial neural networks},\n\tauthor       = {Barron, Andrew R},\n\tyear         = 1994,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 14,\n\tnumber       = 1,\n\tpages        = {115--133}\n}\n@inproceedings{ba2013adaptive,\n\ttitle        = {Adaptive dropout for training deep neural networks},\n\tauthor       = {Jimmy Ba and Brendan Frey},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3084--3092}\n}\n@inproceedings{ba2015multiple,\n\ttitle        = {Multiple object recognition with visual attention},\n\tauthor       = {Jimmy Ba and Volodymyr Mnih and Koray Kavukcuoglu},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{baader2003description,\n\ttitle        = {The description logic handbook: theory, implementation, and applications},\n\tauthor       = {Franz Baader},\n\tyear         = 2003,\n\tpublisher    = {Cambridge University Press}\n}\n@inproceedings{baarslag2016negotiation,\n\ttitle        = {Negotiation as an Interaction Mechanism for Deciding App Permissions},\n\tauthor       = {Tim Baarslag and Alper T. Alan and Richard C. Gomer and Ilaria Liccardi and Helia Marreiros and Enrico Gerding and M. C. 
Schraefel},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@inproceedings{babaioff2009characterizing,\n\ttitle        = {Characterizing truthful multi-armed bandit mechanisms},\n\tauthor       = {Babaioff, Moshe and Sharma, Yogeshwer and Slivkins, Aleksandrs},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 10th ACM conference on Electronic commerce},\n\tpages        = {79--88},\n\torganization = {ACM}\n}\n@inproceedings{babenko2009visual,\n\ttitle        = {Visual tracking with online multiple instance learning},\n\tauthor       = {Boris Babenko and Ming-Hsuan Yang and Serge Belongie},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {983--990}\n}\n@inproceedings{bacchus1996rewarding,\n\ttitle        = {Rewarding behaviors},\n\tauthor       = {Bacchus, Fahiem and Boutilier, Craig and Grove, Adam},\n\tyear         = 1996,\n\tbooktitle    = {Proceedings of the National Conference on Artificial Intelligence},\n\tpages        = {1160--1167}\n}\n@inproceedings{bach17structure,\n\ttitle        = {Learning the Structure of Generative Models without Labeled Data},\n\tauthor       = {Bach, Stephen H. and He, Bryan and Ratner, Alexander and R{\\'e}, Christopher},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{bach2010self,\n\ttitle        = {Self-concordant analysis for logistic regression},\n\tauthor       = {Francis Bach and others},\n\tyear         = 2010,\n\tjournal      = {Electronic Journal of Statistics},\n\tvolume       = 4,\n\tpages        = {384--414}\n}\n@inproceedings{bach2010structured,\n\ttitle        = {Structured sparsity-inducing norms through submodular functions},\n\tauthor       = {Francis R. 
Bach},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {118--126}\n}\n@article{bachman2019learning,\n\ttitle        = {Learning representations by maximizing mutual information across views},\n\tauthor       = {Bachman, Philip and Hjelm, R Devon and Buchwalter, William},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.00910}\n}\n@inproceedings{Bachpaper,\n\ttitle        = {A stochastic gradient method with an exponential convergence rate for finite training sets},\n\tauthor       = {Roux, Nicolas L and Schmidt, Mark and Bach, Francis R},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2663--2671}\n}\n@incollection{Backprop,\n\ttitle        = {Neurocomputing: foundations of research},\n\tauthor       = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},\n\tyear         = 1988,\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA, USA},\n\tpages        = {696--699},\n\tisbn         = {0-262-01097-6},\n\turl          = {http://dl.acm.org/citation.cfm?id=65669.104451},\n\teditor       = {Anderson, James A. and Rosenfeld, Edward},\n\tchapter      = {Learning representations by back-propagating errors},\n\tacmid        = 104451,\n\tnumpages     = 4\n}\n@inproceedings{bacon2017option,\n\ttitle        = {The Option-Critic Architecture},\n\tauthor       = {P. Bacon and J. Harb and D. 
Precup},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1726--1734}\n}\n@book{bacsar2008optimal,\n\ttitle        = {{H}-infinity optimal control and related minimax design problems: a dynamic game approach},\n\tauthor       = {Tamer Ba{\\c{s}}ar and Pierre Bernhard},\n\tyear         = 2008,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{bader2008discussion,\n\ttitle        = {Discussion tracking in Enron email using {PARAFAC}},\n\tauthor       = {Bader, Brett W and Berry, Michael W and Browne, Murray},\n\tyear         = 2008,\n\tjournal      = {Survey of Text Mining II},\n\tvolume       = 1,\n\tpages        = {147--163}\n}\n@inproceedings{bader2019getafix,\n\ttitle        = {Getafix: Learning to fix bugs automatically},\n\tauthor       = {Johannes Bader and Andrew Scott and Michael Pradel and Satish Chandra},\n\tyear         = 2019,\n\tbooktitle    = {Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA)}\n}\n@article{badgeley2019deep,\n\ttitle        = {Deep learning predicts hip fracture using confounding patient and healthcare variables},\n\tauthor       = {Marcus A Badgeley and John R Zech and Luke Oakden-Rayner and Benjamin S Glicksberg and Manway Liu and William Gale and Michael V McConnell and Bethany Percha and Thomas M Snyder and Joel T Dudley},\n\tyear         = 2019,\n\tjournal      = {npj Digital Medicine},\n\tvolume       = 2\n}\n@article{badia2020never,\n\ttitle        = {Never Give Up: Learning Directed Exploration Strategies},\n\tauthor       = {Adri{\\`a} Puigdom{\\`e}nech Badia and Pablo Sprechmann and Alex Vitvitskyi and Daniel Guo and Bilal Piot and Steven Kapturowski and Olivier Tieleman and Mart{\\'\\i}n Arjovsky and Alexander Pritzel and Andrew Bolt and others},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.06038}\n}\n@inproceedings{Badoiu2002,\n\ttitle        = {{Approximate clustering via 
core-sets}},\n\tauthor       = {{B{\\u{a}}doiu}, Mihai and {Har-Peled}, Sariel and Indyk, Piotr},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the thiry-fourth annual ACM symposium on Theory of computing - STOC '02},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 250,\n\tdoi          = {10.1145/509907.509947},\n\tisbn         = 1581134959,\n\tmendeley-groups = {Algorithms/Computational Geometry}\n}\n@article{baes2009estimate,\n\ttitle        = {Estimate sequence methods: extensions and approximations},\n\tauthor       = {Baes, Michel},\n\tyear         = 2009,\n\tjournal      = {Institute for Operations Research, ETH, Z{\\\"u}rich, Switzerland}\n}\n@inproceedings{bagnell2004policy,\n\ttitle        = {Policy search by dynamic programming},\n\tauthor       = {Bagnell, J Andrew and Kakade, Sham M and Schneider, Jeff G and Ng, Andrew Y},\n\tyear         = 2004,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {831--838}\n}\n@inproceedings{bagnell2005robust,\n\ttitle        = {Robust supervised learning},\n\tauthor       = {J Andrew Bagnell},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 20th national conference on Artificial intelligence-Volume 2},\n\tpages        = {714--719}\n}\n@article{bagnoli2005logconcave,\n\ttitle        = {Log-concave probability and its applications},\n\tauthor       = {Mark Bagnoli and Ted Bergstrom},\n\tyear         = 2005,\n\tjournal      = {Economic Theory},\n\tvolume       = 26,\n\tpages        = {445--469}\n}\n@inproceedings{bahdanau2015neural,\n\ttitle        = {Neural machine translation by jointly learning to align and translate},\n\tauthor       = {Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{bahdanau2017actor,\n\ttitle        = {An actor-critic algorithm for sequence prediction},\n\tauthor 
      = {Dzmitry Bahdanau and Philemon Brakel and Kelvin Xu and Anirudh Goyal and Ryan Lowe and Joelle Pineau and Aaron Courville and Yoshua Bengio},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{bahdanau2019reward,\n\ttitle        = {Learning to Understand Goal Specifications by Modelling Reward},\n\tauthor       = {Dzmitry Bahdanau and Felix Hill and Jan Leike and Edward Hughes and S. A. Hosseini and Pushmeet Kohli and Edward Grefenstette},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{bai2019beyond,\n\ttitle        = {Beyond Linearization: On Quadratic and Higher-Order Approximation of Wide Neural Networks},\n\tauthor       = {Bai, Yu and Lee, Jason D},\n\tyear         = 2020,\n\tjournal      = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{bai2019provably,\n\ttitle        = {Provably efficient q-learning with low switching cost},\n\tauthor       = {Bai, Yu and Xie, Tengyang and Jiang, Nan and Wang, Yu-Xiang},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {8004--8013}\n}\n@article{bai2020provable,\n\ttitle        = {Provable Self-Play Algorithms for Competitive Reinforcement Learning},\n\tauthor       = {Bai, Yu and Jin, Chi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.04017}\n}\n@article{Bailly11,\n\ttitle        = {Quadratic weighted automata: Spectral algorithm and likelihood maximization},\n\tauthor       = {R. Bailly},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research}\n}\n@inproceedings{bailly2010spectral,\n\ttitle        = {A spectral approach for probabilistic grammatical inference on trees},\n\tauthor       = {R. Bailly and A. Habrard and F. 
Denis},\n\tyear         = 2010,\n\tbooktitle    = {Algorithmic Learning Theory},\n\tpages        = {74--88}\n}\n@article{bair2006prediction,\n\ttitle        = {Prediction by supervised principal components},\n\tauthor       = {Eric Bair and Trevor Hastie and Debashis Paul and Robert Tibshirani},\n\tyear         = 2006,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 101,\n\tnumber       = 473,\n\tpages        = {119--137}\n}\n@inproceedings{bajcsy2017learning,\n\ttitle        = {Learning Robot Objectives from Physical Human Interaction},\n\tauthor       = {Andrea Bajcsy and Dylan P. Losey and M. O'Malley and A. Dragan},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@article{bakker03task,\n\ttitle        = {Task clustering and gating for {B}ayesian multitask learning},\n\tauthor       = {B. Bakker and T. Heskes},\n\tyear         = 2003,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 4,\n\tpages        = {83--99}\n}\n@incollection{bakry1985diffusions,\n\ttitle        = {Diffusions hypercontractives},\n\tauthor       = {Dominique Bakry and Michel {\\'E}mery},\n\tyear         = 1985,\n\tbooktitle    = {S{\\'e}minaire de Probabilit{\\'e}s XIX 1983/84},\n\tpages        = {177--206}\n}\n@inproceedings{balaji2018metareg,\n\ttitle        = {Metareg: Towards domain generalization using meta-regularization},\n\tauthor       = {Yogesh Balaji and Swami Sankaranarayanan and Rama Chellappa},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {998--1008}\n}\n@article{balakrishnan2016statistical,\n\ttitle        = {Statistical guarantees for the EM algorithm: From population to sample-based analysis},\n\tauthor       = {Balakrishnan, Sivaraman and Wainwright, Martin J and Yu, Bin},\n\tyear         = 2016,\n\tjournal      = {Annals of Stat},\n\tpublisher    = {Institute of Mathematical 
Statistics},\n\tvolume       = 45,\n\tnumber       = 1,\n\tpages        = {77--120}\n}\n@inproceedings{balakrishnan2017computationally,\n\ttitle        = {Computationally Efficient Robust Sparse Estimation in High Dimensions},\n\tauthor       = {Balakrishnan, Sivaraman and Du, Simon S. and Li, Jerry and Singh, Aarti},\n\tyear         = 2017,\n\tmonth        = {07--10 Jul},\n\tbooktitle    = {Proceedings of the 2017 Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 65,\n\tpages        = {169--212},\n\turl          = {http://proceedings.mlr.press/v65/balakrishnan17a.html},\n\teditor       = {Kale, Satyen and Shamir, Ohad},\n\tpdf          = {http://proceedings.mlr.press/v65/balakrishnan17a/balakrishnan17a.pdf},\n\tabstract     = {Many conventional statistical procedures are extremely sensitive to seemingly minor deviations from modeling assumptions. This problem is exacerbated in modern high-dimensional settings, where the problem dimension can grow with and possibly exceed the sample size. We consider the problem of robust estimation of sparse functionals, and provide a computationally and statistically efficient algorithm in the high-dimensional setting. Our theory identifies a unified set of deterministic conditions under which our algorithm guarantees accurate recovery. By further establishing that these deterministic conditions hold with high-probability for a wide range of statistical models, our theory applies to many problems of considerable interest including sparse mean and covariance estimation; sparse linear regression; and sparse generalized linear models. 
In certain settings, such as the detection and estimation of sparse principal components in the spiked covariance model, our general theory does not yield optimal sample complexity, and we provide a novel algorithm based on the same intuition which is able to take advantage of further structure of the problem to achieve nearly optimal rates.}\n}\n@article{balakrishnan2017hypothesis,\n\ttitle        = {Hypothesis Testing for High-Dimensional Multinomials: A Selective Review},\n\tauthor       = {Sivaraman Balakrishnan and Larry Wasserman},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.06120}\n}\n@inproceedings{balakrishnan2017sparse,\n\ttitle        = {Computationally Efficient Robust Sparse Estimation in High Dimensions},\n\tauthor       = {Sivaraman Balakrishnan and Simon S. Du and Jerry Li and Aarti Singh},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {169--212}\n}\n@article{balamurugan2016stochastic,\n\ttitle        = {Stochastic Variance Reduction Methods for Saddle-Point Problems},\n\tauthor       = {Balamurugan, P and Bach, Francis},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.06398}\n}\n@article{balasubramanian2011unsupervised,\n\ttitle        = {Unsupervised supervised learning {II}: Margin-based classification without labels},\n\tauthor       = {Krishnakumar Balasubramanian and Pinar Donmez and Guy Lebanon},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 12,\n\tpages        = {3119--3145}\n}\n@article{balcan2005co,\n\ttitle        = {Co-training and expansion: Towards bridging theory and practice},\n\tauthor       = {Balcan, Maria-Florina and Blum, Avrim and Yang, Ke},\n\tyear         = 2005,\n\tjournal      = {Advances in neural information processing systems},\n\tpublisher    = {MIT Press},\n\tvolume       = 17,\n\tpages        = {89--96}\n}\n@inproceedings{balcan2007margin,\n\ttitle        = 
{Margin based active learning},\n\tauthor       = {Maria-Florina Balcan and Andrei Broder and Tong Zhang},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Computational Learning Theory}\n}\n@inproceedings{balcan2008discriminative,\n\ttitle        = {A discriminative framework for clustering via similarity functions},\n\tauthor       = {Maria-Florina Balcan and Avrim Blum and Santosh Vempala},\n\tyear         = 2008,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {671--680}\n}\n@inproceedings{balcan2009agnostic,\n\ttitle        = {Agnostic clustering},\n\tauthor       = {Maria Florina Balcan and Heiko R{\\\"o}glin and Shang-Hua Teng},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {384--398}\n}\n@article{balcan2010discriminative,\n\ttitle        = {A discriminative model for semi-supervised learning},\n\tauthor       = {Maria-Florina Balcan and Avrim Blum},\n\tyear         = 2010,\n\tjournal      = {Journal of the ACM (JACM)},\n\tvolume       = 57,\n\tnumber       = 3\n}\n@inproceedings{balcan2013active,\n\ttitle        = {Active and passive learning of linear separators under log-concave distributions},\n\tauthor       = {Maria-Florina Balcan and Phil Long},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{balcan2016improved,\n\ttitle        = {An Improved Gap-Dependency Analysis of the Noisy Power Method},\n\tauthor       = {Maria-Florina Balcan and Simon Shaolei Du and Yining Wang and Adams Wei Yu},\n\tyear         = 2016,\n\tmonth        = {23--26 Jun},\n\tbooktitle    = {29th Annual Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\taddress      = {Columbia University, New York, New York, USA},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 49,\n\tpages        = {284--309},\n\turl          = 
{http://proceedings.mlr.press/v49/balcan16a.html},\n\teditor       = {Vitaly Feldman and Alexander Rakhlin and Ohad Shamir},\n\tpdf          = {http://proceedings.mlr.press/v49/balcan16a.pdf},\n\tabstract     = {We consider the \\emphnoisy power method algorithm, which has wide applications in machine learning and statistics, especially those related to principal component analysis (PCA) under resource (communication, memory or privacy) constraints. Existing analysis of the noisy power method shows an unsatisfactory dependency over the “consecutive\" spectral gap (\\sigma_k-\\sigma_k+1) of an input data matrix, which could be very small and hence limits the algorithm’s applicability. In this paper, we present a new analysis of the noisy power method that achieves improved gap dependency for both sample complexity and noise tolerance bounds. More specifically, we improve the dependency over (\\sigma_k-\\sigma_k+1) to dependency over (\\sigma_k-\\sigma_q+1), where q is an intermediate algorithm parameter and could be much larger than the target rank k. Our proofs are built upon a novel characterization of proximity between two subspaces that differ from canonical angle characterizations analyzed in previous works. 
Finally, we apply our improved bounds to distributed private PCA and memory-efficient streaming PCA and obtain bounds that are superior to existing results in the literature.}\n}\n@article{baldi1989neural,\n\ttitle        = {Neural networks and principal component analysis: Learning from examples without local minima},\n\tauthor       = {Baldi, Pierre and Hornik, Kurt},\n\tyear         = 1989,\n\tmonth        = jan,\n\tjournal      = {Neural networks},\n\tpublisher    = {Elsevier},\n\taddress      = {Oxford, UK, UK},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {53--58},\n\tdoi          = {10.1016/0893-6080(89)90014-2},\n\tissn         = {0893-6080},\n\turl          = {http://dx.doi.org/10.1016/0893-6080(89)90014-2},\n\tissue_date   = 1989,\n\tnumpages     = 6,\n\tacmid        = 70362\n}\n@inproceedings{baldi2013understanding,\n\ttitle        = {Understanding dropout},\n\tauthor       = {Pierre Baldi and Peter J Sadowski},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2814--2822}\n}\n@article{baldi2014dropout,\n\ttitle        = {The dropout learning algorithm},\n\tauthor       = {Pierre Baldi and Peter Sadowski},\n\tyear         = 2014,\n\tjournal      = {Artificial intelligence},\n\tvolume       = 210,\n\tpages        = {78--122}\n}\n@inproceedings{baldridge02ccg,\n\ttitle        = {Coupling {CCG} with Hybrid Logic Dependency Semantics},\n\tauthor       = {Jason Baldridge and Geert-Jan M. Kruijff},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {319--326}\n}\n@article{balle2014spectral,\n\ttitle        = {Spectral learning of weighted automata - a forward-backward perspective},\n\tauthor       = {Borja Balle and Xavier Carreras and Franco M. 
Luque and Ariadna Quattoni},\n\tyear         = 2014,\n\tjournal      = {Machine Learning},\n\tvolume       = 96,\n\tnumber       = 1,\n\tpages        = {33--63}\n}\n@article{balog2016deepcoder,\n\ttitle        = {Deepcoder: Learning to write programs},\n\tauthor       = {Matej Balog and Alexander L Gaunt and Marc Brockschmidt and Sebastian Nowozin and Daniel Tarlow},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.01989}\n}\n@inproceedings{Balsubramani2013-incrementalPCA,\n\ttitle        = {The fast convergence of incremental pca},\n\tauthor       = {Balsubramani, Akshay and Dasgupta, Sanjoy and Freund, Yoav},\n\tyear         = 2013,\n\tbooktitle    = {NIPS},\n\tpages        = {3174--3182}\n}\n@inproceedings{balsubramani2015scalable,\n\ttitle        = {Scalable semi-supervised aggregation of classifiers},\n\tauthor       = {Akshay Balsubramani and Yoav Freund},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1351--1359}\n}\n@article{balsubramani2016learning,\n\ttitle        = {Learning to Abstain from Binary Prediction},\n\tauthor       = {Akshay Balsubramani},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.08151}\n}\n@article{baltruvsaitis2017multimodal,\n\ttitle        = {Multimodal Machine Learning: A Survey and Taxonomy},\n\tauthor       = {Tadas Baltru{\\v{s}}aitis and Chaitanya Ahuja and Louis-Philippe Morency},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.09406}\n}\n@inproceedings{balzano2010column,\n\ttitle        = {Column subset selection with missing data},\n\tauthor       = {Balzano, Laura and Nowak, Robert and Bajwa, Waheed},\n\tyear         = 2010,\n\tbooktitle    = {NIPS Workshop on Low-Rank Methods for Large-Scale Machine Learning},\n\tvolume       = 1,\n\torganization = {Citeseer}\n}\n@inproceedings{banarescu2013amr,\n\ttitle        = {Abstract Meaning Representation for Sembanking},\n\tauthor       = {Laura 
Banarescu and Claire Bonial and Shu Cai and Madalina Georgescu and Kira Griffitt and Ulf Hermjakob and Kevin Knight and Philipp Koehn and Martha Palmer and Nathan Schneider},\n\tyear         = 2013,\n\tbooktitle    = {7th Linguistic Annotation Workshop and Interoperability with Discourse}\n}\n@inproceedings{bandeira2014multireference,\n\ttitle        = {Multireference alignment using semidefinite programming},\n\tauthor       = {Bandeira, Afonso S and Charikar, Moses and Singer, Amit and Zhu, Andy},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 5th conference on Innovations in theoretical computer science},\n\tpages        = {459--470},\n\torganization = {ACM}\n}\n@article{bandeira2014sharp,\n\ttitle        = {Sharp nonasymptotic bounds on the norm of random matrices with independent entries},\n\tauthor       = {Afonso S. Bandeira and Ramon van Handel},\n\tyear         = 2014,\n\tjournal      = {arXiv}\n}\n@inproceedings{bandeira2016low,\n\ttitle        = {On the low-rank approach for semidefinite programs arising in synchronization and community detection},\n\tauthor       = {Bandeira, Afonso S and Boumal, Nicolas and Voroninski, Vladislav},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.04426},\n\tbooktitle    = {Conference on learning theory},\n\tpages        = {361--382},\n\torganization = {PMLR}\n}\n@article{bandi2018detection,\n\ttitle        = {From detection of individual metastases to classification of lymph node status at the patient level: the {CAMELYON17} challenge},\n\tauthor       = {Peter Bandi and Oscar Geessink and Quirine Manson and Marcory Van Dijk and Maschenka Balkenhol and Meyke Hermsen and Babak Ehteshami Bejnordi and Byungjae Lee and Kyunghyun Paeng and Aoxiao Zhong and others},\n\tyear         = 2018,\n\tjournal      = {IEEE Transactions on Medical Imaging},\n\tvolume       = 38,\n\tnumber       = 2,\n\tpages        = {550--560}\n}\n@article{banerjee2005clustering,\n\ttitle        = {Clustering with Bregman 
divergences},\n\tauthor       = {Banerjee, Arindam and Merugu, Srujana and Dhillon, Inderjit S. and Ghosh, Joydeep},\n\tyear         = 2005,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 6,\n\tpages        = {1705--1749}\n}\n@inproceedings{banerjee2005meteor,\n\ttitle        = {METEOR: An automatic metric for mt evaluation with improved correlation with human judgments},\n\tauthor       = {Satanjeev Banerjee and Alon Lavie},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{banko2007open,\n\ttitle        = {Open Information Extraction from the Web},\n\tauthor       = {Michele Banko and Michael J Cafarella and Stephen Soderland and Matthew Broadhead and Oren Etzioni},\n\tyear         = 2007,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {2670--2676}\n}\n@article{banks2016information,\n\ttitle        = {Information-theoretic thresholds for community detection in sparse networks},\n\tauthor       = {Jess Banks and Christopher Moore},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{bannard2005paraphrasing,\n\ttitle        = {Paraphrasing with bilingual parallel corpora},\n\tauthor       = {Colin Bannard and Chris Callison-Burch},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {597--604}\n}\n@inproceedings{bansal2006automatic,\n\ttitle        = {Automatic Generation of Peephole Superoptimizers},\n\tauthor       = {Sorav Bansal and Alex Aiken},\n\tyear         = 2006,\n\tbooktitle    = {Architectural Support for Programming Languages and Operating Systems (ASPLOS)}\n}\n@inproceedings{Bansal2011,\n\ttitle        = {{Min-max Graph Partitioning and Small Set Expansion}},\n\tauthor       = {Bansal, Nikhil and Feige, Uriel and Krauthgamer, Robert and Makarychev, Konstantin and Nagarajan, Viswanath and Naor, 
Joseph (Seffi) and Schwartz, Roy},\n\tyear         = 2011,\n\tmonth        = oct,\n\tjournal      = {SIAM Journal on Computing},\n\tbooktitle    = {2011 IEEE 52nd Annual Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE},\n\tvolume       = 43,\n\tnumber       = 2,\n\tpages        = {17--26},\n\tdoi          = {10.1109/FOCS.2011.79},\n\tisbn         = {978-0-7695-4571-4},\n\tabstract     = {We study graph partitioning problems from a min-max perspective, in which an input graph on n vertices should be partitioned into k parts, and the objective is to minimize the maximum number of edges leaving a single part. The two main versions we consider are where the k parts need to be of equal-size, and where they must separate a set of k given terminals. We consider a common generalization of these two problems, and design for it an \\$O(\\backslash sqrt\\{\\backslash log n\\backslash log k\\})\\$-approximation algorithm. This improves over an \\$O(\\backslash log\\^{}2 n)\\$ approximation for the second version, and roughly \\$O(k\\backslash log n)\\$ approximation for the first version that follows from other previous work. We also give an improved O(1)-approximation algorithm for graphs that exclude any fixed minor. Our algorithm uses a new procedure for solving the Small-Set Expansion problem. In this problem, we are given a graph G and the goal is to find a non-empty set \\$S\\backslash subseteq V\\$ of size \\$|S| \\backslash leq \\backslash rho n\\$ with minimum edge-expansion. We give an \\$O(\\backslash sqrt\\{\\backslash log\\{n\\}\\backslash log\\{(1/\\backslash rho)\\}\\})\\$ bicriteria approximation algorithm for the general case of Small-Set Expansion, and O(1) approximation algorithm for graphs that exclude any fixed minor.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1110.4319},\n\teprint       = {1110.4319},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Bansal et al. 
- 2011 - Min-max Graph Partitioning and Small Set Expansion.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsest Cut,Algorithms/Small Set Expansion,Algorithms/Sparsest Cut/SSE}\n}\n@inproceedings{bansal2014provable,\n\ttitle        = {A provable {SVD}-based algorithm for learning topics in dominant admixture corpus},\n\tauthor       = {Trapit Bansal and Chiranjib Bhattacharyya and Ravindran Kannan},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{bansal2017hamilton,\n\ttitle        = {Hamilton-Jacobi reachability: A brief overview and recent advances},\n\tauthor       = {Bansal, Somil and Chen, Mo and Herbert, Sylvia and Tomlin, Claire J},\n\tyear         = 2017,\n\tbooktitle    = {2017 IEEE 56th Annual Conference on Decision and Control (CDC)},\n\tpages        = {2242--2253},\n\torganization = {IEEE}\n}\n@article{bansal2020self,\n\ttitle        = {For self-supervised learning, Rationality implies generalization, provably},\n\tauthor       = {Bansal, Yamini and Kaplun, Gal and Barak, Boaz},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.08508}\n}\n@article{bao2014approximation,\n\ttitle        = {Approximation analysis of convolutional neural networks},\n\tauthor       = {Bao, Chenglong and Li, Qianxiao and Shen, Zuowei and Tai, Cheng and Wu, Lei and Xiang, Xueshuang},\n\tyear         = 2014,\n\tjournal      = {work},\n\tvolume       = 65\n}\n@inproceedings{bao2014qa,\n\ttitle        = {Knowledge-based Question Answering as Machine Translation},\n\tauthor       = {J. 
Bao and Nan Duan and Ming Zhou and Tiejun Zhao},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{bao2016constraint,\n\ttitle        = {Constraint-Based Question Answering with Knowledge Graph},\n\tauthor       = {Junwei Bao and Nan Duan and Zhao Yan and Ming Zhou and Tiejun Zhao},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@book{bar1964language,\n\ttitle        = {Language and Information: Selected Essays on Their Theory and Application},\n\tauthor       = {Y Bar-Hillel},\n\tyear         = 1964,\n\tpublisher    = {Addison-Wesley/The Jerusalem Academic Press}\n}\n@inproceedings{barak2012hypercontractivity,\n\ttitle        = {Hypercontractivity, sum-of-squares proofs, and their applications},\n\tauthor       = {Boaz Barak and Fernando Brand{\\~a}o and Aram Harrow and Jonathan Kelner and David Steurer and Yuan Zhou},\n\tyear         = 2012,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {307--326}\n}\n@article{barak2014,\n\ttitle        = {Dictionary Learning and Tensor Decomposition via the Sum-of-Squares Method},\n\tauthor       = {Barak, Boaz and Kelner, Jonathan and Steurer, David},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1407.1543}\n}\n@article{barak2014sum,\n\ttitle        = {Sum-of-squares proofs and the quest toward optimal algorithms},\n\tauthor       = {Barak, Boaz and Steurer, David},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1404.5236}\n}\n@inproceedings{barak2016nearly,\n\ttitle        = {A nearly tight sum-of-squares lower bound for the planted clique problem},\n\tauthor       = {Boaz Barak and Samuel B. 
Hopkins and Jonathan Kelner and Pravesh Kothari and Ankur Moitra and Aaron Potechin},\n\tyear         = 2016,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {428--437}\n}\n@misc{barak2016tutorial,\n\ttitle        = {Proofs, beliefs, and algorithms through the lens of sum-of-squares},\n\tauthor       = {Boaz Barak and David Steurer},\n\tyear         = 2016,\n\thowpublished = {\\url{https://www.sumofsquares.org/public/index.html}}\n}\n@inproceedings{baram2017end,\n\ttitle        = {End-to-end differentiable adversarial imitation learning},\n\tauthor       = {N. Baram and O. Anschel and I. Caspi and S. Mannor},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {390--399}\n}\n@article{baraniuk2008simple,\n\ttitle        = {A simple proof of the restricted isometry property for random matrices},\n\tauthor       = {Baraniuk, Richard and Davenport, Mark and DeVore, Ronald and Wakin, Michael},\n\tyear         = 2008,\n\tjournal      = {Constructive Approximation},\n\tpublisher    = {Springer},\n\tvolume       = 28,\n\tnumber       = 3,\n\tpages        = {253--263}\n}\n@article{barany2012notes,\n\ttitle        = {Notes about the {C}arath{\\'e}odory number},\n\tauthor       = {Imre B{\\'a}r{\\'a}ny and Roman Karasev},\n\tyear         = 2012,\n\tjournal      = {Discrete \\& Computational Geometry},\n\tvolume       = 48,\n\tnumber       = 3,\n\tpages        = {783--792}\n}\n@inproceedings{barber2003algorithm,\n\ttitle        = {The {IM} algorithm: a variational approach to information maximization},\n\tauthor       = {David Barber and Felix V Agakov},\n\tyear         = 2003,\n\tbooktitle    = {Advances in neural information processing systems}\n}\n@article{barber2019conformal,\n\ttitle        = {Conformal prediction under covariate shift},\n\tauthor       = {Barber, Rina Foygel and Candes, Emmanuel J and Ramdas, Aaditya and Tibshirani, Ryan J},\n\tyear         = 2019,\n\tjournal     
 = {arXiv preprint arXiv:1904.06019}\n}\n@article{barber2019limits,\n\ttitle        = {The limits of distribution-free conditional predictive inference},\n\tauthor       = {Barber, Rina Foygel and Candes, Emmanuel J and Ramdas, Aaditya and Tibshirani, Ryan J},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.04684}\n}\n@article{barber2019predictive,\n\ttitle        = {Predictive inference with the jackknife+},\n\tauthor       = {Barber, Rina Foygel and Candes, Emmanuel J and Ramdas, Aaditya and Tibshirani, Ryan J},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.02928}\n}\n@article{barbu05swendsen,\n\ttitle        = {Generalizing {S}wendsen-{W}ang to sampling arbitrary posterior probabilities},\n\tauthor       = {A. Barbu and S. C. Zhu},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 27,\n\tpages        = {1239--1253}\n}\n@article{barbu2009training,\n\ttitle        = {Training an active random field for real-time image denoising},\n\tauthor       = {Adrian Barbu},\n\tyear         = 2009,\n\tjournal      = {IEEE Transactions on Image Processing},\n\tvolume       = 18,\n\tnumber       = 11,\n\tpages        = {2451--2462}\n}\n@inproceedings{barbu2019objectnet,\n\ttitle        = {Objectnet: A large-scale bias-controlled dataset for pushing the limits of object recognition models},\n\tauthor       = {Andrei Barbu and David Mayo and Julian Alverio and William Luo and Christopher Wang and Dan Gutfreund and Josh Tenenbaum and Boris Katz},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {9453--9463}\n}\n@inproceedings{barcelo11grpah,\n\ttitle        = {Querying Graph Patterns},\n\tauthor       = {Pablo Barcelo and Leonid Libkin and Juan Reutter},\n\tyear         = 2011,\n\tbooktitle    = {Symposium on Principles of Database Systems}\n}\n@article{bard1991some,\n\ttitle        
= {Some properties of the bilevel programming problem},\n\tauthor       = {Jonathan F Bard},\n\tyear         = 1991,\n\tjournal      = {Journal of optimization theory and applications},\n\tvolume       = 68,\n\tnumber       = 2,\n\tpages        = {371--378}\n}\n@book{bard1999,\n\ttitle        = {Practical Bilevel Optimization: Algorithms and Applications},\n\tauthor       = {Jonathan F. Bard},\n\tyear         = 1999,\n\tpublisher    = {Springer}\n}\n@article{bardes2021vicreg,\n\ttitle        = {VICReg: Variance-Invariance-Covariance Regularization for Self-Supervised Learning},\n\tauthor       = {Bardes, Adrien and Ponce, Jean and LeCun, Yann},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2105.04906}\n}\n@article{bardet2018functional,\n\ttitle        = {Functional inequalities for {G}aussian convolutions of compactly supported measures: explicit bounds and dimension dependence},\n\tauthor       = {Jean-Baptiste Bardet and Natha{\\\"e}l Gozlan and Florent Malrieu and Pierre-Andr{\\'e} Zitt},\n\tyear         = 2018,\n\tjournal      = {Bernoulli},\n\tvolume       = 24,\n\tpages        = {333--353}\n}\n@inproceedings{barhaim2008,\n\ttitle        = {Efficient Semantic Deduction and Approximate Matching over Compact Parse Forests},\n\tauthor       = {Roy Bar-Haim and Jonathan Berant and Ido Dagan and Iddo Greental and Shachar Mirkin and Eyal Shnarch and Idan Szpektor},\n\tyear         = 2008,\n\tbooktitle    = {Text Analysis Conference}\n}\n@inproceedings{barhaim2009forest,\n\ttitle        = {A Compact Forest for Scalable Inference over Entailment and Paraphrase Rules},\n\tauthor       = {Roy Bar-Haim and Jonathan Berant and Ido Dagan},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{barker02continuations,\n\ttitle        = {Continuations and the nature of quantification},\n\tauthor       = {Chris Barker},\n\tyear         = 2002,\n\tjournal      = {Natural Language 
Semantics},\n\tvolume       = 10,\n\tpages        = {211--242}\n}\n@article{barocas2016,\n\ttitle        = {Big Data's Disparate Impact},\n\tauthor       = {Solon Barocas and Andrew D. Selbst},\n\tyear         = 2016,\n\tjournal      = {104 California Law Review},\n\tvolume       = 3,\n\tpages        = {671--732}\n}\n@inproceedings{baroni2010nouns,\n\ttitle        = {Nouns are vectors, adjectives are matrices: Representing adjective-noun constructions in semantic space},\n\tauthor       = {Marco Baroni and Roberto Zamparelli},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1183--1193}\n}\n@article{barreno2010security,\n\ttitle        = {The security of machine learning},\n\tauthor       = {Marco Barreno and Blaine Nelson and Anthony D. Joseph and J. D. Tygar},\n\tyear         = 2010,\n\tjournal      = {Machine Learning},\n\tvolume       = 81,\n\tnumber       = 2,\n\tpages        = {121--148}\n}\n@article{barreto2011computing,\n\ttitle        = {Computing the stationary distribution of a finite Markov chain through stochastic factorization},\n\tauthor       = {Barreto, Andr{\\'e} MS and Fragoso, Marcelo D},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM}\n}\n@inproceedings{barreto2011reinforcement,\n\ttitle        = {Reinforcement learning using kernel-based stochastic factorization},\n\tauthor       = {Barreto, Andre and Precup, Doina and Pineau, Joelle},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{barreto2014policy,\n\ttitle        = {Policy iteration based on stochastic factorization},\n\tauthor       = {Barreto, Andr\\'e M. S. and Pineau, Joelle and Precup, Doina},\n\tyear         = 2014,\n\tjournal      = {J. 
Artificial Intelligence Res.},\n\tvolume       = 50,\n\tpages        = {763--803},\n\tissn         = {1076-9757},\n\tfjournal     = {Journal of Artificial Intelligence Research},\n\tmrclass      = {90C40 (68T20 90C39)},\n\tmrnumber     = 3254852,\n\tmrreviewer   = {Masayuki Horiguchi}\n}\n@inproceedings{barrio2016comprehension,\n\ttitle        = {Improving the Comprehension of Numbers in the News},\n\tauthor       = {Pablo J. Barrio and Daniel G. Goldstein and Jake M. Hofman},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@article{barron1993universal,\n\ttitle        = {Universal approximation bounds for superpositions of a sigmoidal function},\n\tauthor       = {Barron, Andrew R},\n\tyear         = 1993,\n\tjournal      = {IEEE Transactions on Information theory},\n\tpublisher    = {IEEE},\n\tvolume       = 39,\n\tnumber       = 3,\n\tpages        = {930--945}\n}\n@book{barroso2009datacenter,\n\ttitle        = {\n\t\tThe Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale\n\n\t\tMachines\n\t},\n\tauthor       = {Barroso, Luiz A. and H\\\"{o}lzle, Urs},\n\tyear         = 2009,\n\tpublisher    = {Morgan and Claypool Publishers},\n\tisbn         = {159829556X, 9781598295566},\n\tedition      = {1st},\n\tabstract     = {\n\t\tAs computation continues to move into the cloud, the computing platform\n\n\t\tof interest no longer re- sembles a pizza box or a refrigerator,\n\n\t\tbut a warehouse full of computers. These new large datacenters are\n\n\t\tquite different from traditional hosting facilities of earlier times\n\n\t\tand cannot be viewed simply as a collection of co-located servers.\n\n\t\tLarge portions of the hardware and software resources in these facilities\n\n\t\tmust work in concert to efficiently deliver good levels of Internet\n\n\t\tservice performance, something that can only be achieved by a holistic\n\n\t\tapproach to their design and deployment. 
In other words, we must\n\n\t\ttreat the datacenter itself as one massive warehouse-scale computer\n\n\t\t(WSC). We describe the architecture of WSCs, the main factors influencing\n\n\t\ttheir design, operation, and cost structure, and the characteristics\n\n\t\tof their software base. We hope it will be useful to architects and\n\n\t\tprogrammers of today's WSCs, as well as those of future many-core\n\n\t\tplatforms which may one day implement the equivalent of today's WSCs\n\n\t\ton a single board.\n\t},\n\tcomment      = {\n\t\tPretty extensive description of the reasons behind scaling out vs.\n\n\t\tscaling up with commodity hardware and the resulting implications.\n\t},\n\tkeywords     = {datacenter, google},\n\tmyurl        = {http://www.morganclaypool.com/doi/abs/10.2200/S00193ED1V01Y200905CAC006}\n}\n@inproceedings{barry2013manipulation,\n\ttitle        = {Manipulation with multiple action types},\n\tauthor       = {J. Barry and K. Hsiao and L. P. Kaelbling and T. Lozano-P{\\'e}rez},\n\tyear         = 2013,\n\tbooktitle    = {Experimental Robotics},\n\tpages        = {531--545}\n}\n@inproceedings{BartalByersRaz1997,\n\ttitle        = {{Global optimization using local information with applications to flow control}},\n\tauthor       = {Bartal, Yair and Byers, John W. and Raz, Danny},\n\tyear         = 1997,\n\tbooktitle    = {Proceedings 38th Annual Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE Comput. 
Soc},\n\tpages        = {303--312},\n\tdoi          = {10.1109/SFCS.1997.646119},\n\tisbn         = {0-8186-8197-7},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Bartal, Byers, Raz - 1997 - Global optimization using local information with applications to flow control.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@article{BartalByersRaz2004,\n\ttitle        = {{Fast, Distributed Approximation Algorithms for Positive Linear Programming with Applications to Flow Control}},\n\tauthor       = {Bartal, Yair and Byers, John W. and Raz, Danny},\n\tyear         = 2004,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 33,\n\tnumber       = 6,\n\tpages        = {1261--1279},\n\tdoi          = {10.1137/S0097539700379383},\n\tissn         = {0097-5397},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Bartal, Byers, Raz - 2004 - Fast, Distributed Approximation Algorithms for Positive Linear Programming with Applications to Flow Control.pdf:pdf},\n\tkeywords     = {1,10,1137,68w15,68w25,ams subject classifications,approximation algorithm,doi,environment must make decisions,flow control,introduction,linear programming,primal-dual,processors in a distributed,s0097539700379383},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@inproceedings{bartlett01rademacher,\n\ttitle        = {{R}ademacher and {G}aussian complexities: Risk bounds and structural results},\n\tauthor       = {P. L. Bartlett and S. Mendelson},\n\tyear         = 2001,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {224--240}\n}\n@article{bartlett05local,\n\ttitle        = {Local {R}ademacher complexities},\n\tauthor       = {Peter L. 
Bartlett and Olivier Bousquet and Shahar Mendelson},\n\tyear         = 2005,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 33,\n\tnumber       = 4,\n\tpages        = {1497--1537}\n}\n@inproceedings{bartlett1992learning,\n\ttitle        = {Learning with a slowly changing distribution},\n\tauthor       = {Peter L. Bartlett},\n\tyear         = 1992,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{bartlett1996learning,\n\ttitle        = {Learning Changing Concepts by Exploiting the Structure of Change},\n\tauthor       = {Peter L. Bartlett and Shai Ben-David and Sanjeev R. Kulkarni},\n\tyear         = 1996,\n\tjournal      = {Machine Learning},\n\tvolume       = 41\n}\n@article{bartlett2002rademacher,\n\ttitle        = {Rademacher and Gaussian complexities: Risk bounds and structural results},\n\tauthor       = {Bartlett, Peter L and Mendelson, Shahar},\n\tyear         = 2002,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 3,\n\tnumber       = {Nov},\n\tpages        = {463--482}\n}\n@article{bartlett2008classification,\n\ttitle        = {Classification with a reject option using a hinge loss},\n\tauthor       = {Peter L Bartlett and Marten H Wegkamp},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 9,\n\tnumber       = {0},\n\tpages        = {1823--1840}\n}\n@article{bartlett2008high,\n\ttitle        = {High-probability regret bounds for bandit online linear optimization},\n\tauthor       = {Bartlett, Peter L and Dani, Varsha and Hayes, Thomas and Kakade, Sham and Rakhlin, Alexander and Tewari, Ambuj},\n\tyear         = 2008,\n\tbooktitle    = {COLT 2008},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Bartlett et al. 
- 2008 - High-probability regret bounds for bandit online linear optimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Bandit}\n}\n@inproceedings{bartlett2009regal,\n\ttitle        = {REGAL: a regularization based algorithm for reinforcement learning in weakly communicating MDPs},\n\tauthor       = {Bartlett, Peter L and Tewari, Ambuj},\n\tyear         = 2009,\n\tjournal      = {arXiv preprint arXiv:1205.2661},\n\tbooktitle    = {Proceedings of the 25th Conference on Uncertainty in Artificial Intelligence (UAI 2009)}\n}\n@inproceedings{bartlett2017spectral,\n\ttitle        = {Spectrally-normalized margin bounds for neural networks},\n\tauthor       = {Peter Bartlett and Dylan J. Foster and Matus Telgarsky},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{bartlett2017spectrally,\n\ttitle        = {Spectrally-normalized margin bounds for neural networks},\n\tauthor       = {Bartlett, Peter and Foster, Dylan J and Telgarsky, Matus},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.08498}\n}\n@article{bartlett2019benign,\n\ttitle        = {Benign Overfitting in Linear Regression},\n\tauthor       = {Peter L. Bartlett and Philip M. Long and G{\\'a}bor Lugosi and Alexander Tsigler},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@article{bartlett2019nearly,\n\ttitle        = {Nearly-tight VC-dimension and pseudodimension bounds for piecewise linear neural networks},\n\tauthor       = {Bartlett, Peter L and Harvey, Nick and Liaw, Christopher and Mehrabian, Abbas},\n\tyear         = 2019,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {2285--2301}\n}\n@article{bartlett53approximate,\n\ttitle        = {Approximate confidence intervals. {II}. {M}ore than one unknown parameter},\n\tauthor       = {M. S. 
Bartlett},\n\tyear         = 1953,\n\tjournal      = {Biometrika},\n\tvolume       = 40,\n\tpages        = {306--317}\n}\n@book{barto1998reinforcement,\n\ttitle        = {Reinforcement learning: An introduction},\n\tauthor       = {Barto, Andrew G},\n\tyear         = 1998,\n\tpublisher    = {MIT press}\n}\n@article{barvinok95problems,\n\ttitle        = {Problems of Distance Geometry and Convex Properties of Quadratic Maps},\n\tauthor       = {A. I. Barvinok},\n\tyear         = 1995,\n\tjournal      = {Discrete \\& Computational Geometry},\n\tvolume       = 13,\n\tpages        = {189--202}\n}\n@inproceedings{barzilay04content,\n\ttitle        = {Catching the Drift: Probabilistic Content Models, with Applications to Generation and Summarization},\n\tauthor       = {Regina Barzilay and Lillian Lee},\n\tyear         = 2004,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@inproceedings{barzilay05content,\n\ttitle        = {Collective Content Selection for Concept-To-Text Generation},\n\tauthor       = {Regina Barzilay and Mirella Lapata},\n\tyear         = 2005,\n\tbooktitle    = {Human Language Technology and Empirical Methods in Natural Language Processing (HLT/EMNLP)},\n\tpages        = {331--338}\n}\n@inproceedings{barzilay06aggregation,\n\ttitle        = {Aggregation via Set Partitioning for Natural Language Generation},\n\tauthor       = {Regina Barzilay and Mirella Lapata},\n\tyear         = 2006,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{barzilay08coherence,\n\ttitle        = {Modeling Local Coherence: An Entity-based Approach},\n\tauthor       = {Regina Barzilay and Mirella Lapata},\n\tyear         = 2008,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 34,\n\tpages        = {1--34}\n}\n@inproceedings{barzilay2003learning,\n\ttitle        = {Learning to paraphrase: An unsupervised approach using multiple-sequence 
alignment},\n\tauthor       = {Regina Barzilay and Lillian Lee},\n\tyear         = 2003,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {16--23}\n}\n@inproceedings{bash2007cool,\n\ttitle        = {\n\t\tCool job allocation: measuring the power savings of placing jobs\n\n\t\tat cooling-efficient locations in the data center\n\t},\n\tauthor       = {Bash, Cullen and Forman, George},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\t2007 USENIX Annual Technical Conference on Proceedings of the USENIX\n\n\t\tAnnual Technical Conference\n\t},\n\tlocation     = {Santa Clara, CA},\n\tpublisher    = {USENIX Association},\n\taddress      = {Berkeley, CA, USA},\n\tpages        = {29:1--29:6},\n\tisbn         = {999-8888-77-6},\n\tacmid        = 1364414,\n\tarticleno    = 29,\n\tmyurl        = {http://dl.acm.org/citation.cfm?id=1364385.1364414},\n\tnumpages     = 6\n}\n@article{basseville1988detecting,\n\ttitle        = {Detecting changes in signals and systems--A survey},\n\tauthor       = {Mich{\\`e}le Basseville},\n\tyear         = 1988,\n\tjournal      = {Automatica},\n\tvolume       = 24,\n\tnumber       = 3,\n\tpages        = {309--326}\n}\n@article{bassiri2011interactional,\n\ttitle        = {Interactional feedback and the impact of attitude and motivation on noticing l2 form},\n\tauthor       = {Mohammad Amin Bassiri},\n\tyear         = 2011,\n\tjournal      = {English Language and Literature Studies},\n\tvolume       = 1,\n\tnumber       = 2,\n\tpages        = {61--73}\n}\n@inproceedings{bastani2016measuring,\n\ttitle        = {Measuring neural net robustness with constraints},\n\tauthor       = {Osbert Bastani and Yani Ioannou and Leonidas Lampropoulos and Dimitrios Vytiniotis and Aditya Nori and Antonio Criminisi},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = 
{2613--2621}\n}\n@inproceedings{bastani2017synthesizing,\n\ttitle        = {Synthesizing Program Input Grammars},\n\tauthor       = {Osbert Bastani and Rahul Sharma and Alex Aiken and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@inproceedings{bastani2018active,\n\ttitle        = {Active Learning of Points-To Specifications},\n\tauthor       = {Osbert Bastani and Rahul Sharma and Alex Aiken and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@article{batson2012twice,\n\ttitle        = {Twice-ramanujan sparsifiers},\n\tauthor       = {Batson, Joshua and Spielman, Daniel A and Srivastava, Nikhil},\n\tyear         = 2012,\n\tmonth        = may,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\taddress      = {New York, New York, USA},\n\tvolume       = 41,\n\tnumber       = 6,\n\tpages        = {1704--1721},\n\tdoi          = {10.1137/130949117},\n\tisbn         = 9781605585062,\n\tissn         = {0036-1445},\n\tabstract     = {We prove that every graph has a spectral sparsifier with a number of edges linear in its number of vertices. As linear-sized spectral sparsifiers of complete graphs are expanders, our sparsifiers of arbitrary graphs can be viewed as generalizations of expander graphs. 
In particular, we prove that for every \\$d>1\\$ and every undirected, weighted graph \\$G=(V,E,w)\\$ on \\$n\\$ vertices, there exists a weighted graph \\$H=(V,F,\\backslash tilde\\{w\\})\\$ with at most \\$\\backslash ceil\\{d(n-1)\\}\\$ edges such that for every \\$x \\backslash in \\backslash R\\^{}\\{V\\}\\$, $\\backslash$[ x\\^{}\\{T\\}L\\_\\{G\\}x $\\backslash$leq x\\^{}\\{T\\}L\\_\\{H\\}x $\\backslash$leq ($\\backslash$frac\\{d+1+2$\\backslash$sqrt\\{d\\}\\}\\{d+1-2$\\backslash$sqrt\\{d\\}\\})$\\backslash$cdot x\\^{}\\{T\\}L\\_\\{G\\}x $\\backslash$] where \\$L\\_\\{G\\}\\$ and \\$L\\_\\{H\\}\\$ are the Laplacian matrices of \\$G\\$ and \\$H\\$, respectively. Thus, \\$H\\$ approximates \\$G\\$ spectrally at least as well as a Ramanujan expander with \\$dn/2\\$ edges approximates the complete graph. We give an elementary deterministic polynomial time algorithm for constructing \\$H\\$.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {0808.0163},\n\teprint       = {0808.0163},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Batson, Spielman, Srivastava - 2009 - Twice-\\{R\\}amanujan Sparsifiers.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@inproceedings{bau2017network,\n\ttitle        = {Network dissection: Quantifying interpretability of deep visual representations},\n\tauthor       = {David Bau and Bolei Zhou and Aditya Khosla and Aude Oliva and Antonio Torralba},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {6541--6549}\n}\n@phdthesis{Bau96,\n\ttitle        = {Projection Algorithms and Monotone Operators},\n\tauthor       = {Bauschke, Heinz H.},\n\tyear         = 1996,\n\taddress      = {Simon Fraser University},\n\tisbn         = {0-612-16789-5},\n\tadvisor      = {Borwein, Jonathan M.}\n}\n@article{baum1970maximization,\n\ttitle        = {\n\t\tA Maximization Technique Occurring in the Statistical Analysis 
of\n\n\t\tProbabilistic Functions of {M}arkov Chains\n\t},\n\tauthor       = {Baum, Leonard E. and Petrie, Ted and Soules, George and Weiss, Norman},\n\tyear         = 1970,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 41,\n\tnumber       = 1,\n\tpages        = {164--171},\n\tissn         = {00034851},\n\tcopyright    = {Copyright {\\copyright} 1970 Institute of Mathematical Statistics},\n\tjstor_formatteddate = {Feb., 1970},\n\tlanguage     = {English},\n\tmyurl        = {http://www.jstor.org/stable/2239727}\n}\n@article{baum1990polynomial,\n\ttitle        = {A polynomial time algorithm that learns two hidden unit nets},\n\tauthor       = {Baum, Eric B},\n\tyear         = 1990,\n\tjournal      = {Neural Computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {510--522}\n}\n@article{baumann2019spoken,\n\ttitle        = {The {Spoken {W}ikipedia Corpus} collection: Harvesting, alignment and an application to hyperlistening},\n\tauthor       = {Timo Baumann and Arne K{\\"o}hn and Felix Hennig},\n\tyear         = 2019,\n\tjournal      = {Language Resources and Evaluation},\n\tvolume       = 53,\n\tnumber       = 2,\n\tpages        = {303--329}\n}\n@article{baxter2000model,\n\ttitle        = {A model of inductive bias learning},\n\tauthor       = {Baxter, Jonathan},\n\tyear         = 2000,\n\tjournal      = {Journal of artificial intelligence research}\n}\n@article{Bazanella08,\n\ttitle        = {Iterative minimization of $H_2$ control performance criteria},\n\tauthor       = {Alexandre S. Bazanella and Michel Gevers and Ljubisa Miskovic and Brian D.O. 
Anderson},\n\tyear         = 2008,\n\tjournal      = {Automatica},\n\tvolume       = 44,\n\tpages        = {2549--2559},\n\tdate-added   = {2016-04-02 19:00:08 +0000},\n\tdate-modified = {2016-04-02 19:01:05 +0000}\n}\n@inproceedings{baziotis2019seq,\n\ttitle        = {SEQ\\({}^{\\mbox{3}}\\): Differentiable Sequence-to-Sequence-to-Sequence Autoencoder for Unsupervised Abstractive Sentence Compression},\n\tauthor       = {Christos Baziotis and Ion Androutsopoulos and Ioannis Konstas and Alexandros Potamianos},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{bbc2020gcse,\n\ttitle        = {A-levels and GCSEs: How did the exam algorithm work?},\n\tauthor       = {BBC},\n\tyear         = 2020,\n\tjournal      = {The British Broadcasting Corporation},\n\turl          = {https://www.bbc.com/news/explainers-53807730}\n}\n@article{BBL97,\n\ttitle        = {The method of cyclic projections for closed convex sets in {H}ilbert space},\n\tauthor       = {Bauschke, Heinz H. and Borwein, Jonathan M. and Lewis, Adrian S.},\n\tyear         = 1997,\n\tjournal      = {Contemp. Math.},\n\tpublisher    = {Amer. Math. Soc.},\n\tvolume       = 204,\n\tpages        = {1--38},\n\tdoi          = {10.1090/conm/204/02620},\n\turl          = {http://dx.doi.org/10.1090/conm/204/02620},\n\tmrclass      = {49M45 (47H99 47N10 65F10 90C25)},\n\tmrnumber     = 1442992,\n\tmrreviewer   = {Alfredo N. 
Iusem}\n}\n@article{BCNN11,\n\ttitle        = {On the use of stochastic hessian information in optimization methods for machine learning},\n\tauthor       = {Byrd, Richard H and Chin, Gillian M and Neveitt, Will and Nocedal, Jorge},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 21,\n\tnumber       = 3,\n\tpages        = {977--995}\n}\n@article{bdl18,\n\ttitle        = {Complexity of Training {R}e{LU} Neural Network},\n\tauthor       = {Boob, Digvijay and Dey, Santanu S and Lan, Guanghui},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.10787}\n}\n@inproceedings{beal02ihmm,\n\ttitle        = {The infinite hidden {M}arkov model},\n\tauthor       = {M. Beal and Z. Ghahramani and C. Rasmussen},\n\tyear         = 2002,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {577--584}\n}\n@article{beame2017time,\n\ttitle        = {Time-Space Tradeoffs for Learning from Small Test Spaces: Learning Low Degree Polynomial Functions},\n\tauthor       = {Paul Beame and Shayan Oveis Gharan and Xin Yang},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{beberg2009folding,\n\ttitle        = {Folding@home: Lessons from eight years of volunteer distributed computing},\n\tauthor       = {Adam L Beberg and Daniel L Ensign and Guha Jayachandran and Siraj Khaliq and Vijay S Pande},\n\tyear         = 2009,\n\tbooktitle    = {2009 IEEE International Symposium on Parallel \\& Distributed Processing},\n\tpages        = {1--8}\n}\n@article{bechavod2017penalizing,\n\ttitle        = {Penalizing unfairness in binary classification},\n\tauthor       = {Yahav Bechavod and Katrina Ligett},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.00044}\n}\n@article{beck2011systematic,\n\ttitle        = {Systematic analysis of breast cancer morphology uncovers stromal features associated with survival},\n\tauthor       = {Andrew H 
Beck and Ankur R Sangoi and Samuel Leung and Robert J Marinelli and Torsten O Nielsen and Marc J Van De Vijver and Robert B West and Matt Van De Rijn and Daphne Koller},\n\tyear         = 2011,\n\tjournal      = {Science},\n\tvolume       = 3,\n\tnumber       = 108\n}\n@article{Beck2012smoothing,\n\ttitle        = {Smoothing and first order methods: A unified framework},\n\tauthor       = {Beck, Amir and Teboulle, Marc},\n\tyear         = 2012,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 22,\n\tnumber       = 2,\n\tpages        = {557--580}\n}\n@book{becker2010economics,\n\ttitle        = {The economics of discrimination},\n\tauthor       = {Gary S Becker},\n\tyear         = 2010,\n\tpublisher    = {University of Chicago press}\n}\n@article{beckmann2005tensorial,\n\ttitle        = {Tensorial extensions of independent component analysis for multisubject {FMRI} analysis},\n\tauthor       = {Christian F Beckmann and Stephen M Smith},\n\tyear         = 2005,\n\tjournal      = {Neuroimage},\n\tvolume       = 25,\n\tnumber       = 1,\n\tpages        = {294--311}\n}\n@inproceedings{beede2020human,\n\ttitle        = {A Human-Centered Evaluation of a Deep Learning System Deployed in Clinics for the Detection of Diabetic Retinopathy},\n\tauthor       = {Emma Beede and Elizabeth Baylor and Fred Hersch and Anna Iurchenko and Lauren Wilcox and Paisan Ruamviboonsuk and Laura M Vardoulakis},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)},\n\tpages        = {1--12}\n}\n@inproceedings{Beery_2018_ECCV,\n\ttitle        = {Recognition in Terra Incognita},\n\tauthor       = {Beery, Sara and Van Horn, Grant and Perona, Pietro},\n\tyear         = 2018,\n\tmonth        = {September},\n\tbooktitle    = {Proceedings of the European Conference on Computer Vision (ECCV)}\n}\n@inproceedings{beery2018recognition,\n\ttitle        = {Recognition in terra incognita},\n\tauthor       = {Sara Beery 
and Grant Van Horn and Pietro Perona},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {456--473}\n}\n@article{beery2020iwildcam,\n\ttitle        = {The iWildCam 2020 Competition Dataset},\n\tauthor       = {Sara Beery and Elijah Cole and Arvi Gjoka},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.10340}\n}\n@inproceedings{beetz2011robotic,\n\ttitle        = {Robotic roommates making pancakes},\n\tauthor       = {M. Beetz and U. Klank and I. Kresse and A. Maldonado and L. Mosenlechner and D. Pangercic and T. Ruhr and M. Tenorth},\n\tyear         = 2011,\n\tbooktitle    = {Humanoids}\n}\n@article{behboudian2020useful,\n\ttitle        = {Useful Policy Invariant Shaping from Arbitrary Advice},\n\tauthor       = {Paniz Behboudian and Yash Satsangi and Matthew E. Taylor and Anna Harutyunyan and Michael Bowling},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.01297}\n}\n@article{behzadan2017vulnerability,\n\ttitle        = {Vulnerability of Deep Reinforcement Learning to Policy Induction Attacks},\n\tauthor       = {Vahid Behzadan and Arslan Munir},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{bejnordi2017diagnostic,\n\ttitle        = {Diagnostic assessment of deep learning algorithms for detection of lymph node metastases in women with breast cancer},\n\tauthor       = {Babak Ehteshami Bejnordi and Mitko Veta and Paul Johannes Van Diest and Bram Van Ginneken and Nico Karssemeijer and Geert Litjens and Jeroen AWM Van Der Laak and Meyke Hermsen and Quirine F Manson and Maschenka Balkenhol and others},\n\tyear         = 2017,\n\tjournal      = {Jama},\n\tvolume       = 318,\n\tnumber       = 22,\n\tpages        = {2199--2210}\n}\n@inproceedings{belanger2015linear,\n\ttitle        = {A Linear Dynamical System Model for Text},\n\tauthor       = {Belanger, David and Kakade, Sham M.},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd 
International Conference on Machine Learning}\n}\n@inproceedings{belinkov2018synthetic,\n\ttitle        = {Synthetic and natural noise both break neural machine translation},\n\tauthor       = {Yonatan Belinkov and Yonatan Bisk},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{belinkov2019premise,\n\ttitle        = {Don't Take the Premise for Granted: Mitigating Artifacts in Natural Language Inference},\n\tauthor       = {Yonatan Belinkov and Adam Poliak and S. Shieber and Benjamin Van Durme and Alexander M. Rush},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{belkin2018understand,\n\ttitle        = {To understand deep learning we need to understand kernel learning},\n\tauthor       = {Mikhail Belkin and Siyuan Ma and Soumik Mandal},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{belkin2019reconciling,\n\ttitle        = {Reconciling modern machine-learning practice and the classical bias--variance trade-off},\n\tauthor       = {Mikhail Belkin and Daniel Hsu and Siyuan Ma and Soumik Mandal},\n\tyear         = 2019,\n\tjournal      = {Science},\n\tvolume       = 116,\n\tnumber       = 32\n}\n@article{belkin2019two,\n\ttitle        = {Two models of double descent for weak features},\n\tauthor       = {Mikhail Belkin and Daniel Hsu and Ji Xu},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@article{bellamy2020evaluating,\n\ttitle        = {Evaluating Progress on Machine Learning for Longitudinal Electronic Healthcare Data},\n\tauthor       = {David Bellamy and Leo Celi and Andrew L Beam},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.01149}\n}\n@article{bellemare2013arcade,\n\ttitle        = {The Arcade Learning Environment: An evaluation platform for general agents},\n\tauthor       = {M. G. Bellemare and Y. Naddaf and J. Veness and M. 
Bowling},\n\tyear         = 2013,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 47,\n\tpages        = {253--279}\n}\n@inproceedings{bellemare2014skip,\n\ttitle        = {Skip context tree switching},\n\tauthor       = {Bellemare, Marc and Veness, Joel and Talvitie, Erik},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1458--1466}\n}\n@inproceedings{bellemare2016unifying,\n\ttitle        = {Unifying count-based exploration and intrinsic motivation},\n\tauthor       = {Bellemare, Marc and Srinivasan, Sriram and Ostrovski, Georg and Schaul, Tom and Saxton, David and Munos, Remi},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1471--1479}\n}\n@article{bellemare2020autonomous,\n\ttitle        = {Autonomous navigation of stratospheric balloons using reinforcement learning},\n\tauthor       = {Marc G. Bellemare and Salvatore Candido and Pablo Samuel Castro and Jun Gong and Marlos C. Machado and Subhodeep Moitra and Sameera S. 
Ponda and Ziyu Wang},\n\tyear         = 2020,\n\tjournal      = {Nature},\n\tvolume       = 588\n}\n@book{bellman1957dynamic,\n\ttitle        = {Dynamic Programming},\n\tauthor       = {Bellman, Richard},\n\tyear         = 1957,\n\tpublisher    = {Princeton University Press, Princeton, NJ}\n}\n@article{bellot2020generalization,\n\ttitle        = {Generalization and invariances in the presence of unobserved confounding},\n\tauthor       = {Bellot, Alexis and van der Schaar, Mihaela},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.10653}\n}\n@article{belsky2015quantification,\n\ttitle        = {Quantification of biological aging in young adults},\n\tauthor       = {Daniel W Belsky and Avshalom Caspi and Renate Houts and Harvey J Cohen and David L Corcoran and Andrea Danese and HonaLee Harrington and Salomon Israel and Morgan E Levine and Jonathan D Schaefer and others},\n\tyear         = 2015,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tvolume       = 112,\n\tnumber       = 30\n}\n@inproceedings{beltagy2014probabilistic,\n\ttitle        = {Probabilistic Soft Logic for Semantic Textual Similarity},\n\tauthor       = {Islam Beltagy and Katherin Erk and Raymond J. 
Mooney},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{beltagy2017representing,\n\ttitle        = {Representing meaning with a combination of logical and distributional models},\n\tauthor       = {Islam Beltagy and Stephen Roller and Pengxiang Cheng and Katrin Erk and Raymond J Mooney},\n\tyear         = 2017,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 42\n}\n@article{belz08sumtime,\n\ttitle        = {Automatic generation of weather forecast texts using comprehensive probabilistic generation-space models},\n\tauthor       = {Anja Belz},\n\tyear         = 2008,\n\tjournal      = {Natural Language Engineering},\n\tvolume       = 14,\n\tnumber       = 4,\n\tpages        = {1--26}\n}\n@inproceedings{belz09sumtime,\n\ttitle        = {System building cost vs. output quality in data-to-text generation},\n\tauthor       = {Anja Belz and Eric Kow},\n\tyear         = 2009,\n\tbooktitle    = {European Workshop on Natural Language Generation},\n\tpages        = {16--24}\n}\n@article{ben2007analysis,\n\ttitle        = {Analysis of representations for domain adaptation},\n\tauthor       = {Ben-David, Shai and Blitzer, John and Crammer, Koby and Pereira, Fernando and others},\n\tyear         = 2007,\n\tjournal      = {Advances in neural information processing systems},\n\tpublisher    = {MIT; 1998},\n\tvolume       = 19,\n\tpages        = 137\n}\n@inproceedings{ben2010impossibility,\n\ttitle        = {Impossibility theorems for domain adaptation},\n\tauthor       = {Ben-David, Shai and Lu, Tyler and Luu, Teresa and P{\\'a}l, D{\\'a}vid},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {129--136}\n}\n@article{ben2010theory,\n\ttitle        = {A theory of learning from different domains},\n\tauthor       = {Ben-David, Shai and Blitzer, John and Crammer, Koby and Kulesza, Alex and Pereira, Fernando and Vaughan, Jennifer 
Wortman},\n\tyear         = 2010,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 79,\n\tnumber       = {1-2},\n\tpages        = {151--175}\n}\n@inproceedings{ben2012hardness,\n\ttitle        = {On the hardness of domain adaptation and the utility of unlabeled target samples},\n\tauthor       = {Ben-David, Shai and Urner, Ruth},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {139--153},\n\torganization = {Springer}\n}\n@techreport{BenczurKarger02,\n\ttitle        = {{Randomized Approximation Schemes for Cuts and Flows in Capacitated Graphs}},\n\tauthor       = {Bencz\\'{u}r, Andr\\'{a}s A. and Karger, David R.},\n\tyear         = 2002,\n\tmonth        = jul,\n\tbooktitle    = {arXiv preprint cs/0207078},\n\tpages        = {1--20},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {cs/0207078},\n\teprint       = {0207078},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/110a03446ced34ac8baaf80534e7433c45797196.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification},\n\tprimaryclass = {cs}\n}\n@inproceedings{BenczurKarger96,\n\ttitle        = {{Approximating s-t minimum cuts in $\\tilde{O}(n^2)$ time}},\n\tauthor       = {Bencz\\'{u}r, Andr\\'{a}s A. 
and Karger, David R.},\n\tyear         = 1996,\n\tbooktitle    = {Proceedings of the twenty-eighth annual ACM symposium on Theory of computing - STOC '96},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = {47--55},\n\tdoi          = {10.1145/237814.237827},\n\tisbn         = {0897917855},\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@article{bendall2014single,\n\ttitle        = {Single-cell trajectory detection uncovers progression and regulatory coordination in human {B} cell development},\n\tauthor       = {Sean C Bendall and Kara L Davis and El-ad David Amir and Michelle D Tadmor and Erin F Simonds and Tiffany J Chen and Daniel K Shenfeld and Garry P Nolan and Dana Pe'er},\n\tyear         = 2014,\n\tjournal      = {Cell},\n\tvolume       = 157,\n\tnumber       = 3,\n\tpages        = {714--725}\n}\n@inproceedings{bendavid2006analysis,\n\ttitle        = {Analysis of representations for domain adaptation},\n\tauthor       = {Shai Ben-David and John Blitzer and Koby Crammer and Fernando Pereira},\n\tyear         = 2006,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {137--144}\n}\n@misc{bender2017buildit,\n\ttitle        = {Build It, Break It: The Language Edition},\n\tauthor       = {Emily M. 
Bender and Hal {Daum{\\'e} III} and Allyson Ettinger and Harita Kannan and Sudha Rao and Ephraim Rothschild},\n\tyear         = 2017,\n\thowpublished = {\\url{https://bibinlp.umiacs.umd.edu/}}\n}\n@article{bender2018data,\n\ttitle        = {Data statements for natural language processing: Toward mitigating system bias and enabling better science},\n\tauthor       = {Emily M Bender and Batya Friedman},\n\tyear         = 2018,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 6,\n\tpages        = {587--604}\n}\n@article{benedek1991learnability,\n\ttitle        = {Learnability with respect to fixed distributions},\n\tauthor       = {Benedek, Gyora M and Itai, Alon},\n\tyear         = 1991,\n\tjournal      = {Theoretical Computer Science},\n\tpublisher    = {Elsevier},\n\tvolume       = 86,\n\tnumber       = 2,\n\tpages        = {377--389}\n}\n@inproceedings{bengio1991learning,\n\ttitle        = {Learning a synaptic learning rule},\n\tauthor       = {Y Bengio and S Bengio and J Cloutier},\n\tyear         = 1991,\n\tbooktitle    = {IJCNN-91-Seattle International Joint Conference on Neural Networks},\n\tvolume       = 2,\n\tpages        = {969--969}\n}\n@inproceedings{bengio1992optimization,\n\ttitle        = {On the optimization of a synaptic learning rule},\n\tauthor       = {Samy Bengio and Yoshua Bengio and Jocelyn Cloutier and Jan Gecsei},\n\tyear         = 1992,\n\tbooktitle    = {Preprints Conf. 
Optimality in Artificial and Biological Neural Networks},\n\tvolume       = 2\n}\n@article{bengio2003neural,\n\ttitle        = {A neural probabilistic language model},\n\tauthor       = {Yoshua Bengio and Rejean Ducharme and Pascal Vincent and Christian Jauvin},\n\tyear         = 2003,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 3,\n\tnumber       = {0},\n\tpages        = {1137--1155}\n}\n@incollection{bengio2006neural,\n\ttitle        = {Neural probabilistic language models},\n\tauthor       = {Bengio, Yoshua and Schwenk, Holger and Sen{\\'e}cal, Jean-S{\\'e}bastien and Morin, Fr{\\'e}deric and Gauvain, Jean-Luc},\n\tyear         = 2006,\n\tbooktitle    = {Innovations in Machine Learning}\n}\n@article{Bengio2009,\n\ttitle        = {Learning deep architectures for {AI}},\n\tauthor       = {Bengio, Yoshua},\n\tyear         = 2009,\n\tmonth        = jan,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tpublisher    = {Now Publishers Inc.},\n\taddress      = {Hanover, MA, USA},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {1--127},\n\tdoi          = {10.1561/2200000006},\n\tissn         = {1935-8237},\n\turl          = {http://dx.doi.org/10.1561/2200000006},\n\tnote         = {Also published as a book. Now Publishers, 2009.},\n\tacmid        = 1658424,\n\tfile         = {:..\\\\Citations\\\\deepsurvey.pdf:PDF},\n\tissue_date   = {January 2009},\n\tnumpages     = 127\n}\n@inproceedings{bengio2011expressive,\n\ttitle        = {On the expressive power of deep architectures},\n\tauthor       = {Bengio, Yoshua and Delalleau, Olivier},\n\tyear         = 2011,\n\tbooktitle    = {International conference on algorithmic learning theory},\n\tpages        = {18--36},\n\torganization = {Springer}\n}\n@article{bengio2012unsupervised,\n\ttitle        = {Unsupervised feature learning and deep learning: A review and new perspectives},\n\tauthor       = {Y. Bengio and A. Courville and P. 
Vincent},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.5538}\n}\n@article{Bengio2013,\n\ttitle        = {Representation Learning: A Review and New Perspectives},\n\tauthor       = {Yoshua Bengio and Aaron C. Courville and Pascal Vincent},\n\tyear         = 2013,\n\tjournal      = {IEEE Trans. Pattern Anal. Mach. Intell.},\n\tpublisher    = {IEEE},\n\tvolume       = 35,\n\tnumber       = 8,\n\tpages        = {1798--1828},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://doi.ieeecomputersociety.org/10.1109/TPAMI.2013.50},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.25}\n}\n@article{bengio2013estimating,\n\ttitle        = {Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation},\n\tauthor       = {Yoshua Bengio and Nicholas Leonard and Aaron Courville},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@article{bengio2013representation,\n\ttitle        = {Representation learning: A review and new perspectives},\n\tauthor       = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},\n\tyear         = 2013,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tpublisher    = {IEEE},\n\tvolume       = 35,\n\tnumber       = 8,\n\tpages        = {1798--1828},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://doi.ieeecomputersociety.org/10.1109/TPAMI.2013.50},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.25}\n}\n@inproceedings{bengio2015scheduled,\n\ttitle        = {Scheduled sampling for sequence prediction with recurrent neural networks},\n\tauthor       = {Samy Bengio and Oriol Vinyals and Navdeep Jaitly and Noam Shazeer},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1171--1179}\n}\n@article{bennett1962probability,\n\ttitle        = {Probability inequalities for the sum of independent random variables},\n\tauthor     
  = {George Bennett},\n\tyear         = 1962,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 57,\n\tnumber       = 297,\n\tpages        = {33--45}\n}\n@article{benson2014scalable,\n\ttitle        = {{Scalable Methods for Nonnegative Matrix Factorizations of Near-Separable Tall-and-Skinny Matrices}},\n\tauthor       = {Benson, Austin R and {Jason D. Lee} and Rajwa, Bartek and Gleich, David F.},\n\tyear         = 2014,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {1--9}\n}\n@article{bentaieb2017adversarial,\n\ttitle        = {Adversarial stain transfer for histopathology image analysis},\n\tauthor       = {A{\\\"\\i}cha BenTaieb and Ghassan Hamarneh},\n\tyear         = 2017,\n\tjournal      = {IEEE transactions on medical imaging},\n\tvolume       = 37,\n\tnumber       = 3,\n\tpages        = {792--802}\n}\n@article{bental2013robust,\n\ttitle        = {Robust Solutions of Optimization Problems Affected by Uncertain Probabilities},\n\tauthor       = {Aharon Ben-Tal and Dick den Hertog and Anja De Waegenaere and Bertrand Melenberg and Gijs Rennen},\n\tyear         = 2013,\n\tjournal      = {Management Science},\n\tvolume       = 59,\n\tpages        = {341--357}\n}\n@article{benzi2005numerical,\n\ttitle        = {Numerical solution of saddle point problems},\n\tauthor       = {Benzi, Michele and Golub, Gene H and Liesen, J{\\\"o}rg},\n\tyear         = 2005,\n\tjournal      = {Acta numerica},\n\tpublisher    = {Cambridge Univ Press},\n\tvolume       = 14,\n\tpages        = {1--137}\n}\n@article{benzi2006eigenvalues,\n\ttitle        = {On the eigenvalues of a class of saddle point matrices},\n\tauthor       = {Benzi, Michele and Simoncini, Valeria},\n\tyear         = 2006,\n\tjournal      = {Numerische Mathematik},\n\tpublisher    = {Springer},\n\tvolume       = 103,\n\tnumber       = 2,\n\tpages        = {173--196}\n}\n@inproceedings{berant2007boosting,\n\ttitle        = {Boosting 
Unsupervised Grammar Induction by Splitting Complex Sentences on Function Words},\n\tauthor       = {Jonathan Berant and Yaron Gross and Matan Mussel and Ben Sandbank and Eytan Ruppin and Shimon Edelman},\n\tyear         = 2007,\n\tbooktitle    = {Boston University Conference on Language Development}\n}\n@inproceedings{berant2008tracks,\n\ttitle        = {Tracks in the Mind: Differential Entrenchment of Common and Rare Liturgical and Everyday Multiword Phrases in Religious and Secular Hebrew Speakers},\n\tauthor       = {Jonathan Berant and Catherine Caldwell-Harris and Shimon Edelman},\n\tyear         = 2008,\n\tbooktitle    = {Annual Meeting of the Cognitive Science Society}\n}\n@inproceedings{berant2010global,\n\ttitle        = {Global Learning of Focused Entailment Graphs},\n\tauthor       = {Jonathan Berant and Ido Dagan and Jacob Goldberger},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{berant2011global,\n\ttitle        = {Global Learning of Typed Entailment Rules},\n\tauthor       = {Jonathan Berant and Ido Dagan and Jacob Goldberger},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{berant2012efficient,\n\ttitle        = {Efficient Tree-based Approximation for Entailment Graph learning},\n\tauthor       = {Jonathan Berant and Ido Dagan and Meni Adler and Jacob Goldberger},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{berant2012learning,\n\ttitle        = {Learning Entailment Relations by Global Graph Structure Optimization},\n\tauthor       = {Jonathan Berant and Ido Dagan and Jacob Goldberger},\n\tyear         = 2012,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 38,\n\tpages        = {73--111}\n}\n@inproceedings{berant2013freebase,\n\ttitle        = {Semantic Parsing on {F}reebase from Question-Answer Pairs},\n\tauthor       = {Jonathan 
Berant and Andrew Chou and Roy Frostig and Percy Liang},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{berant2014biological,\n\ttitle        = {Modeling Biological Processes for Reading Comprehension},\n\tauthor       = {Jonathan Berant and Vivek Srikumar and Pei-Chun Chen and Abby Vander Linden and Brittany Harding and Brad Huang and Peter Clark and Christopher D Manning},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{berant2014paraphrasing,\n\ttitle        = {Semantic Parsing via Paraphrasing},\n\tauthor       = {Jonathan Berant and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{berant2015agenda,\n\ttitle        = {Imitation Learning of Agenda-Based Semantic Parsers},\n\tauthor       = {Jonathan Berant and Percy Liang},\n\tyear         = 2015,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 3,\n\tpages        = {545--558}\n}\n@article{berant2015efficient,\n\ttitle        = {Efficient Global Learning of Entailment Graphs},\n\tauthor       = {Jonathan Berant and Noga Alon and Ido Dagan and Jacob Goldberger},\n\tyear         = 2015,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 41,\n\tpages        = {221--264}\n}\n@article{berant2015kb,\n\ttitle        = {Knowledge-based Textual Inference via Parse-Tree transformations},\n\tauthor       = {Roy Bar-Haim and Ido Dagan and Jonathan Berant},\n\tyear         = 2015,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 54,\n\tpages        = {1--57}\n}\n@inproceedings{berant2019explaining,\n\ttitle        = {Explaining Queries over Web Tables to Non-Experts},\n\tauthor       = {Jonathan Berant and Daniel Deutch and Amir Globerson and Tova Milo and Tomer Wolfson},\n\tyear         = 
2019,\n\tbooktitle    = {International Conference on Data Engineering (ICDE)}\n}\n@book{berger2013statistical,\n\ttitle        = {Statistical decision theory and Bayesian analysis},\n\tauthor       = {Berger, James O},\n\tyear         = 2013,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@misc{berger2017translated,\n\ttitle        = {Israel Arrests Palestinian Because Facebook Translated `Good Morning' to `Attack Them'},\n\tauthor       = {Yotam Berger},\n\tyear         = 2017,\n\thowpublished = {\\url{https://www.haaretz.com/israel-news/palestinian-arrested-over-mistranslated-good-morning-facebook-post-1.5459427}}\n}\n@inproceedings{bergsma2008discriminative,\n\ttitle        = {Discriminative learning of selectional preference from unlabeled text},\n\tauthor       = {Shane Bergsma and Dekang Lin and Randy Goebel},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {59--68}\n}\n@article{bergstra12hyper,\n\ttitle        = {Random Search for Hyper-Parameter Optimization},\n\tauthor       = {James Bergstra and Yoshua Bengio},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 13,\n\tpages        = {281--305}\n}\n@inproceedings{bergstra2010theano,\n\ttitle        = {Theano: a {CPU} and {GPU} Math Expression Compiler},\n\tauthor       = {James Bergstra and Olivier Breuleux and Fr{\\'{e}}d{\\'{e}}ric Bastien and Pascal Lamblin and Razvan Pascanu and Guillaume Desjardins and Joseph Turian and David Warde-Farley and Yoshua Bengio},\n\tyear         = 2010,\n\tbooktitle    = {Python for Scientific Computing Conference}\n}\n@book{berk2012criminal,\n\ttitle        = {Criminal justice forecasts of risk: A machine learning approach},\n\tauthor       = {Richard Berk},\n\tyear         = 2012,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{berk2017convex,\n\ttitle        = {A convex framework for fair regression},\n\tauthor     
  = {Richard Berk and Hoda Heidari and Shahin Jabbari and Matthew Joseph and Michael Kearns and Jamie Morgenstern and Seth Neel and Aaron Roth},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.02409}\n}\n@article{berkenkamp2017safe,\n\ttitle        = {Safe model-based reinforcement learning with stability guarantees},\n\tauthor       = {Berkenkamp, Felix and Turchetta, Matteo and Schoellig, Angela P and Krause, Andreas},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.08551},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{berkhin80second,\n\ttitle        = {Second-order asymptotically minimax estimates for the mean of a normal population},\n\tauthor       = {P. E. Berkhin and B. Ya. Levit},\n\tyear         = 1980,\n\tjournal      = {Problemy Peredachi Informatsii},\n\tvolume       = 16,\n\tpages        = {60--79}\n}\n@book{berlinet2011reproducing,\n\ttitle        = {Reproducing kernel Hilbert spaces in probability and statistics},\n\tauthor       = {Berlinet, Alain and Thomas-Agnan, Christine},\n\tyear         = 2011,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{berman2018ethical,\n\ttitle        = {Ethical considerations when using geospatial technologies for evidence generation},\n\tauthor       = {Gabrielle Berman and Sara de la Rosa and Tanya Accone},\n\tyear         = 2018,\n\tjournal      = {Innocenti Discussion Paper, UNICEF Office of Research}\n}\n@article{bernardo2003variational,\n\ttitle        = {The variational {B}ayesian {EM} algorithm for incomplete data: with application to scoring graphical model structures},\n\tauthor       = {Matthew J. Beal and Zoubin Ghahramani},\n\tyear         = 2003,\n\tjournal      = {Bayesian Statistics}\n}\n@article{bernardo79reference,\n\ttitle        = {Reference posterior distributions for {B}ayesian inference},\n\tauthor       = {J. M. 
Bernardo},\n\tyear         = 1979,\n\tjournal      = {Journal of the Royal Statistics Society: Series B (Statistical Methodology)},\n\tvolume       = 41,\n\tpages        = {113--147}\n}\n@article{berner2019dota,\n\ttitle        = {Dota 2 with large scale deep reinforcement learning},\n\tauthor       = {Berner, Christopher and Brockman, Greg and Chan, Brooke and Cheung, Vicki and Debiak, Przemyslaw and Dennison, Christy and Farhi, David and Fischer, Quirin and Hashme, Shariq and Hesse, Chris and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.06680}\n}\n@misc{bernhardsson2016fonts,\n\ttitle        = {Analyzing 50k fonts using deep neural networks},\n\tauthor       = {E. Bernhardsson},\n\tyear         = 2016\n}\n@article{bernheim1984rationalizable,\n\ttitle        = {Rationalizable strategic behavior},\n\tauthor       = {B Douglas Bernheim},\n\tyear         = 1984,\n\tjournal      = {Econometrica: Journal of the Econometric Society},\n\tpages        = {1007--1028}\n}\n@techreport{bernholt2006robust,\n\ttitle        = {Robust estimators are hard to compute},\n\tauthor       = {Thorsten Bernholt},\n\tyear         = 2006,\n\tinstitution  = {Universit\\\"{a}t Dortmund}\n}\n@article{Bernnett62,\n\ttitle        = {Probability Inequalities for the Sum of Independent Random Variables},\n\tauthor       = {Bennett, George},\n\tyear         = 1962,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {American Statistical Association},\n\tvolume       = 57,\n\tnumber       = 297,\n\tpages        = {pp. 33--45},\n\tissn         = {01621459},\n\turl          = {http://www.jstor.org/stable/2282438},\n\tcopyright    = {Copyright © 1962 American Statistical Association},\n\tabstract     = {This paper proves a number of inequalities which improve on existing upper limits to the probability distribution of the sum of independent random variables. 
The inequalities presented require knowledge only of the variance of the sum and the means and bounds of the component random variables. They are applicable when the number of component random variables is small and/or have different distributions. Figures show the improvement on existing inequalities.},\n\tjstor_articletype = {research-article},\n\tjstor_formatteddate = {Mar., 1962},\n\tlanguage     = {English}\n}\n@article{Bernstein,\n\tauthor       = {Bernstein, S.},\n\tyear         = 1927,\n\tjournal      = {Theory of Probability}\n}\n@article{bernstein1984systematic,\n\ttitle        = {A systematic approach to higher-order necessary conditions in optimization theory},\n\tauthor       = {Bernstein, Dennis S},\n\tyear         = 1984,\n\tjournal      = {SIAM journal on control and optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 22,\n\tnumber       = 2,\n\tpages        = {211--238}\n}\n@inproceedings{bernstein2010soylent,\n\ttitle        = {Soylent: a word processor with a crowd inside},\n\tauthor       = {Michael S Bernstein and Greg Little and Robert C Miller and Bj{\\\"o}rn Hartmann and Mark S Ackerman and David R Karger and David Crowell and Katrina Panovich},\n\tyear         = 2010,\n\tbooktitle    = {Symposium on User Interface Software and Technology},\n\tpages        = {313--322}\n}\n@inproceedings{bernstein2011crowds,\n\ttitle        = {Crowds in Two Seconds: Enabling Realtime Crowd-powered Interfaces},\n\tauthor       = {Michael S Bernstein and Joel Brandt and Robert C Miller and David R Karger},\n\tyear         = 2011,\n\tbooktitle    = {User Interface Software and Technology},\n\tpages        = {33--42}\n}\n@inproceedings{bernstein2016consistently,\n\ttitle        = {Consistently Estimating {M}arkov Chains with Noisy Aggregate Data},\n\tauthor       = {Garrett Bernstein and Daniel Sheldon},\n\tyear         = 2016,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = 
{1142--1150}\n}\n@book{berry1985bandit,\n\ttitle        = {Bandit Problems: Sequential Allocation of Experiments (Monographs on Statistics and Applied Probability)},\n\tauthor       = {Berry, Donald A and Fristedt, Bert},\n\tyear         = 1985,\n\tpublisher    = {Springer}\n}\n@article{berthelot2019mixmatch,\n\ttitle        = {MixMatch: A Holistic Approach to Semi-Supervised Learning},\n\tauthor       = {David Berthelot and Nicholas Carlini and Ian Goodfellow and Nicolas Papernot and Avital Oliver and Colin Raffel},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@article{berthelot2021adamatch,\n\ttitle        = {AdaMatch: A Unified Approach to Semi-Supervised Learning and Domain Adaptation},\n\tauthor       = {David Berthelot and Rebecca Roelofs and Kihyuk Sohn and Nicholas Carlini and Alex Kurakin},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.04732}\n}\n@inproceedings{berthet2013complexity,\n\ttitle        = {Complexity Theoretic Lower Bounds for Sparse Principal Component Detection},\n\tauthor       = {Quentin Berthet and Philippe Rigollet},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {1046--1066}\n}\n@book{bertot2004interactive,\n\ttitle        = {Interactive theorem proving and program development: {C}oq'{A}rt: the calculus of inductive constructions},\n\tauthor       = {Yves Bertot and Pierre Castéran},\n\tyear         = 2004,\n\tpublisher    = {Springer}\n}\n@article{bertrand2004emily,\n\ttitle        = {Are Emily and Greg more employable than Lakisha and Jamal? 
A field experiment on labor market discrimination},\n\tauthor       = {Marianne Bertrand and Sendhil Mullainathan},\n\tyear         = 2004,\n\tjournal      = {American economic review},\n\tvolume       = 94,\n\tnumber       = 4,\n\tpages        = {991--1013}\n}\n@article{bertsekas1976dynamic,\n\ttitle        = {Dynamic programming and stochastic control},\n\tauthor       = {Bertsekas, Dimitri P},\n\tyear         = 1976,\n\tpublisher    = {Academic Press, Inc.}\n}\n@book{bertsekas1995dynamic,\n\ttitle        = {Dynamic programming and optimal control},\n\tauthor       = {Bertsekas, Dimitri P},\n\tyear         = 1995,\n\tpublisher    = {Athena Scientific, Belmont, MA},\n\tvolume       = 1,\n\tnumber       = 2\n}\n@inproceedings{bertsekas1995neuro,\n\ttitle        = {Neuro-dynamic programming: an overview},\n\tauthor       = {Bertsekas, Dimitri P and Tsitsiklis, John N},\n\tyear         = 1995,\n\tbooktitle    = {Proceedings of the 34th IEEE Conference on Decision and Control},\n\tvolume       = 1,\n\tpages        = {560--564},\n\torganization = {IEEE}\n}\n@book{bertsekas2009convex,\n\ttitle        = {Convex Optimization Theory},\n\tauthor       = {Bertsekas, Dimitri P},\n\tyear         = 2009,\n\tpublisher    = {Athena Scientific, Belmont, MA}\n}\n@article{bertsekas2011approximate,\n\ttitle        = {Approximate policy iteration: a survey and some new methods},\n\tauthor       = {Bertsekas, Dimitri P.},\n\tyear         = 2011,\n\tjournal      = {J. 
Control Theory Appl.},\n\tvolume       = 9,\n\tnumber       = 3,\n\tpages        = {310--335},\n\tissn         = {1672-6340},\n\turl          = {https://doi.org/10.1007/s11768-011-1005-3},\n\tfjournal     = {Journal of Control Theory and Applications},\n\tmrclass      = {90C39 (68T05 90C15)},\n\tmrnumber     = 2833999,\n\tmrreviewer   = {Yukihiro Maruyama}\n}\n@book{bertsekas2013abstract,\n\ttitle        = {Abstract dynamic programming},\n\tauthor       = {Bertsekas, Dimitri P},\n\tyear         = 2013,\n\tpublisher    = {Athena Scientific, Belmont, MA},\n\tpages        = {viii+248},\n\tisbn         = {978-1-886529-42-7; 1-886529-42-6},\n\tmrclass      = {90-01 (90C39)},\n\tmrnumber     = 3204932\n}\n@book{bertsekas96neuro,\n\ttitle        = {Neuro-Dynamic Programming},\n\tauthor       = {Dimitri P. Bertsekas and John N. Tsitsiklis},\n\tyear         = 1996,\n\tmonth        = sep,\n\tpublisher    = {Athena Scientific},\n\tisbn         = {1-886529-10-8}\n}\n@book{bertsekas99nlp,\n\ttitle        = {Nonlinear Programming},\n\tauthor       = {D. Bertsekas},\n\tyear         = 1999,\n\tpublisher    = {Athena Scientific}\n}\n@article{bertsimas2011theory,\n\ttitle        = {Theory and applications of robust optimization},\n\tauthor       = {Dimitris Bertsimas and David B Brown and Constantine Caramanis},\n\tyear         = 2011,\n\tjournal      = {SIAM review},\n\tvolume       = 53,\n\tnumber       = 3,\n\tpages        = {464--501}\n}\n@article{bertsimas2018data,\n\ttitle        = {Data-driven robust optimization},\n\tauthor       = {Dimitris Bertsimas and Vishal Gupta and Nathan Kallus},\n\tyear         = 2018,\n\tjournal      = {Mathematical Programming Series A},\n\tvolume       = 167\n}\n@article{besag75pseudo,\n\ttitle        = {The analysis of non-lattice data},\n\tauthor       = {J. 
Besag},\n\tyear         = 1975,\n\tjournal      = {The Statistician},\n\tvolume       = 24,\n\tpages        = {179--195}\n}\n@article{bessi2016users,\n\ttitle        = {Users polarization on Facebook and Youtube},\n\tauthor       = {Alessandro Bessi and Fabiana Zollo and Michela Del Vicario and Michelangelo Puliga and Antonio Scala and Guido Caldarelli and Brian Uzzi and Walter Quattrociocchi},\n\tyear         = 2016,\n\tjournal      = {PloS one},\n\tvolume       = 11,\n\tnumber       = 8\n}\n@article{beutel2017data,\n\ttitle        = {Data decisions and theoretical implications when adversarially learning fair representations},\n\tauthor       = {Alex Beutel and Jilin Chen and Zhe Zhao and Ed H Chi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.00075}\n}\n@inproceedings{bg17,\n\ttitle        = {Globally Optimal Gradient Descent for a ConvNet with Gaussian Inputs},\n\tauthor       = {Alon Brutzkus and Amir Globerson},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {http://arxiv.org/abs/1702.07966}\n}\n@inproceedings{bgms18,\n\ttitle        = {{SGD} Learns Over-parameterized Networks that Provably Generalize on Linearly Separable Data},\n\tauthor       = {Alon Brutzkus and Amir Globerson and Eran Malach and Shai Shalev-Shwartz},\n\tyear         = 2018,\n\tbooktitle    = {ICLR},\n\turl          = {https://arxiv.org/abs/1710.10174}\n}\n@article{bhagat2013paraphrase,\n\ttitle        = {What Is a Paraphrase?},\n\tauthor       = {Rahul Bhagat and Eduard Hovy},\n\tyear         = 2013,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 39\n}\n@inproceedings{bhagavatula2015tabel,\n\ttitle        = {{TabEL}: entity linking in web tables},\n\tauthor       = {Chandra Sekhar Bhagavatula and Thanapon Noraset and Doug Downey},\n\tyear         = 2015,\n\tbooktitle    = {International Semantic Web Conference (ISWC)}\n}\n@article{bhandari2019global,\n\ttitle        = {Global 
optimality guarantees for policy gradient methods},\n\tauthor       = {Bhandari, Jalaj and Russo, Daniel},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.01786}\n}\n@article{bharadhwaj2020conservative,\n\ttitle        = {Conservative Safety Critics for Exploration},\n\tauthor       = {Bharadhwaj, Homanga and Kumar, Aviral and Rhinehart, Nicholas and Levine, Sergey and Shkurti, Florian and Garg, Animesh},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.14497}\n}\n@article{Bhaskar13,\n\ttitle        = {Atomic Norm Denoising with Applications to Line Spectral Estimation},\n\tauthor       = {Badri Narayan Bhaskar and Gongguo Tang and Benjamin Recht},\n\tyear         = 2013,\n\tjournal      = {{IEEE} Transactions on Signal Processing},\n\tbooktitle    = {Proceedings of the 49th Annual Allerton Conference},\n\tvolume       = 61,\n\tnumber       = 23,\n\tpages        = {5987--5999},\n\tdate-added   = {2016-04-05 05:55:59 +0000},\n\tdate-modified = {2016-04-05 05:55:59 +0000}\n}\n@inproceedings{bhaskara2013smoothed,\n\ttitle        = {Smoothed analysis of tensor decompositions},\n\tauthor       = {Bhaskara, Aditya and Charikar, Moses and Moitra, Ankur and Vijayaraghavan, Aravindan},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 46th Symposium on Theory of Computing Conference, STOC 2014, New York, NY, USA, May 31 - Jun 3},\n\tpages        = {594--603},\n\torganization = {ACM}\n}\n@inproceedings{bhaskara2014uniqueness,\n\ttitle        = {Uniqueness of tensor decompositions with applications to polynomial identifiability},\n\tauthor       = {Aditya Bhaskara and Moses Charikar and Aravindan Vijayaraghavan},\n\tyear         = 2014,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{BhaskaraCMV14,\n\ttitle        = {Smoothed Analysis of Tensor Decompositions},\n\tauthor       = {Aditya Bhaskara and Moses Charikar and Ankur Moitra and Aravindan Vijayaraghavan},\n\tyear         = 
2014,\n\tbooktitle    = {STOC}\n}\n@inproceedings{BhaskaraCV14,\n\ttitle        = {Proceedings of The 27th Conference on Learning Theory, COLT 2014, Barcelona, Spain, June 13-15, 2014},\n\tauthor       = {Aditya Bhaskara and Moses Charikar and Aravindan Vijayaraghavan},\n\tyear         = 2014,\n\tbooktitle    = {COLT},\n\tpublisher    = {JMLR.org},\n\tseries       = {JMLR Proceedings},\n\tvolume       = 35,\n\tpages        = {742--778},\n\teditor       = {Maria-Florina Balcan and Csaba Szepesv{\\'a}ri},\n\tee           = {http://jmlr.org/proceedings/papers/v35/bhaskara14a.html},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{bhat2012non,\n\ttitle        = {Non-parametric approximate dynamic programming via the kernel method},\n\tauthor       = {Bhat, Nikhil and Farias, Vivek and Moallemi, Ciamac C},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {386--394}\n}\n@book{Bhatia1997,\n\ttitle        = {{Matrix Analysis}},\n\tauthor       = {Bhatia, Rajendra},\n\tyear         = 1997,\n\tpublisher    = {Springer New York},\n\taddress      = {New York, NY},\n\tseries       = {Graduate Texts in Mathematics},\n\tvolume       = 169,\n\tdoi          = {10.1007/978-1-4612-0653-8},\n\tisbn         = {978-1-4612-6857-4},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Bhatia - 1997 - Matrix Analysis.pdf:pdf},\n\tmendeley-groups = {Books/Algebra}\n}\n@inproceedings{bhatia2015robust,\n\ttitle        = {Robust regression via hard thresholding},\n\tauthor       = {Kush Bhatia and Prateek Jain and Puroshottam Kar},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {721--729}\n}\n@inproceedings{bhatia2018synfix,\n\ttitle        = {Neuro-Symbolic Program Corrector for Introductory Programming Assignments},\n\tauthor       = {Sahil Bhatia and Pushmeet Kohli and Rishabh 
Singh},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)}\n}\n@inproceedings{bhattacharjee2020more,\n\ttitle        = {Is More Autonomy Always Better? Exploring Preferences of Users with Mobility Impairments in Robot-assisted Feeding},\n\tauthor       = {Tapomayukh Bhattacharjee and Ethan K Gordon and Rosario Scalise and Maria E Cabrera and Anat Caspi and Maya Cakmak and Siddhartha S Srinivasa},\n\tyear         = 2020,\n\tbooktitle    = {ACM/IEEE International Conference on Human Robot Interaction (HRI)},\n\tpages        = {181--190}\n}\n@inproceedings{bhattamishra-etal-2020-computational,\n\ttitle        = {On the Computational Power of Transformers and Its Implications in Sequence Modeling},\n\tauthor       = {Bhattamishra, Satwik  and Patel, Arkil  and Goyal, Navin},\n\tyear         = 2020,\n\tmonth        = nov,\n\tbooktitle    = {Proceedings of the 24th Conference on Computational Natural Language Learning},\n\tpublisher    = {Association for Computational Linguistics},\n\taddress      = {Online},\n\tpages        = {455--475},\n\tdoi          = {10.18653/v1/2020.conll-1.37},\n\turl          = {https://www.aclweb.org/anthology/2020.conll-1.37}\n}\n@inproceedings{bhl18,\n\ttitle        = {Gradient descent with identity initialization efficiently learns positive definite linear transformations},\n\tauthor       = {Bartlett, Peter and Helmbold, Dave and Long, Phil},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {520--529}\n}\n@article{BHNS14,\n\ttitle        = {A stochastic quasi-Newton method for large-scale optimization},\n\tauthor       = {Byrd, Richard H and Hansen, SL and Nocedal, Jorge and Singer, Yoram},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1401.7020}\n}\n@article{bhojanapalli2015dropping,\n\ttitle        = {Dropping convexity for faster semi-definite optimization},\n\tauthor       = {Bhojanapalli, Srinadh and 
Kyrillidis, Anastasios and Sanghavi, Sujay},\n\tyear         = 2015,\n\tjournal      = {arXiv:1509.03917},\n\tdate-modified = {2016-02-15 19:22:38 +0000}\n}\n@inproceedings{bhojanapalli2016global,\n\ttitle        = {Global optimality of local search for low rank matrix recovery},\n\tauthor       = {Bhojanapalli, Srinadh and Neyshabur, Behnam and Srebro, Nati},\n\tyear         = 2016,\n\tmonth        = may,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3873--3881},\n\tarchiveprefix = {arXiv},\n\teprint       = {1605.07221},\n\tprimaryclass = {stat.ML},\n\tkeywords     = {Statistics - Machine Learning, Computer Science - Learning, Mathematics - Optimization and Control},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2016arXiv160507221B},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@inproceedings{BhojanapalliJS2015-SVD,\n\ttitle        = {{Tighter Low-rank Approximation via Sampling the Leveraged Element}},\n\tauthor       = {Bhojanapalli, Srinadh and Jain, Prateek and Sanghavi, Sujay},\n\tyear         = 2015,\n\tbooktitle    = {SODA},\n\tpages        = {902--920}\n}\n@inproceedings{bibas2019new,\n\ttitle        = {A new look at an old problem: A universal learning approach to linear regression},\n\tauthor       = {Koby Bibas and Yaniv Fogel and Meir Feder},\n\tyear         = 2019,\n\tbooktitle    = {2019 IEEE International Symposium on Information Theory (ISIT)},\n\tpages        = {2304--2308}\n}\n@article{bickel06regularization,\n\ttitle        = {Regularization in Statistics},\n\tauthor       = {Peter Bickel and Bo Li},\n\tyear         = 2006,\n\tjournal      = {Sociedad de Estadística e Investigación Operativa Test},\n\tvolume       = 15,\n\tpages        = {271--344}\n}\n@article{bickel2009,\n\ttitle        = {Simultaneous analysis of Lasso and Dantzig selector},\n\tauthor       = {Bickel, Peter J. 
and Ritov, Ya'acov and Tsybakov, Alexandre B.},\n\tyear         = 2009,\n\tmonth        = {08},\n\tjournal      = {Ann. Statist.},\n\tpublisher    = {The Institute of Mathematical Statistics},\n\tvolume       = 37,\n\tnumber       = 4,\n\tpages        = {1705--1732},\n\tdoi          = {10.1214/08-AOS620},\n\turl          = {http://dx.doi.org/10.1214/08-AOS620},\n\tfjournal     = {The Annals of Statistics}\n}\n@inproceedings{bielinski2014preemptive,\n\ttitle        = {Preemptive genotyping for personalized medicine: design of the right drug, right dose, right time---using genomic data to individualize treatment protocol},\n\tauthor       = {Bielinski, Suzette J and Olson, Janet E and Pathak, Jyotishman and Weinshilboum, Richard M and Wang, Liewei and Lyke, Kelly J and Ryu, Euijung and Targonski, Paul V and Van Norstrand, Michael D and Hathcock, Matthew A and others},\n\tyear         = 2014,\n\tbooktitle    = {Mayo Clinic Proceedings},\n\tpages        = {25--33},\n\torganization = {Elsevier}\n}\n@inproceedings{bien2010cur,\n\ttitle        = {{CUR} from a sparse optimization viewpoint},\n\tauthor       = {Bien, Jacob and Xu, Ya and Mahoney, Michael W},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {217--225}\n}\n@techreport{BienstockIyengar2004,\n\ttitle        = {{Faster approximation algorithms for packing and covering problems}},\n\tauthor       = {Bienstock, D. 
and Iyengar, G.},\n\tyear         = 2004,\n\tnote         = {Preliminary version published in STOC '04}\n}\n@article{biggio2011label,\n\ttitle        = {Support Vector Machines Under Adversarial Label Noise},\n\tauthor       = {Battista Biggio and Blaine Nelson and Pavel Laskov},\n\tyear         = 2011,\n\tjournal      = {ACML},\n\tvolume       = 20,\n\tpages        = {97--112}\n}\n@inproceedings{biggio2012poisoning,\n\ttitle        = {Poisoning Attacks Against Support Vector Machines},\n\tauthor       = {Battista Biggio and Blaine Nelson and Pavel Laskov},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1467--1474}\n}\n@inproceedings{biggio2013clustering,\n\ttitle        = {Is data clustering in adversarial settings secure?},\n\tauthor       = {Battista Biggio and Ignazio Pillai and Samuel Rota Bul{\\`o} and Davide Ariu and Marcello Pelillo and Fabio Roli},\n\tyear         = 2013,\n\tbooktitle    = {Workshop on Artificial Intelligence and Security (AISec)}\n}\n@inproceedings{biggio2013evasion,\n\ttitle        = {Evasion attacks against machine learning at test time},\n\tauthor       = {Battista Biggio and Igino Corona and Davide Maiorca and Blaine Nelson and Nedim {\\v{S}}rndi{\\'c} and Pavel Laskov and Giorgio Giacinto and Fabio Roli},\n\tyear         = 2013,\n\tbooktitle    = {Joint European conference on machine learning and knowledge discovery in databases},\n\tpages        = {387--402}\n}\n@inproceedings{biggio2014linkage,\n\ttitle        = {Poisoning Complete-Linkage Hierarchical Clustering},\n\tauthor       = {Battista Biggio and Bul{\\`o}, Samuel Rota and Pillai, Ignazio and Mura, Michele and Mequanint, Eyasu Zemene and Pelillo, Marcello and Roli, Fabio},\n\tyear         = 2014,\n\tbooktitle    = {Workshop on Structural, Syntactic, and Statistical Pattern Recognition}\n}\n@inproceedings{biggio2014malware,\n\ttitle        = {Poisoning behavioral malware clustering},\n\tauthor       = 
{Battista Biggio and Konrad Rieck and Davide Ariu and Christian Wressnegger and Igino Corona and Giorgio Giacinto and Fabio Roli},\n\tyear         = 2014,\n\tbooktitle    = {Workshop on Artificial Intelligence and Security (AISec)}\n}\n@article{biggio2014security,\n\ttitle        = {Security evaluation of pattern classifiers under attack},\n\tauthor       = {Battista Biggio and Giorgio Fumera and Fabio Roli},\n\tyear         = 2014,\n\tjournal      = {IEEE Transactions on Knowledge and Data Engineering},\n\tvolume       = 26,\n\tnumber       = 4,\n\tpages        = {984--996}\n}\n@inproceedings{biggio2014securitysvm,\n\ttitle        = {Security evaluation of support vector machines in adversarial environments},\n\tauthor       = {Battista Biggio and Igino Corona and Blaine Nelson and Benjamin Rubinstein and Davide Maiorca and Giorgio Fumera and Giorgio Giacinto and Fabio Roli},\n\tyear         = 2014,\n\tbooktitle    = {Support Vector Machines Applications}\n}\n@article{biggio2018wild,\n\ttitle        = {Wild patterns: Ten years after the rise of adversarial machine learning},\n\tauthor       = {Battista Biggio and Fabio Roli},\n\tyear         = 2018,\n\tjournal      = {Pattern Recognition},\n\tvolume       = 84,\n\tpages        = {317--331}\n}\n@inproceedings{bigham2010vizwiz,\n\ttitle        = {{VizWiz}: nearly real-time answers to visual questions},\n\tauthor       = {Jeffrey P Bigham and Chandrika Jayant and Hanjie Ji and Greg Little and Andrew Miller and Robert C Miller and Robin Miller and Aubrey Tatarowicz and Brandyn White and Samual White and Tom Yeh},\n\tyear         = 2010,\n\tbooktitle    = {User Interface Software and Technology (UIST)},\n\tpages        = {333--342}\n}\n@inproceedings{biloki2019neural,\n\ttitle        = {Neural Program Planner for Structured Predictions},\n\tauthor       = {Jacob Biloki and Chen Liang and Ni Lao},\n\tyear         = 2019,\n\tbooktitle    = {Deep Reinforcement Learning Meets Structured Prediction Workshop at ICLR 
2019}\n}\n@article{bilu2012stable,\n\ttitle        = {Are stable instances easy?},\n\tauthor       = {Bilu, Yonatan and Linial, Nathan},\n\tyear         = 2012,\n\tjournal      = {Combinatorics, Probability and Computing},\n\tpublisher    = {Cambridge Univ Press},\n\tvolume       = 21,\n\tnumber       = {05},\n\tpages        = {643--660}\n}\n@inproceedings{bing2014web,\n\ttitle        = {Web page segmentation with structured prediction and its application in web page classification},\n\tauthor       = {Lidong Bing and Rui Guo and Wai Lam and Zheng-Yu Niu and Haifeng Wang},\n\tyear         = 2014,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@article{binns2018fairness,\n\ttitle        = {Fairness in Machine Learning: Lessons from Political Philosophy},\n\tauthor       = {Reuben Binns},\n\tyear         = 2018,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 81,\n\tpages        = {1--11}\n}\n@phdthesis{binsted1996jape,\n\ttitle        = {Machine Humour: An Implemented Model of Puns},\n\tauthor       = {Kim Binsted},\n\tyear         = 1996,\n\tschool       = {University of Edinburgh}\n}\n@book{bird2009nltk,\n\ttitle        = {Natural Language Processing with Python},\n\tauthor       = {Steven Bird and Edward Loper and Ewan Klein},\n\tyear         = 2009,\n\tpublisher    = {O’Reilly Media Inc.}\n}\n@article{bishop1995training,\n\ttitle        = {Training with noise is equivalent to Tikhonov regularization},\n\tauthor       = {Chris M Bishop},\n\tyear         = 1995,\n\tjournal      = {Neural computation},\n\tvolume       = 7,\n\tnumber       = 1,\n\tpages        = {108--116}\n}\n@book{bishop2002art,\n\ttitle        = {The art and science of computer security},\n\tauthor       = {Matthew A. 
Bishop},\n\tyear         = 2002,\n\tpublisher    = {Addison-Wesley Longman Publishing Co., Inc.}\n}\n@book{bishop2006pattern,\n\ttitle        = {Pattern recognition and machine learning},\n\tauthor       = {Bishop, Christopher M},\n\tyear         = 2006,\n\tmonth        = oct,\n\tday          = {01},\n\tpublisher    = {springer},\n\tisbn         = {978-0-387-31073-2},\n\tedition      = {1st ed. 2006. Corr. 2nd printing},\n\tabstract     = {\n\t\t{The dramatic growth in practical applications for machine learning\n\n\t\tover the last ten years has been accompanied by many important developments\n\n\t\tin the underlying algorithms and techniques. For example, Bayesian\n\n\t\tmethods have grown from a specialist niche to become mainstream,\n\n\t\twhile graphical models have emerged as a general framework for describing\n\n\t\tand applying probabilistic techniques. The practical applicability\n\n\t\tof Bayesian methods has been greatly enhanced by the development\n\n\t\tof a range of approximate inference algorithms such as variational\n\n\t\tBayes and expectation propagation, while new models based on kernels\n\n\t\thave had a significant impact on both algorithms and applications.\n\n\t\tThis completely new textbook reflects these recent developments while\n\n\t\tproviding a comprehensive introduction to the fields of pattern recognition\n\n\t\tand machine learning. It is aimed at advanced undergraduates or first-year\n\n\t\tPhD students, as well as researchers and practitioners. No previous\n\n\t\tknowledge of pattern recognition or machine learning concepts is\n\n\t\tassumed. Familiarity with multivariate calculus and basic linear\n\n\t\talgebra is required, and some experience in the use of probabilities\n\n\t\twould be helpful though not essential as the book includes a self-contained\n\n\t\tintroduction to basic probability theory. 
The book is suitable for\n\n\t\tcourses on machine learning, statistics, computer science, signal\n\n\t\tprocessing, computer vision, data mining, and bioinformatics. Extensive\n\n\t\tsupport is provided for course instructors, including more than 400\n\n\t\texercises, graded according to difficulty. Example solutions for\n\n\t\ta subset of the exercises are available from the book web site, while\n\n\t\tsolutions for the remainder can be obtained by instructors from the\n\n\t\tpublisher. The book is supported by a great deal of additional material,\n\n\t\tand the reader is encouraged to visit the book web site for the latest\n\n\t\tinformation. A forthcoming companion volume will deal with practical\n\n\t\taspects of pattern recognition and machine learning, and will include\n\n\t\tfree software implementations of the key algorithms along with example\n\n\t\tdata sets and demonstration programs. Christopher Bishop is Assistant\n\n\t\tDirector at Microsoft Research Cambridge, and also holds a Chair\n\n\t\tin Computer Science at the University of Edinburgh. He is a Fellow\n\n\t\tof Darwin College Cambridge, and was recently elected Fellow of the\n\n\t\tRoyal Academy of Engineering. 
The author's previous textbook \"Neural\n\n\t\tNetworks for Pattern Recognition\" has been widely adopted.}\n\t},\n\thowpublished = {Hardcover},\n\tkeywords     = {book, machine\\_learning, pattern\\_classification},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{bisk2016evaluating,\n\ttitle        = {Evaluating Induced {CCG} Parsers on Grounded Semantic Parsing},\n\tauthor       = {Yonatan Bisk and Siva Reddy and John Blitzer and Julia Hockenmaier and Mark Steedman},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{bisk2016natural,\n\ttitle        = {Natural language communication with robots},\n\tauthor       = {Yonatan Bisk and Deniz Yuret and Daniel Marcu},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{bisk2020experience,\n\ttitle        = {Experience Grounds Language},\n\tauthor       = {Yonatan Bisk and Ari Holtzman and Jesse Thomason and Jacob Andreas and Yoshua Bengio and Joyce Chai and Mirella Lapata and Angeliki Lazaridou and Jonathan May and Aleksandr Nisnevich and Nicolas Pinto and Joseph Turian},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{bitzer2010using,\n\ttitle        = {Using dimensionality reduction to exploit constraints in reinforcement learning},\n\tauthor       = {Sebastian Bitzer and Matthew Howard and Sethu Vijayakumar},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)},\n\tpages        = {3219--3225}\n}\n@inproceedings{biyik2018batch,\n\ttitle        = {Batch Active Preference-Based Learning of Reward Functions},\n\tauthor       = {Erdem Biyik and Dorsa Sadigh},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@article{BKS,\n\ttitle        = {Dictionary learning using sum-of-square 
hierarchy},\n\tauthor       = {Boaz Barak and Jonathan Kelner and David Steurer},\n\tyear         = 2014,\n\tjournal      = {arXiv:1407.1543}\n}\n@article{BKW,\n\ttitle        = {Noise-tolerant learning, the parity problem, and the statistical query model},\n\tauthor       = {Blum, Avrim and Kalai, Adam and Wasserman, Hal},\n\tyear         = 2003,\n\tmonth        = jul,\n\tjournal      = {J. ACM},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 50,\n\tnumber       = 4,\n\tpages        = {506--519},\n\tissn         = {0004-5411},\n\tissue_date   = {July 2003},\n\tnumpages     = 14\n}\n@inproceedings{BL08,\n\ttitle        = {Correlational spectral clustering},\n\tauthor       = {M. B. Blaschko and C. H. Lampert},\n\tyear         = 2008,\n\tbooktitle    = {CVPR}\n}\n@article{BL1,\n\ttitle        = {A correlated topic model of Science},\n\tauthor       = {D. Blei and J. Lafferty},\n\tyear         = 2007,\n\tjournal      = {Annals of Applied Statistics},\n\tpages        = {17--35}\n}\n@inproceedings{BL2,\n\ttitle        = {Dynamic topic models},\n\tauthor       = {D. Blei and J. Lafferty},\n\tyear         = 2006,\n\tbooktitle    = {ICML},\n\tpages        = {113--120}\n}\n@article{black1973pricing,\n\ttitle        = {The pricing of options and corporate liabilities},\n\tauthor       = {Black, Fischer and Scholes, Myron},\n\tyear         = 1973,\n\tjournal      = {Journal of Political Economy}\n}\n@inproceedings{blackard1999comparative,\n\ttitle        = {Comparative accuracies of artificial neural networks and discriminant analysis in predicting forest cover types from cartographic variables},\n\tauthor       = {Jock A. Blackard and Denis J. 
Dean},\n\tyear         = 1999,\n\tbooktitle    = {Computers and Electronics in Agriculture}\n}\n@book{blackburn05semantics,\n\ttitle        = {Representation and Inference for Natural Language: A First Course in Computational Semantics},\n\tauthor       = {Patrick Blackburn and Johan Bos},\n\tyear         = 2005,\n\tpublisher    = {CSLI Publishers}\n}\n@article{blackwell1968big,\n\ttitle        = {The big match},\n\tauthor       = {Blackwell, David and Ferguson, Tom S},\n\tyear         = 1968,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpublisher    = {JSTOR},\n\tvolume       = 39,\n\tnumber       = 1,\n\tpages        = {159--163}\n}\n@article{blackwell57identifiable,\n\ttitle        = {On the Identifiability Problem for Functions of Finite {M}arkov Chains},\n\tauthor       = {David Blackwell and Lambert Koopmans},\n\tyear         = 1957,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 28,\n\tpages        = {1011--1015}\n}\n@article{blackwell73urn,\n\ttitle        = {{F}erguson Distributions via {P}\\'olya Urn Schemes},\n\tauthor       = {D. Blackwell and J. B. 
MacQueen},\n\tyear         = 1973,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 1,\n\tpages        = {353--355}\n}\n@article{blanc2019implicit,\n\ttitle        = {Implicit regularization for deep neural networks driven by an Ornstein-Uhlenbeck like process},\n\tauthor       = {Blanc, Guy and Gupta, Neha and Valiant, Gregory and Valiant, Paul},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.09080}\n}\n@inproceedings{blanchard2011generalizing,\n\ttitle        = {Generalizing from Several Related Classification Tasks to a New Unlabeled Sample},\n\tauthor       = {Blanchard, Gilles and Lee, Gyemin and Scott, Clayton},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 24,\n\tpages        = {2178--2186},\n\turl          = {https://proceedings.neurips.cc/paper/2011/file/b571ecea16a9824023ee1af16897a582-Paper.pdf}\n}\n@article{blanchet2019quantifying,\n\ttitle        = {Quantifying distributional model risk via optimal transport},\n\tauthor       = {Jose Blanchet and Karthyek Murthy},\n\tyear         = 2019,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 44,\n\tnumber       = 2,\n\tpages        = {565--600}\n}\n@article{Blaschke04cubica,\n\ttitle        = {CuBICA: Independent Component Analysis by Simultaneous Third- and Fourth-Order Cumulant Diagonalization},\n\tauthor       = {Tobias Blaschke and Laurenz Wiskott},\n\tyear         = 2004,\n\tjournal      = {IEEE TRANSACTIONS ON SIGNAL PROCESSING},\n\tvolume       = 52,\n\tnumber       = 5,\n\tpages        = {1250--1256}\n}\n@article{blei03lda,\n\ttitle        = {Latent {D}irichlet Allocation},\n\tauthor       = {D. Blei and Andrew Ng and M. I. 
Jordan},\n\tyear         = 2003,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 3,\n\tpages        = {993--1022}\n}\n@inproceedings{blei04nested,\n\ttitle        = {Hierarchical topic models and the nested {C}hinese restaurant process},\n\tauthor       = {D. Blei and T. Griffiths and M. I. Jordan and J. Tenenbaum},\n\tyear         = 2004,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{blei05variational,\n\ttitle        = {Variational Inference for {D}irichlet Process Mixtures},\n\tauthor       = {D. Blei and M. I. Jordan},\n\tyear         = 2005,\n\tjournal      = {Bayesian Analysis},\n\tvolume       = 1,\n\tpages        = {121--144}\n}\n@article{blei2003latent,\n\ttitle        = {Latent Dirichlet allocation},\n\tauthor       = {David M. Blei and Andrew Ng and Michael Jordan},\n\tyear         = 2003,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 3,\n\tpages        = {993--1022}\n}\n@inproceedings{blei2006correlated,\n\ttitle        = {Correlated topic models},\n\tauthor       = {Blei, D. and Lafferty, J.},\n\tyear         = 2006,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@inproceedings{blei2006dynamic,\n\ttitle        = {Dynamic Topic Models},\n\tauthor       = {Blei, David M. 
and Lafferty, John D.},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 23rd International Conference on Machine Learning}\n}\n@article{blei2012probabilistic,\n\ttitle        = {Probabilistic topic models},\n\tauthor       = {Blei, David M.},\n\tyear         = 2012,\n\tmonth        = apr,\n\tjournal      = {Communication of the Association for Computing Machinery},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 55,\n\tnumber       = 4,\n\tpages        = {77--84},\n\tdoi          = {10.1145/2133806.2133826},\n\tissn         = {0001-0782},\n\turl          = {http://doi.acm.org/10.1145/2133806.2133826},\n\tissue_date   = {April 2012},\n\tnumpages     = 8,\n\tacmid        = 2133826\n}\n@book{blg13,\n\ttitle        = {Concentration inequalities: A nonasymptotic theory of independence},\n\tauthor       = {Boucheron, St{\\'e}phane and Lugosi, G{\\'a}bor and Massart, Pascal},\n\tyear         = 2013,\n\tpublisher    = {Oxford university press}\n}\n@inproceedings{blitzer2006domain,\n\ttitle        = {Domain Adaptation with Structural Correspondence Learning},\n\tauthor       = {John Blitzer and Ryan McDonald and Fernando Pereira},\n\tyear         = 2006,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{blitzer2007adaptation,\n\ttitle        = {Domain adaptation of natural language processing systems},\n\tauthor       = {John Blitzer and Fernando Pereira},\n\tyear         = 2007,\n\tjournal      = {University of Pennsylvania}\n}\n@inproceedings{blitzer2007biographies,\n\ttitle        = {Biographies, bollywood, boom-boxes and blenders: Domain adaptation for sentiment classification},\n\tauthor       = {John Blitzer and Mark Dredze and Fernando Pereira},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 45th annual meeting of the association of computational linguistics},\n\tpages        = {440--447}\n}\n@inproceedings{blitzer2008learning,\n\ttitle        = {Learning bounds for 
domain adaptation},\n\tauthor       = {Blitzer, John and Crammer, Koby and Kulesza, Alex and Pereira, Fernando and Wortman, Jennifer},\n\tyear         = 2008,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {129--136}\n}\n@inproceedings{blitzer2011domain,\n\ttitle        = {Domain adaptation with coupled subspaces},\n\tauthor       = {John Blitzer and Sham Kakade and Dean P. Foster},\n\tyear         = 2011,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {173--181}\n}\n@inproceedings{blodgett2016,\n\ttitle        = {Demographic Dialectal Variation in Social Media: A Case Study of {A}frican-{A}merican {E}nglish},\n\tauthor       = {Su Lin Blodgett and Lisa Green and Brendan O'Connor},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1119--1130}\n}\n@article{blodgett2017racial,\n\ttitle        = {Racial Disparity in Natural Language Processing: A Case Study of Social Media {A}frican-{A}merican {E}nglish},\n\tauthor       = {Su Lin Blodgett and Brendan O'Connor},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.00061}\n}\n@article{blondel2000survey,\n\ttitle        = {A survey of computational complexity results in systems and control},\n\tauthor       = {Blondel, Vincent D and Tsitsiklis, John N},\n\tyear         = 2000,\n\tjournal      = {Automatica},\n\tpublisher    = {Elsevier},\n\tvolume       = 36,\n\tnumber       = 9,\n\tpages        = {1249--1274}\n}\n@article{BLS2015,\n\ttitle        = {A geometric alternative to {N}esterov's accelerated gradient descent},\n\tauthor       = {Bubeck, S{\'e}bastien and Lee, Yin Tat and Singh, Mohit},\n\tyear         = 2015,\n\tmonth        = jun,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1506.08187},\n\turl          = {http://arxiv.org/abs/1506.08187}\n}\n@inproceedings{blukis2018following,\n\ttitle        = {Following High-level Navigation 
Instructions on a Simulated Quadcopter with Imitation Learning},\n\tauthor       = {Valts Blukis and Nataly Brukhim and Andrew Bennett and Ross A. Knepper and Yoav Artzi},\n\tyear         = 2018,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{blum1954multidimensional,\n\ttitle        = {Multidimensional Stochastic Approximation Methods},\n\tauthor       = {Julius R. Blum},\n\tyear         = 1954,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 25,\n\tnumber       = 4,\n\tpages        = {737--744}\n}\n@inproceedings{blum1989training,\n\ttitle        = {Training a 3-node neural network is {NP}-complete},\n\tauthor       = {Blum, Avrim and Rivest, Ronald L},\n\tyear         = 1989,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpublisher    = {Springer},\n\tpages        = {494--501}\n}\n@article{blum1995coloring,\n\ttitle        = {Coloring random and semi-random {k}-colorable graphs},\n\tauthor       = {Avrim Blum and Joel Spencer},\n\tyear         = 1995,\n\tjournal      = {Journal of Algorithms},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {204--234}\n}\n@inproceedings{blum1998combining,\n\ttitle        = {Combining labeled and unlabeled data with co-training},\n\tauthor       = {Blum, Avrim and Mitchell, Tom},\n\tyear         = 1998,\n\tbooktitle    = {Proceedings of the eleventh annual conference on Computational learning theory},\n\tpages        = {92--100}\n}\n@article{blum2007external,\n\ttitle        = {From external to internal regret},\n\tauthor       = {Blum, Avrim and Mansour, Yishay},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 8,\n\tnumber       = {Jun},\n\tpages        = {1307--1324}\n}\n@article{blum2014learning,\n\ttitle        = {Learning Valuation Distributions from Partial Observation},\n\tauthor       = {Avrim Blum and Yishay Mansour and Jamie Morgenstern},\n\tyear         = 2014,\n\tjournal      = 
{arXiv}\n}\n@inproceedings{blum98cotraining,\n\ttitle        = {Combining Labeled and Unlabeled Data with Co-training},\n\tauthor       = {Avrim Blum and Tom Mitchell},\n\tyear         = 1998,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{blumenstock2015poverty,\n\ttitle        = {Predicting poverty and wealth from mobile phone metadata},\n\tauthor       = {J. Blumenstock and G. Cadamuro and R. On},\n\tyear         = 2015,\n\tjournal      = {Science},\n\tvolume       = 350\n}\n@inproceedings{blunsom09note,\n\ttitle        = {A note on the implementation of Hierarchical {D}irichlet Processes},\n\tauthor       = {Phil Blunsom and Trevor Cohn and Sharon Goldwater and Mark Johnson},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{blunsom09synchronous,\n\ttitle        = {{B}ayesian Synchronous Grammar Induction},\n\tauthor       = {Phil Blunsom and Trevor Cohn and Miles Osborne},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{BLV97,\n\ttitle        = {Vandermonde factorization of a {H}ankel matrix},\n\tauthor       = {D. L. Boley and F. T. Luk and D. Vandevoorde},\n\tyear         = 1997,\n\tbooktitle    = {Scientific Computing}\n}\n@inproceedings{BM12,\n\ttitle        = {Spectral Learning of General Weighted Automata via Constrained Matrix Completion},\n\tauthor       = {B. Balle and M. Mohri},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems 25}\n}\n@book{BMP90,\n\ttitle        = {Adaptive algorithms and stochastic approximations},\n\tauthor       = {Benveniste, Albert and M{\\'e}tivier, Michel and Priouret, Pierre},\n\tyear         = 2012,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume       = 22\n}\n@article{BoazEtal:DictionaryLearning,\n\ttitle        = {{Dictionary Learning via the Sum-of-Squares Method}},\n\tauthor       = {B. Barak and J. 
Kelner and D. Steurer},\n\tyear         = 2014,\n\tjournal      = {Unpublished manuscript}\n}\n@article{bobkov1997isoperimetric,\n\ttitle        = {An isoperimetric inequality on the discrete cube, and an elementary proof of the isoperimetric inequality in Gauss space},\n\tauthor       = {Bobkov, Sergey G and others},\n\tyear         = 1997,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 25,\n\tnumber       = 1,\n\tpages        = {206--214}\n}\n@article{bobkov1997poincare,\n\ttitle        = {{P}oincar{\\'e}'s inequalities and {T}alagrand's concentration phenomenon for the exponential distribution},\n\tauthor       = {S. Bobkov and M. Ledoux},\n\tyear         = 1997,\n\tjournal      = {Probability Theory and Related Fields},\n\tvolume       = 107,\n\tnumber       = 3,\n\tpages        = {383--400}\n}\n@phdthesis{bobrow1964student,\n\ttitle        = {Natural language input for a computer problem solving system},\n\tauthor       = {Daniel G Bobrow},\n\tyear         = 1964,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@article{bobrow1977gus,\n\ttitle        = {GUS, a frame-driven dialog system},\n\tauthor       = {Daniel G Bobrow and Ronald M Kaplan and Martin Kay and Donald A Norman and Henry Thompson and Terry Winograd},\n\tyear         = 1977,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 8,\n\tnumber       = 2,\n\tpages        = {155--173}\n}\n@inproceedings{bobu2018adapting,\n\ttitle        = {Adapting to Continuously Shifting Domains},\n\tauthor       = {Andreea Bobu and Eric Tzeng and Judy Hoffman and Trevor Darrell},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@article{Boct2015variable,\n\ttitle        = {A variable smoothing algorithm for solving convex optimization problems},\n\tauthor       = {Bo{\\c{t}}, Radu Ioan and Hendrich, Christopher},\n\tyear         = 2015,\n\tjournal    
  = {TOP},\n\tpublisher    = {Springer},\n\tvolume       = 23,\n\tnumber       = 1,\n\tpages        = {124--150}\n}\n@inproceedings{bodenstab2011beam,\n\ttitle        = {Beam-width prediction for efficient context-free parsing},\n\tauthor       = {Nathan Bodenstab and Aaron Dunlop and Keith Hall and Brian Roark},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {440--449}\n}\n@book{boeottcher2005spectral,\n\ttitle        = {Spectral properties of banded Toeplitz matrices},\n\tauthor       = {B{\"o}ttcher, Albrecht and Grudsky, Sergei M},\n\tyear         = 2005,\n\tpublisher    = {SIAM},\n\tvolume       = 96\n}\n@inproceedings{bogin2018emergence,\n\ttitle        = {Emergence of Communication in an Interactive World with Consistent Speakers},\n\tauthor       = {Ben Bogin and Mor Geva and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {Emergent Communication Workshop@NIPS}\n}\n@inproceedings{bogin2019global,\n\ttitle        = {Global Reasoning over Database Structures for Text-to-{SQL} Parsing},\n\tauthor       = {Ben Bogin and Matt Gardner and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{bogin2019representing,\n\ttitle        = {Representing Schema Structure with Graph Neural Networks for Text-to-{SQL} Parsing},\n\tauthor       = {Ben Bogin and Matt Gardner and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{bogin2020latent,\n\ttitle        = {Latent Compositional Representations Improve Systematic Generalization in Grounded Question Answering},\n\tauthor       = {Ben Bogin and Sanjay Subramanian and Matt Gardner and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.00266}\n}\n@article{bohacek1996art,\n\ttitle        = {The art and practice of structure-based drug design: a molecular 
modeling perspective},\n\tauthor       = {Regine S Bohacek and Colin McMartin and Wayne C Guida},\n\tyear         = 1996,\n\tjournal      = {Medicinal research reviews},\n\tvolume       = 16,\n\tnumber       = 1,\n\tpages        = {3--50}\n}\n@misc{boisvert2018gym,\n\ttitle        = {Gym-{M}iniworld environment for OpenAI Gym},\n\tauthor       = {Maxime Chevalier-Boisvert},\n\tyear         = 2018,\n\thowpublished = {\\url{https://github.com/maximecb/gym-miniworld}}\n}\n@article{bojanowski2017enriching,\n\ttitle        = {Enriching word vectors with subword information},\n\tauthor       = {Piotr Bojanowski and Edouard Grave and Armand Joulin and Tomas Mikolov},\n\tyear         = 2017,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 5,\n\tpages        = {135--146}\n}\n@inproceedings{bojar2017findings,\n\ttitle        = {Findings of the 2017 conference on machine translation (wmt17)},\n\tauthor       = {Ond{\\v{r}}ej Bojar and Rajen Chatterjee and Christian Federmann and Yvette Graham and Barry Haddow and Shujian Huang and Matthias Huck and Philipp Koehn and Qun Liu and Varvara Logacheva and others},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the Second Conference on Machine Translation},\n\tpages        = {169--214}\n}\n@article{bojinov2020avoid,\n\ttitle        = {Avoid the Pitfalls of {A/B} Testing Make sure your experiments recognize customers' varying needs},\n\tauthor       = {Iavor Bojinov and Guillaume Saint-Jacques and Martin Tingley},\n\tyear         = 2020,\n\tjournal      = {Harvard Business Review},\n\tvolume       = 98,\n\tnumber       = 2,\n\tpages        = {48--53}\n}\n@inproceedings{bollacker2008freebase,\n\ttitle        = {{F}reebase: a collaboratively created graph database for structuring human knowledge},\n\tauthor       = {Kurt Bollacker and Colin Evans and Praveen Paritosh and Tim Sturge and Jamie Taylor},\n\tyear         = 2008,\n\tbooktitle    = {International 
Conference on Management of Data (SIGMOD)},\n\tpages        = {1247--1250}\n}\n@inproceedings{bollini2011bakebot,\n\ttitle        = {Bakebot: Baking cookies with the {PR2}},\n\tauthor       = {M. Bollini and J. Barry and D. Rus},\n\tyear         = 2011,\n\tbooktitle    = {The PR2 Workshop, IROS}\n}\n@inproceedings{bollini2012interpreting,\n\ttitle        = {Interpreting and executing recipes with a cooking robot},\n\tauthor       = {M. Bollini and S. Tellex and T. Thompson and N. Roy and D. Rus},\n\tyear         = 2012,\n\tbooktitle    = {International Symposium on Experimental Robotics (ISER)}\n}\n@inproceedings{bolukbasi2016man,\n\ttitle        = {Man is to computer programmer as woman is to homemaker? {Debiasing} word embeddings},\n\tauthor       = {Tolga Bolukbasi and Kai-Wei Chang and James Y Zou and Venkatesh Saligrama and Adam T Kalai},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {4349--4357}\n}\n@article{bommasani2021opportunities,\n\ttitle        = {On the Opportunities and Risks of Foundation Models},\n\tauthor       = {Rishi Bommasani and Drew A. Hudson and Ehsan Adeli and Russ Altman and Simran Arora and Sydney von Arx and Michael S. Bernstein and Jeannette Bohg and Antoine Bosselut and Emma Brunskill and Erik Brynjolfsson and Shyamal Buch and Dallas Card and Rodrigo Castellon and Niladri Chatterji and Annie Chen and Kathleen Creel and Jared Quincy Davis and Dora Demszky and Chris Donahue and Moussa Doumbouya and Esin Durmus and Stefano Ermon and John Etchemendy and Kawin Ethayarajh and Li Fei-Fei and Chelsea Finn and Trevor Gale and Lauren Gillespie and Karan Goel and Noah Goodman and Shelby Grossman and Neel Guha and Tatsunori Hashimoto and Peter Henderson and John Hewitt and Daniel E. 
Ho and Jenny Hong and Kyle Hsu and Jing Huang and Thomas Icard and Saahil Jain and Dan Jurafsky and Pratyusha Kalluri and Siddharth Karamcheti and Geoff Keeling and Fereshte Khani and Omar Khattab and Pang Wei Kohd and Mark Krass and Ranjay Krishna and Rohith Kuditipudi and Ananya Kumar and Faisal Ladhak and Mina Lee and Tony Lee and Jure Leskovec and Isabelle Levent and Xiang Lisa Li and Xuechen Li and Tengyu Ma and Ali Malik and Christopher D. Manning and Suvir Mirchandani and Eric Mitchell and Zanele Munyikwa and Suraj Nair and Avanika Narayan and Deepak Narayanan and Ben Newman and Allen Nie and Juan Carlos Niebles and Hamed Nilforoshan and Julian Nyarko and Giray Ogut and Laurel Orr and Isabel Papadimitriou and Joon Sung Park and Chris Piech and Eva Portelance and Christopher Potts and Aditi Raghunathan and Rob Reich and Hongyu Ren and Frieda Rong and Yusuf Roohani and Camilo Ruiz and Jack Ryan and Christopher Ré and Dorsa Sadigh and Shiori Sagawa and Keshav Santhanam and Andy Shih and Krishnan Srinivasan and Alex Tamkin and Rohan Taori and Armin W. Thomas and Florian Tramèr and Rose E. 
Wang and William Wang and Bohan Wu and Jiajun Wu and Yuhuai Wu and Sang Michael Xie and Michihiro Yasunaga and Jiaxuan You and Matei Zaharia and Michael Zhang and Tianyi Zhang and Xikun Zhang and Yuhui Zhang and Lucia Zheng and Kaitlyn Zhou and Percy Liang},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2108.07258}\n}\n@article{bondy1977graph,\n\ttitle        = {Graph reconstruction---a survey},\n\tauthor       = {Bondy, J Adrian and Hemminger, Robert L},\n\tyear         = 1977,\n\tjournal      = {Journal of Graph Theory},\n\tvolume       = 1,\n\tnumber       = 3,\n\tpages        = {227--268},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.09.29}\n}\n@book{books:understanding,\n\ttitle        = {Understanding Machine Learning - From Theory to Algorithms.},\n\tauthor       = {Shalev-Shwartz, Shai and Ben-David, Shai},\n\tyear         = 2014,\n\tpublisher    = {Cambridge University Press},\n\tpages        = {I-XVI, 1--397},\n\tisbn         = {978-1-10-705713-5},\n\tadded-at     = {2020-06-05T00:00:00.000+0200},\n\tbiburl       = {https://www.bibsonomy.org/bibtex/293329d1cd5964dd826bba3100cd17fe4/dblp},\n\tee           = {http://www.cambridge.org/de/academic/subjects/computer-science/pattern-recognition-and-machine-learning/understanding-machine-learning-theory-algorithms},\n\tinterhash    = {125d708c7b440a3cfeb6146e83ab5de3},\n\tintrahash    = {93329d1cd5964dd826bba3100cd17fe4},\n\tkeywords     = {dblp},\n\ttimestamp    = {2020-06-06T11:43:42.000+0200}\n}\n@inproceedings{bordes2012joint,\n\ttitle        = {Joint learning of words and meaning representations for open-text semantic parsing},\n\tauthor       = {Bordes, Antoine and Glorot, Xavier and Weston, Jason and Bengio, Yoshua},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics}\n}\n@inproceedings{bordes2013translating,\n\ttitle        = {Translating embeddings for modeling multi-relational data},\n\tauthor       = {Antoine Bordes 
and Nicolas Usunier and Alberto Garcia-Duran and Jason Weston and Oksana Yakhnenko},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2787--2795}\n}\n@inproceedings{bordes2014qa,\n\ttitle        = {Question Answering with Subgraph Embeddings},\n\tauthor       = {Antoine Bordes and Sumit Chopra and Jason Weston},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{bordes2015simple,\n\ttitle        = {Large-scale Simple Question Answering with Memory Networks},\n\tauthor       = {Antoine Bordes and Nicolas Usunier and Sumit Chopra and Jason Weston},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.02075}\n}\n@inproceedings{bordes2017learning,\n\ttitle        = {Learning End-to-End Goal-Oriented Dialog},\n\tauthor       = {Antoine Bordes and Jason Weston},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{borkan1980assessment,\n\ttitle        = {Assessment of biological age using a profile of physical parameters},\n\tauthor       = {Gary A Borkan and Arthur H Norris},\n\tyear         = 1980,\n\tjournal      = {Journal of Gerontology},\n\tvolume       = 35,\n\tnumber       = 2,\n\tpages        = {177--184}\n}\n@article{borkan2019limitations,\n\ttitle        = {Limitations of pinned auc for measuring unintended bias},\n\tauthor       = {Daniel Borkan and Lucas Dixon and John Li and Jeffrey Sorensen and Nithum Thain and Lucy Vasserman},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.02088}\n}\n@inproceedings{borkan2019nuanced,\n\ttitle        = {Nuanced metrics for measuring unintended bias with real data for text classification},\n\tauthor       = {Daniel Borkan and Lucas Dixon and Jeffrey Sorensen and Nithum Thain and Lucy Vasserman},\n\tyear         = 2019,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = 
{491--500}\n}\n@article{borkar2009new,\n\ttitle        = {A new learning algorithm for optimal stopping},\n\tauthor       = {Borkar, Vivek S and Pinto, Jervis and Prabhu, Tarun},\n\tyear         = 2009,\n\tjournal      = {Discrete Event Dynamic Systems},\n\tpublisher    = {Springer},\n\tvolume       = 19,\n\tnumber       = 1,\n\tpages        = {91--113}\n}\n@inproceedings{borkar2010risk,\n\ttitle        = {Risk-constrained Markov decision processes},\n\tauthor       = {Borkar, Vivek and Jain, Rahul},\n\tyear         = 2010,\n\tbooktitle    = {49th IEEE Conference on Decision and Control (CDC)},\n\tpages        = {2664--2669},\n\torganization = {IEEE}\n}\n@article{borodin1981time,\n\ttitle        = {A time-space tradeoff for sorting on non-oblivious machines},\n\tauthor       = {Borodin, Allan and Fischer, Michael J and Kirkpatrick, David G and Lynch, Nancy A and Tompa, Martin},\n\tyear         = 1981,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = {Elsevier},\n\tvolume       = 22,\n\tnumber       = 3,\n\tpages        = {351--364}\n}\n@article{borodin1987time,\n\ttitle        = {A time-space tradeoff for element distinctness},\n\tauthor       = {Borodin, Allan and Fich, Faith and Meyer auf der Heide, Friedhelm and Upfal, Eli and Wigderson, Avi},\n\tyear         = 1987,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 16,\n\tnumber       = 1,\n\tpages        = {97--99}\n}\n@book{borwein05variational,\n\ttitle        = {Techniques of Variational Analysis},\n\tauthor       = {Jonathan M. Borwein and Qiji Jim Zhu},\n\tyear         = 2005,\n\tpublisher    = {Springer}\n}\n@inproceedings{bos04wide,\n\ttitle        = {Wide-coverage semantic representations from a {CCG} parser},\n\tauthor       = {Johan Bos and Stephen Clark and Mark Steedman and James R. 
Curran and Julia Hockenmaier},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {1240--1246}\n}\n@inproceedings{bos06superlatives,\n\ttitle        = {An Empirical Approach to the Interpretation of Superlatives},\n\tauthor       = {Malvina Nissim and Johan Bos},\n\tyear         = 2006,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{bos08framenet,\n\ttitle        = {Combining Discourse Representation Theory with FrameNet},\n\tauthor       = {J. Bos and M. Nissim},\n\tyear         = 2008,\n\tbooktitle    = {Frames, Corpora, and Knowledge Representation},\n\tpages        = {169--183}\n}\n@inproceedings{bos09economical,\n\ttitle        = {A Controlled Fragment of {DRT}},\n\tauthor       = {J. Bos},\n\tyear         = 2009,\n\tbooktitle    = {Workshop on Controlled Natural Language},\n\tpages        = {1--5}\n}\n@techreport{bos94drs,\n\ttitle        = {A Compositional {DRS}-based formalism for {NLP} applications},\n\tauthor       = {Johan Bos and Elsbeth Mastenbroek and Scott McGlashan and Sebastian Millies and Manfred Pinkal},\n\tyear         = 1994,\n\tinstitution  = {Universität des Saarlandes}\n}\n@inproceedings{bothe2017dialogue,\n\ttitle        = {Dialogue-Based Neural Learning to Estimate the Sentiment of a Next Upcoming Utterance},\n\tauthor       = {Chandrakant Bothe and Sven Magg and Cornelius Weber and Stefan Wermter},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Artificial Neural Networks (ICANN)},\n\tpages        = {477--485}\n}\n@incollection{bottou-98x,\n\ttitle        = {Online Algorithms and Stochastic Approximations},\n\tauthor       = {Bottou, L\\'{e}on},\n\tyear         = 1998,\n\tbooktitle    = {Online Learning and Neural Networks},\n\tpublisher    = {Cambridge University Press},\n\taddress      = {Cambridge, UK},\n\turl          = {http://leon.bottou.org/papers/bottou-98x},\n\tnote         = {revised, 
oct 2012},\n\teditor       = {Saad, David}\n}\n@misc{Bottou-SGD,\n\ttitle        = {Stochastic Gradient Descent},\n\tauthor       = {L\\'{e}on Bottou},\n\thowpublished = {\\url{http://leon.bottou.org/projects/sgd}}\n}\n@incollection{Bottou:1999:OLS:304710.304720,\n\ttitle        = {On-line Learning in Neural Networks},\n\tauthor       = {Bottou, L{\\'e}on},\n\tyear         = 1998,\n\tpublisher    = {Cambridge University Press},\n\taddress      = {New York, NY, USA},\n\tpages        = {9--42},\n\tisbn         = {0-521-65263-4},\n\turl          = {http://dl.acm.org/citation.cfm?id=304710.304720},\n\tchapter      = {On-line Learning and Stochastic Approximations},\n\teditor       = {Saad, David},\n\tnumpages     = 34,\n\tacmid        = 304720\n}\n@inproceedings{bottou08large,\n\ttitle        = {The Tradeoffs of Large Scale Learning},\n\tauthor       = {Léon Bottou and Olivier Bousquet},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{bottou2012stochastic,\n\ttitle        = {Stochastic gradient descent tricks},\n\tauthor       = {Léon Bottou},\n\tyear         = 2012,\n\tbooktitle    = {Neural Networks: Tricks of the Trade},\n\tpages        = {421--436}\n}\n@article{bottou2013counterfactual,\n\ttitle        = {Counterfactual Reasoning and Learning Systems: The Example of Computational Advertising},\n\tauthor       = {L\\'eon Bottou and Jonas Peters and Joaquin {Qui\\~nonero-Candela} and Denis X. Charles and D. 
Max Chickering and Elon Portugaly and Dipankar Ray and Patrice Simard and Ed Snelson},\n\tyear         = 2013,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 14,\n\tpages        = {3207--3260}\n}\n@misc{bottou2015two,\n\ttitle        = {Two high stakes challenges in machine learning},\n\tauthor       = {L\\'eon Bottou},\n\tyear         = 2015,\n\thowpublished = {Invited talk at the 32nd International Conference on Machine Learning}\n}\n@article{botvinick2001conflict,\n\ttitle        = {Conflict monitoring and cognitive control.},\n\tauthor       = {Botvinick, Matthew M and Braver, Todd S and Barch, Deanna M and Carter, Cameron S and Cohen, Jonathan D},\n\tyear         = 2001,\n\tjournal      = {Psychological review},\n\tpublisher    = {American Psychological Association},\n\tvolume       = 108,\n\tnumber       = 3,\n\tpages        = 624\n}\n@inproceedings{bouchard04tradeoff,\n\ttitle        = {The Trade-Off Between Generative and Discriminative Classifiers},\n\tauthor       = {Guillaume Bouchard and Bill Triggs},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Computational Statistics},\n\tpages        = {721--728}\n}\n@inproceedings{bouchard07diachronic,\n\ttitle        = {A Probabilistic Approach to Diachronic Phonology},\n\tauthor       = {Alexandre Bouchard-C\\^ot\\'e and Percy Liang and Tom Griffiths and Dan Klein},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)}\n}\n@inproceedings{bouchard07tradeoff,\n\ttitle        = {Bias-Variance Tradeoff in Hybrid Generative-Discriminative Models},\n\tauthor       = {Guillaume Bouchard},\n\tyear         = 2007,\n\tbooktitle    = {Sixth International Conference on Machine Learning and Applications (ICMLA)},\n\tpages        = {124--129}\n}\n@inproceedings{bouchard08language,\n\ttitle        = {A Probabilistic Approach to Language Change},\n\tauthor       = 
{Alexandre Bouchard-C\\^ot\\'e and Percy Liang and Tom Griffiths and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{boucheron2004concentration,\n\ttitle        = {Concentration inequalities},\n\tauthor       = {Boucheron, St{\\'e}phane and Lugosi, G{\\'a}bor and Bousquet, Olivier},\n\tyear         = 2004,\n\tjournal      = {Advanced lectures on machine learning},\n\tpublisher    = {Springer},\n\tpages        = {208--240}\n}\n@book{boucheron2013concentration,\n\ttitle        = {Concentration inequalities: A nonasymptotic theory of independence},\n\tauthor       = {St{\\'e}phane Boucheron and G{\\'a}bor Lugosi and Pascal Massart},\n\tyear         = 2013,\n\tpublisher    = {Oxford University Press}\n}\n@article{boulanger2012modeling,\n\ttitle        = {Modeling temporal dependencies in high-dimensional sequences: Application to polyphonic music generation and transcription},\n\tauthor       = {Nicolas Boulanger-Lewandowski and Yoshua Bengio and Pascal Vincent},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.6392}\n}\n@inproceedings{boumal2016non,\n\ttitle        = {The non-convex Burer-Monteiro approach works on smooth semidefinite programs},\n\tauthor       = {Boumal, Nicolas and Voroninski, Vladislav and Bandeira, Afonso S},\n\tyear         = 2016,\n\tbooktitle    = {NIPS}\n}\n@inproceedings{boureau2007sparse,\n\ttitle        = {Sparse feature learning for deep belief networks},\n\tauthor       = {Boureau, Y-lan and Cun, Yann L and others},\n\tyear         = 2007,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1185--1192},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{bousmalis2017domain,\n\ttitle        = {Unsupervised Pixel-Level Domain Adaptation with Generative Adversarial Networks},\n\tauthor       = {Konstantinos Bousmalis and Nathan Silberman and David Dohan and Dumitru Erhan and 
Dilip Krishnan},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{bousmalis2018using,\n\ttitle        = {Using simulation and domain adaptation to improve efficiency of deep robotic grasping},\n\tauthor       = {Bousmalis, Konstantinos and Irpan, Alex and Wohlhart, Paul and Bai, Yunfei and Kelcey, Matthew and Kalakrishnan, Mrinal and Downs, Laura and Ibarz, Julian and Pastor, Peter and Konolige, Kurt and Levine, Sergey and Vanhoucke, Vincent},\n\tyear         = 2018,\n\tbooktitle    = {2018 IEEE international conference on robotics and automation (ICRA)},\n\tpages        = {4243--4250},\n\torganization = {IEEE}\n}\n@article{bousquet02stability,\n\ttitle        = {Stability and Generalization},\n\tauthor       = {O. Bousquet and A. Elisseeff},\n\tyear         = 2002,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 2,\n\tpages        = {499--526}\n}\n@article{bousquet2002stability,\n\ttitle        = {Stability and generalization},\n\tauthor       = {Bousquet, Olivier and Elisseeff, Andr{\\'e}},\n\tyear         = 2002,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 2,\n\tnumber       = {Mar},\n\tpages        = {499--526}\n}\n@article{bousquet2004introduction,\n\ttitle        = {Introduction to statistical learning theory},\n\tauthor       = {Olivier Bousquet and St{\\'e}phane Boucheron and G{\\'a}bor Lugosi},\n\tyear         = 2004,\n\tjournal      = {Advanced Lectures on Machine Learning},\n\tpages        = {169--207}\n}\n@article{boutilier2000stochastic,\n\ttitle        = {Stochastic dynamic programming with factored representations},\n\tauthor       = {Boutilier, Craig and Dearden, Richard and Goldszmidt, Moises},\n\tyear         = 2000,\n\tjournal      = {Artificial intelligence},\n\tpublisher    = {Elsevier},\n\tvolume       = 121,\n\tnumber       = {1-2},\n\tpages        = {49--107}\n}\n@inproceedings{Boutsidis2014faster,\n\ttitle        = 
{{Faster SVD-truncated regularized least-squares}},\n\tauthor       = {Boutsidis, Christos and Magdon-Ismail, Malik},\n\tyear         = 2014,\n\tbooktitle    = {2014 IEEE International Symposium on Information Theory},\n\tpages        = {1321--1325},\n\torganization = {IEEE}\n}\n@article{boutsidis2014near,\n\ttitle        = {Near-optimal column-based matrix reconstruction},\n\tauthor       = {Boutsidis, Christos and Drineas, Petros and Magdon-Ismail, Malik},\n\tyear         = 2014,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 43,\n\tnumber       = 2,\n\tpages        = {687--717}\n}\n@article{boutsidis2014optimal,\n\ttitle        = {Optimal {CUR} matrix decompositions},\n\tauthor       = {Boutsidis, Christos and Woodruff, David P},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1405.7910}\n}\n@inproceedings{Boutsidis2015online,\n\ttitle        = {Online principal components analysis},\n\tauthor       = {Boutsidis, Christos and Garber, Dan and Karnin, Zohar and Liberty, Edo},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Twenty-Sixth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {887--901},\n\torganization = {SIAM}\n}\n@techreport{bowling2000analysis,\n\ttitle        = {An analysis of stochastic game theory for multiagent reinforcement learning},\n\tauthor       = {Bowling, Michael and Veloso, Manuela},\n\tyear         = 2000,\n\tinstitution  = {Carnegie-Mellon Univ Pittsburgh Pa School of Computer Science}\n}\n@inproceedings{bowling2001rational,\n\ttitle        = {Rational and convergent learning in stochastic games},\n\tauthor       = {Bowling, Michael and Veloso, Manuela},\n\tyear         = 2001,\n\tbooktitle    = {International joint conference on artificial intelligence},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {1021--1026},\n\torganization = {Lawrence Erlbaum Associates Ltd}\n}\n@inproceedings{bowling2002scalable,\n\ttitle        = 
{Scalable learning in stochastic games},\n\tauthor       = {Bowling, Michael and Veloso, Manuela},\n\tyear         = 2002,\n\tbooktitle    = {AAAI Workshop on Game Theoretic and Decision Theoretic Agents},\n\tpages        = {11--18}\n}\n@inproceedings{bowman2014recursive,\n\ttitle        = {Can recursive neural tensor networks learn logical reasoning?},\n\tauthor       = {Samuel R. Bowman and Christopher Potts and Christopher D. Manning},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{bowman2015large,\n\ttitle        = {A large annotated corpus for learning natural language inference},\n\tauthor       = {Samuel Bowman and Gabor Angeli and Christopher Potts and Christopher D. Manning},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{bowman2016continuous,\n\ttitle        = {Generating Sentences from a Continuous Space},\n\tauthor       = {Samuel R. Bowman and Luke Vilnis and Oriol Vinyals and Andrew M. Dai and Rafal Jozefowicz and Samy Bengio},\n\tyear         = 2016,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {10--21}\n}\n@article{box1953robust,\n\ttitle        = {Non-normality and tests on variances},\n\tauthor       = {George E.P. Box},\n\tyear         = 1953,\n\tjournal      = {Biometrika},\n\tvolume       = 40,\n\tpages        = {318--335}\n}\n@book{box1994time,\n\ttitle        = {Time Series Analysis: Forecasting and Control},\n\tauthor       = {George E.P. Box and Gwilym M. Jenkins and Gregory C. 
Reinsel},\n\tyear         = 1994,\n\tpublisher    = {Prentice Hall},\n\taddress      = {Englewood Cliffs, NJ},\n\tseries       = {Forecasting and Control Series},\n\tisbn         = 9780130607744,\n\tedition      = {3rd},\n\tlccn         = 93034620,\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@book{box2011bayesian,\n\ttitle        = {Bayesian inference in statistical analysis},\n\tauthor       = {George EP Box and George C Tiao},\n\tyear         = 2011,\n\tpublisher    = {John Wiley \\& Sons},\n\tvolume       = 40\n}\n@book{boyd,\n\ttitle        = {Convex optimization},\n\tauthor       = {Boyd, Stephen and Vandenberghe, Lieven},\n\tyear         = 2004,\n\tpublisher    = {Cambridge university press}\n}\n@book{boyd2004convex,\n\ttitle        = {Convex {Optimization}},\n\tauthor       = {Stephen Boyd and Lieven Vandenberghe},\n\tyear         = 2004,\n\tpublisher    = {Cambridge University Press}\n}\n@article{boyd2011admm,\n\ttitle        = {Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers},\n\tauthor       = {Stephen Boyd and Neal Parikh and Eric Chu and Borja Peleato and Jonathan Eckstein},\n\tyear         = 2011,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {1--122}\n}\n@article{boyd2011distributed,\n\ttitle        = {Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers},\n\tauthor       = {Boyd, Stephen and Parikh, Neal and Chu, Eric and Peleato, Borja and Eckstein, Jonathan},\n\tyear         = 2011,\n\tmonth        = jan,\n\tjournal      = {Found. Trends Mach. 
Learn.},\n\tpublisher    = {Now Publishers Inc.},\n\taddress      = {Hanover, MA, USA},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {1--122},\n\tdoi          = {10.1561/2200000016},\n\tissn         = {1935-8237},\n\turl          = {http://dx.doi.org/10.1561/2200000016},\n\tissue_date   = {January 2011},\n\tnumpages     = 122,\n\tacmid        = 2185816\n}\n@article{boykov2001fast,\n\ttitle        = {Fast Approximate Energy Minimization via Graph Cuts},\n\tauthor       = {Yuri Boykov and Olga Veksler and Ramin Zabih},\n\tyear         = 2001,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 23,\n\tpages        = {1222--1239}\n}\n@article{boykov2004mincut,\n\ttitle        = {An Experimental Comparison of Min-Cut/Max-Flow Algorithms for Energy Minimization in Vision},\n\tauthor       = {Yuri Boykov and Vladimir Kolmogorov},\n\tyear         = 2004,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 26,\n\tpages        = {1124--1137}\n}\n@article{boykov2004what,\n\ttitle        = {What Energy Functions Can Be Minimized via Graph Cuts?},\n\tauthor       = {Vladimir Kolmogorov and Ramin Zabih},\n\tyear         = 2004,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 26,\n\tpages        = {147--159}\n}\n@inproceedings{BQC11,\n\ttitle        = {A Spectral Learning Algorithm for Finite State Transducers},\n\tauthor       = {B. Balle and A. Quattoni and X. Carreras},\n\tyear         = 2011,\n\tbooktitle    = {ECML-PKDD}\n}\n@inproceedings{BQC12,\n\ttitle        = {Local Loss Optimization in Operator Models: A New Insight into Spectral Learning},\n\tauthor       = {B. Balle and A. Quattoni and X. 
Carreras},\n\tyear         = 2012,\n\tbooktitle    = {ICML}\n}\n@article{brachat2010symmetric,\n\ttitle        = {Symmetric tensor decomposition},\n\tauthor       = {Jerome Brachat and Pierre Comon and Bernard Mourrain and Elias Tsigaridas},\n\tyear         = 2010,\n\tjournal      = {Linear Algebra and its Applications},\n\tvolume       = 433,\n\tnumber       = 11,\n\tpages        = {1851--1872}\n}\n@inproceedings{bradbury2017quasi,\n\ttitle        = {Quasi-recurrent neural networks},\n\tauthor       = {James Bradbury and Stephen Merity and Caiming Xiong and Richard Socher},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{BradleyKBG2011,\n\ttitle        = {Parallel coordinate descent for l1-regularized loss minimization},\n\tauthor       = {Bradley, Joseph K. and Kyrola, Aapo and Bickson, Danny and Guestrin, Carlos},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the 28th International Conference on Machine Learning},\n\tseries       = {ICML' 11}\n}\n@article{bradtke1996linear,\n\ttitle        = {Linear least-squares algorithms for temporal difference learning},\n\tauthor       = {Bradtke, Steven J and Barto, Andrew G},\n\tyear         = 1996,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 22,\n\tnumber       = {1-3},\n\tpages        = {33--57}\n}\n@article{brafman2002r,\n\ttitle        = {R-max - a General Polynomial Time Algorithm for Near-optimal Reinforcement Learning},\n\tauthor       = {Brafman, Ronen I. and Tennenholtz, Moshe},\n\tyear         = 2003,\n\tmonth        = mar,\n\tjournal      = {J. Mach. Learn. 
Res.},\n\tpublisher    = {JMLR.org},\n\tvolume       = 3,\n\tnumber       = {Oct},\n\tpages        = {213--231},\n\tissn         = {1532-4435},\n\tacmid        = 944928,\n\tissue_date   = {3/1/2003},\n\tnumpages     = 19\n}\n@book{brams2003negotiation,\n\ttitle        = {Negotiation Games: Applying Game Theory to Bargaining and Arbitration},\n\tauthor       = {Steven J Brams},\n\tyear         = 2003,\n\tpublisher    = {Psychology Press}\n}\n@inproceedings{branavan08annotation,\n\ttitle        = {Learning Document-Level Semantic Properties from Free-text Annotations},\n\tauthor       = {S.R.K. Branavan and Harr Chen and Jacob Eisenstein and Regina Barzilay},\n\tyear         = 2008,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{branavan09reinforcement,\n\ttitle        = {Reinforcement Learning for Mapping Instructions to Actions},\n\tauthor       = {S.R.K. Branavan and Harr Chen and Luke S. Zettlemoyer and Regina Barzilay},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)},\n\tpages        = {82--90}\n}\n@inproceedings{branavan10high,\n\ttitle        = {Reading Between the Lines: Learning to Map High-level Instructions to Commands},\n\tauthor       = {S.R.K. Branavan and Luke Zettlemoyer and Regina Barzilay},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1268--1277}\n}\n@inproceedings{branavan11win,\n\ttitle        = {Learning to Win by Reading Manuals in a {M}onte-{C}arlo Framework},\n\tauthor       = {S.R.K. 
Branavan and David Silver and Regina Barzilay},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {268--277}\n}\n@inproceedings{branavan2012learning,\n\ttitle        = {Learning high-level planning from text},\n\tauthor       = {SRK Branavan and Nate Kushman and Tao Lei and Regina Barzilay},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {126--135}\n}\n@inproceedings{brand2002incremental,\n\ttitle        = {\n\t\tIncremental Singular Value Decomposition of Uncertain Data with Missing\n\n\t\tValues\n\t},\n\tauthor       = {Brand,, Matthew},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the 7th European Conference on Computer Vision},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {London, UK},\n\tpages        = {707--720},\n\tisbn         = {3-540-43745-2}\n}\n@article{brandow1995automatic,\n\ttitle        = {Automatic condensation of electronic publications by sentence selection},\n\tauthor       = {Ronald Brandow and Karl Mitze and Lisa F. Rau},\n\tyear         = 1995,\n\tjournal      = {Information Processing and Management},\n\tvolume       = 31,\n\tpages        = {675--685}\n}\n@article{brandwood1983complex,\n\ttitle        = {\n\t\tA complex gradient operator and its application in adaptive array\n\n\t\ttheory\n\t},\n\tauthor       = {Brandwood, D.H.},\n\tyear         = 1983,\n\tmonth        = feb,\n\tjournal      = {Communications, Radar and Signal Processing, IEE Proceedings F},\n\tvolume       = 130,\n\tnumber       = 1,\n\tpages        = {11--16},\n\tabstract     = {\n\t\tThe problem of minimising a real scalar quantity (for example array\n\n\t\toutput power, or mean square error) as a function of a complex vector\n\n\t\t(the set of weights) frequently arises in adaptive array theory.\n\n\t\tA complex gradient operator is defined in the paper for this purpose\n\n\t\tand its use justified. 
Three examples of its application to array\n\n\t\ttheory problems are given.\n\t}\n}\n@inproceedings{branson2017lean,\n\ttitle        = {Lean Crowdsourcing : Combining Humans and Machines in an Online System},\n\tauthor       = {Steve Branson and Grant Van Horn and Pietro Perona},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {7474--7483}\n}\n@misc{brants2006ngram,\n\ttitle        = {Web 1{T} 5-gram version 1},\n\tauthor       = {Thorsten Brants and Alex Franz},\n\tyear         = 2006\n}\n@article{braun2011generalized,\n\ttitle        = {Generalized Direct Sampling for Hierarchical {B}ayesian Models},\n\tauthor       = {Braun, Michael and Damien, Paul},\n\tyear         = 2011,\n\tmonth        = sep,\n\tday          = 7,\n\tabstract     = {\n\t\tIn this paper, we develop a new method to sample from posterior distributions\n\n\t\tin hierarchical models without using Markov chain Monte Carlo. This method is\n\n\t\tgenerally applicable to high-dimensional models involving large data sets.\n\n\t\tIllustrative analysis exemplifies the ease with which one could implement our\n\n\t\tmethod, which results in independent samples from the posterior distributions\n\n\t\tof interest.\n\t},\n\tarchiveprefix = {arXiv},\n\teprint       = {1108.2245},\n\tkeywords     = {bayes, efficiency, exact\\_sampling, hierarchical\\_model, mcmc},\n\tposted-at    = {2011-09-08 09:23:31},\n\tpriority     = 2\n}\n@inproceedings{braun2015mx,\n\ttitle        = {{M}x1 and {M}x2 key antiviral proteins are surprisingly lost in toothed whales},\n\tauthor       = {Benjamin A. Braun and Amir Marcovitz and J. 
Gray Camp and Robin Jia and Gill Bejerano},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the National Academy of Sciences of the United States of America (PNAS)}\n}\n@inproceedings{braverman16communication,\n\ttitle        = {Communication lower bounds for statistical estimation problems via a distributed data processing inequality},\n\tauthor       = {Mark Braverman and Ankit Garg and Tengyu Ma and Huy L. Nguyen and David P. Woodruff},\n\tbooktitle    = {Proceedings of the 48th Symposium on Theory of Computing (STOC), 2016},\n\tdoi          = {10.1145/2897518.2897582},\n\turl          = {http://doi.acm.org/10.1145/2897518.2897582},\n\tcrossref     = {DBLP:conf/stoc/2016},\n\ttimestamp    = {Fri, 10 Jun 2016 10:47:01 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/stoc/BravermanGMNW16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{breiman1977variable,\n\ttitle        = {Variable kernel estimates of multivariate densities},\n\tauthor       = {Leo Breiman and William Meisel and Edward Purcell},\n\tyear         = 1977,\n\tjournal      = {Technometrics},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {135--144}\n}\n@article{brendel2017comment,\n\ttitle        = {Comment on\" Biologically inspired protection of deep networks from adversarial attacks\"},\n\tauthor       = {Wieland Brendel and Matthias Bethge},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.01547}\n}\n@article{brier1950verification,\n\ttitle        = {Verification of forecasts expressed in terms of probability},\n\tauthor       = {Brier, Glenn W},\n\tyear         = 1950,\n\tjournal      = {Monthly weather review},\n\tvolume       = 78,\n\tnumber       = 1,\n\tpages        = {1--3}\n}\n@inproceedings{briet2010positive,\n\ttitle        = {The positive semidefinite {G}rothendieck problem with rank constraint},\n\tauthor       = {J. Bri{\\\"e}t and F. M. de Oliveira Filho and F. 
Vallentin},\n\tyear         = 2010,\n\tbooktitle    = {Automata, Languages and Programming},\n\tpages        = {31--42}\n}\n@article{briet2014groth,\n\ttitle        = {{G}rothendieck inequalities for semidefinite programs with rank constraints},\n\tauthor       = {J. Bri{\\\"e}t and F. M. de Oliveira Filho and F. Vallentin},\n\tyear         = 2014,\n\tjournal      = {Theory of Computing},\n\tvolume       = 10,\n\tpages        = {77--105}\n}\n@inproceedings{briggs06functional,\n\ttitle        = {Functional genetic programming with combinators},\n\tauthor       = {Forrest Briggs and Melissa O'Neill},\n\tyear         = 2006,\n\tbooktitle    = {Third Asian-Pacific workshop on Genetic Programming},\n\tpages        = {110--127}\n}\n@article{brill1995transformation,\n\ttitle        = {Transformation-based error-driven learning and natural language processing: A case study in part-of-speech tagging},\n\tauthor       = {Eric Brill},\n\tyear         = 1995,\n\tjournal      = {Computational linguistics},\n\tvolume       = 21,\n\tnumber       = 4,\n\tpages        = {543--565}\n}\n@inproceedings{brill2002askmsr,\n\ttitle        = {An analysis of the {A}sk{MSR} question-answering system},\n\tauthor       = {Eric Brill and Susan Dumais and Michele Banko},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {257--264}\n}\n@inproceedings{bringmann2012efficient,\n\ttitle        = {Efficient sampling methods for discrete distributions},\n\tauthor       = {Bringmann, Karl and Panagiotou, Konstantinos},\n\tyear         = 2012,\n\tbooktitle    = {International Colloquium on Automata, Languages, and Programming},\n\tpages        = {133--144},\n\torganization = {Springer}\n}\n@inproceedings{bringmann2013succinct,\n\ttitle        = {Succinct sampling from discrete distributions},\n\tauthor       = {Bringmann, Karl and Larsen, Kasper Green},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the forty-fifth annual ACM 
symposium on Theory of computing},\n\tpages        = {775--782},\n\torganization = {ACM}\n}\n@article{bro70,\n\ttitle        = {The convergence of a class of double-rank minimization algorithms 2. The new algorithm},\n\tauthor       = {Broyden, Charles G},\n\tyear         = 1970,\n\tjournal      = {IMA Journal of Applied Mathematics},\n\tpublisher    = {IMA},\n\tvolume       = 6,\n\tnumber       = 3,\n\tpages        = {222--231}\n}\n@inproceedings{broad2018learning,\n\ttitle        = {Learning models for shared control of human-machine systems with unknown dynamics},\n\tauthor       = {Alexander Broad and Todd Murphey and Brenna Argall},\n\tyear         = 2018,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{broad2019highly,\n\ttitle        = {Highly parallelized data-driven {MPC} for minimal intervention shared control},\n\tauthor       = {Alexander Broad and Todd Murphey and Brenna Argall},\n\tyear         = 2019,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{broad2020datadriven,\n\ttitle        = {Data-driven Koopman operators for model-based shared control of human–machine systems},\n\tauthor       = {A. Broad and Ian Abraham and T. Murphey and Brenna Argall},\n\tyear         = 2020,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 39,\n\tpages        = {1178--1195}\n}\n@techreport{Brochu:2010c,\n\ttitle        = {A Tutorial on Bayesian Optimization of Expensive Cost Functions, with Application to Active User Modeling and Hierarchical Reinforcement Learning},\n\tauthor       = {Eric Brochu and Vlad M Cora and Nando {de Freitas}},\n\tyear         = 2010,\n\tmonth        = dec,\n\tnumber       = {arXiv:1012.2599},\n\tinstitution  = {arXiv.org},\n\ttype         = {eprint}\n}\n@article{brocker2007reliability,\n\ttitle        = {Increasing the Reliability of Reliability Diagrams},\n\tauthor       = {Jochen Bröcker and Leonard A. 
Smith},\n\tyear         = 2007,\n\tjournal      = {Weather and Forecasting},\n\tvolume       = 22,\n\tnumber       = 3,\n\tpages        = {651--661}\n}\n@article{brocker2009decomposition,\n\ttitle        = {Reliability, sufficiency, and the decomposition of proper scores},\n\tauthor       = {Jochen Brocker},\n\tyear         = 2009,\n\tjournal      = {Quarterly Journal of the Royal Meteorological Society},\n\tvolume       = 135,\n\tnumber       = 643,\n\tpages        = {1512--1519}\n}\n@article{brocker2012empirical,\n\ttitle        = {Estimating reliability and resolution of probability forecasts through decomposition of the empirical score},\n\tauthor       = {Jochen Brocker},\n\tyear         = 2012,\n\tjournal      = {Climate Dynamics},\n\tvolume       = 39,\n\tpages        = {655--667}\n}\n@article{brockman2016openai,\n\ttitle        = {Open{AI} {Gym}},\n\tauthor       = {Brockman, Greg and Cheung, Vicki and Pettersson, Ludwig and Schneider, Jonas and Schulman, John and Tang, Jie and Zaremba, Wojciech},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.01540}\n}\n@inproceedings{brockschmidt2019generative,\n\ttitle        = {Generative Code Modeling with Graphs},\n\tauthor       = {Marc Brockschmidt and Miltiadis Allamanis and Alexander L. Gaunt and Oleksandr Polozov},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{brockwell1987time,\n\ttitle        = {Time Series: Theory and Methods},\n\tauthor       = {Peter J. Brockwell and Richard A. Davis},\n\tyear         = 1987,\n\tpublisher    = {Springer-Verlag New York, Inc.},\n\taddress      = {New York, NY, USA},\n\tisbn         = {0-387-96406-1},\n\tabstract     = {\n\t\tDiscusses ARMA, ARIMA models with a very strong math view point. Gives\n\n\t\tthe Yule-Walker equations for ARMA models; the Wold decomposition;\n\n\t\tthe Akaike Information Criterion (AIC). 
It presents Hilbert spaces\n\n\t\twith inner products, fractional differencing models (FARMA) (\\~{}pink\n\n\t\tnoise), random variables with infinite variance, and Kalman filtering.\n\t},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{bromley1993signature,\n\ttitle        = {Signature verification using a \"siamese\" time delay neural network},\n\tauthor       = {Bromley, Jane and Guyon, Isabelle and LeCun, Yann and S{\\\"a}ckinger, Eduard and Shah, Roopak},\n\tyear         = 1993,\n\tjournal      = {Advances in neural information processing systems},\n\tvolume       = 6,\n\tpages        = {737--744}\n}\n@book{brooks2011handbook,\n\ttitle        = {Handbook of {M}arkov Chain {M}onte {C}arlo},\n\tauthor       = {Steve Brooks and Andrew Gelman and Galin Jones and Xiao-Li Meng},\n\tyear         = 2011,\n\tpublisher    = {CRC press}\n}\n@article{brostrom2000acceptance,\n\ttitle        = {{Acceptance-rejection Sampling from the Conditional Distribution of Independent Discrete Random Variables, given their Sum}},\n\tauthor       = {Brostr\\\"{o}m, G\\\"{o}ran and Nilsson, Leif},\n\tyear         = 2000,\n\tmonth        = jan,\n\tjournal      = {Statistics},\n\tvolume       = 34,\n\tnumber       = 3,\n\tpages        = {247--257},\n\tdoi          = {10.1080/02331880008802716},\n\tissn         = {0233-1888},\n\turl          = {http://www.tandfonline.com/doi/abs/10.1080/02331880008802716},\n\tfile         = {:home/leili/Dropbox/reading/sampling/Sampling on sum/rejection SAMPLING conditional independent discrete given sum - 2000.pdf:pdf},\n\tkeywords     = {bernoulli distribution,bootstrap,exponential families,importance sampling,proportional hazards,simulation,sufficiency,survival analysis,tilted distributions}\n}\n@article{broussard2020grades,\n\ttitle        = {When Algorithms Give Real Students Imaginary Grades},\n\tauthor       = {Meredith Broussard},\n\tyear         = 2020,\n\tjournal      = {The New York Times},\n\turl          = 
{https://www.nytimes.com/2020/09/08/opinion/international-baccalaureate-algorithm-grades.html}\n}\n@article{brown2017adversarial,\n\ttitle        = {Adversarial patch},\n\tauthor       = {Tom B Brown and Dandelion Mané and Aurko Roy and Martín Abadi and Justin Gilmer},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.09665}\n}\n@article{brown2020gpt3,\n\ttitle        = {Language Models are Few-Shot Learners},\n\tauthor       = {Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.14165}\n}\n@article{brown92class,\n\ttitle        = {Class-Based n-gram Models of Natural Language},\n\tauthor       = {P. F. Brown and  V. J. Della Pietra and P. V. deSouza and J. C. Lai and R. L. Mercer},\n\tyear         = 1992,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 18,\n\tpages        = {467--479}\n}\n@article{brown93mt,\n\ttitle        = {The Mathematics of Statistical Machine Translation: Parameter Estimation},\n\tauthor       = {Peter F. Brown and Stephen A. Della Pietra and Vincent J. Della Pietra and Robert L. 
Mercer},\n\tyear         = 1993,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 19,\n\tpages        = {263--311}\n}\n@article{browne2012monte,\n\ttitle        = {A survey of {M}onte {C}arlo tree search methods},\n\tauthor       = {Cameron B Browne and Edward Powley and Daniel Whitehouse and Simon M Lucas and Peter I Cowling and Philipp Rohlfshagen and Stephen Tavener and Diego Perez and Spyridon Samothrakis and Simon Colton},\n\tyear         = 2012,\n\tjournal      = {IEEE Transactions on Computational Intelligence and AI in Games},\n\tvolume       = 4,\n\tpages        = {1--43}\n}\n@inproceedings{BRRT,\n\ttitle        = {Factoring nonnegative matrices with linear programs},\n\tauthor       = {V. Bittorf and B. Recht and C. Re and J. Tropp},\n\tyear         = 2012,\n\tbooktitle    = {NIPS}\n}\n@inproceedings{bruce2002real,\n\ttitle        = {Real-time randomized path planning for robot navigation},\n\tauthor       = {J. Bruce and M. Veloso},\n\tyear         = 2002,\n\tbooktitle    = {IROS},\n\tvolume       = 3,\n\tpages        = {2383--2388}\n}\n@inproceedings{bruckner2011stackelberg,\n\ttitle        = {{S}tackelberg games for adversarial prediction problems},\n\tauthor       = {Michael Br{\\\"u}ckner and Tobias Scheffer},\n\tyear         = 2011,\n\tbooktitle    = {SIGKDD},\n\tpages        = {547--555}\n}\n@article{bruckner2012static,\n\ttitle        = {Static prediction games for adversarial learning problems},\n\tauthor       = {Michael Br{\\\"u}ckner and Christian Kanzow and Tobias Scheffer},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 13,\n\tpages        = {2617--2654}\n}\n@article{brunet2018understanding,\n\ttitle        = {Understanding the origins of bias in word embeddings},\n\tauthor       = {Marc-Etienne Brunet and Colleen Alkalay-Houlihan and Ashton Anderson and Richard Zemel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1810.03611}\n}\n@inproceedings{bruni2017adversarial,\n\ttitle        = {Adversarial Evaluation for Open-Domain Dialogue Generation},\n\tauthor       = {Elia Bruni and Raquel Fernandez},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the SIGDIAL 2017 Conference}\n}\n@article{Brunovsky1970,\n\ttitle        = {A classification of linear controllable systems},\n\tauthor       = {Brunovsky, Pavol},\n\tyear         = 1970,\n\tjournal      = {Kybernetika},\n\tpublisher    = {Institute of Information Theory and Automation AS CR},\n\tvolume       = {06},\n\tnumber       = 3,\n\tpages        = {(173)-188},\n\turl          = {http://eudml.org/doc/28376},\n\tkeywords     = {control theory},\n\tlanguage     = {eng}\n}\n@article{brutzkus2017globally,\n\ttitle        = {Globally optimal gradient descent for a ConvNet with Gaussian inputs},\n\tauthor       = {Brutzkus, Alon and Globerson, Amir},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.07966}\n}\n@article{brutzkus2017sgd,\n\ttitle        = {Sgd learns over-parameterized networks that provably generalize on linearly separable data},\n\tauthor       = {Brutzkus, Alon and Globerson, Amir and Malach, Eran and Shalev-Shwartz, Shai},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.10174}\n}\n@article{bruzzone2009domain,\n\ttitle        = {Domain adaptation problems: A {DASVM} classification technique and a circular validation strategy},\n\tauthor       = {Lorenzo Bruzzone and Mattia Marconcini},\n\tyear         = 2009,\n\tjournal      = {IEEE transactions on pattern analysis and machine intelligence},\n\tvolume       = 32,\n\tnumber       = 5,\n\tpages        = {770--787}\n}\n@inproceedings{brys2015reinforcement,\n\ttitle        = {Reinforcement Learning from Demonstration through Shaping},\n\tauthor       = {Tim Brys and Anna Harutyunyan and Halit Bener Suay and Sonia Chernova and Matthew E. 
Taylor and Ann Now{\\'{e}}},\n\tyear         = 2015,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{BS,\n\ttitle        = {Polynomial Learning of Distribution Families},\n\tauthor       = {Mikhail Belkin and Kaushik Sinha},\n\tyear         = 2010,\n\tbooktitle    = {51th Annual {IEEE} Symposium on Foundations of Computer Science, {FOCS} 2010, October 23-26, 2010, Las Vegas, Nevada, {USA}},\n\tpages        = {103--112},\n\tdoi          = {10.1109/FOCS.2010.16},\n\turl          = {http://dx.doi.org/10.1109/FOCS.2010.16},\n\tcrossref     = {DBLP:conf/focs/2010},\n\ttimestamp    = {Tue, 16 Dec 2014 09:57:25 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/focs/BelkinS10},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{BS10,\n\ttitle        = {Polynomial Learning of Distribution Families},\n\tauthor       = {M. Belkin and K. Sinha},\n\tyear         = 2010,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{BSG11,\n\ttitle        = {An Online Spectral Learning Algorithm for Partially Observable Nonlinear Dynamical Systems},\n\tauthor       = {B. Boots and S. Siddiqi and G. 
Gordon},\n\tyear         = 2011,\n\tbooktitle    = {AAAI}\n}\n@article{buadoiu2008optimal,\n\ttitle        = {Optimal core-sets for balls},\n\tauthor       = {B{\\u{a}}doiu, Mihai and Clarkson, Kenneth L},\n\tyear         = 2008,\n\tjournal      = {Computational Geometry},\n\tpublisher    = {Elsevier},\n\tvolume       = 40,\n\tnumber       = 1,\n\tpages        = {14--22}\n}\n@article{bubeck2012regret,\n\ttitle        = {Regret Analysis of Stochastic and Nonstochastic Multi-armed Bandit Problems},\n\tauthor       = {Sebastien Bubeck and Nicolo Cesa-Bianchi},\n\tyear         = 2012,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = 5,\n\tnumber       = 1\n}\n@article{Bubeck2015book,\n\ttitle        = {Convex Optimization: Algorithms and Complexity},\n\tauthor       = {Bubeck, S{\\'e}bastien},\n\tyear         = 2015,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tpublisher    = {Now Publishers Inc.},\n\tvolume       = 8,\n\tnumber       = {3-4},\n\tpages        = {231--357}\n}\n@article{bubeck2018sampling,\n\ttitle        = {Sampling from a log-concave distribution with projected langevin monte carlo},\n\tauthor       = {Bubeck, S{\\'e}bastien and Eldan, Ronen and Lehec, Joseph},\n\tyear         = 2018,\n\tjournal      = {Discrete \\& Computational Geometry},\n\tpublisher    = {Springer},\n\tvolume       = 59,\n\tnumber       = 4,\n\tpages        = {757--783}\n}\n@inproceedings{bubeck2019adversarial,\n\ttitle        = {Adversarial examples from computational constraints},\n\tauthor       = {Sebastien Bubeck and Eric Price and Ilya Razenshteyn},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{bucher2018semantic,\n\ttitle        = {Semantic bottleneck for computer vision tasks},\n\tauthor       = {Maxime Bucher and St{\\'e}phane Herbin and Fr{\\'e}d{\\'e}ric Jurie},\n\tyear         = 2018,\n\tbooktitle    = {Asian Conference on Computer 
Vision},\n\tpages        = {695--712}\n}\n@inproceedings{bucila06compress,\n\ttitle        = {Model Compression},\n\tauthor       = {Cristian Bucil\\u{a} and Rich Caruana and Alexandru Niculescu-Mizil},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@article{buck2017ask,\n\ttitle        = {Ask the Right Questions: Active Question Reformulation with Reinforcement Learning},\n\tauthor       = {Christian Buck and Jannis Bulian and Massimiliano Ciaramita and Andrea Gesmundo and Neil Houlsby and Wojciech Gajewski and Wei Wang},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07830}\n}\n@inproceedings{buckley2004incomplete,\n\ttitle        = {Retrieval evaluation with incomplete information},\n\tauthor       = {Chris Buckley and Ellen M. Voorhees},\n\tyear         = 2004,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)},\n\tpages        = {25--32}\n}\n@inproceedings{buckley2007bias,\n\ttitle        = {Bias and the limits of pooling for large collections},\n\tauthor       = {Chris Buckley and Darrin Dimmick and Ian Soboroff and Ellen Voorhees},\n\tyear         = 2007,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@article{buda2018systematic,\n\ttitle        = {A systematic study of the class imbalance problem in convolutional neural networks},\n\tauthor       = {Buda, Mateusz and Maki, Atsuto and Mazurowski, Maciej A},\n\tyear         = 2018,\n\tjournal      = {Neural Networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 106,\n\tpages        = {249--259}\n}\n@inproceedings{buehrer2007toward,\n\ttitle        = {Toward terabyte pattern mining: an architecture-conscious solution},\n\tauthor       = {\n\t\tBuehrer, Gregory and Parthasarathy, Srinivasan and Tatikonda, Shirish\n\n\t\tand Kurc, Tahsin and Saltz, Joel\n\t},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 12th ACM SIGPLAN symposium 
on Principles and practice\n\n\t\tof parallel programming\n\t},\n\tlocation     = {San Jose, California, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {PPoPP '07},\n\tpages        = {2--12},\n\tdoi          = {http://doi.acm.org/10.1145/1229428.1229432},\n\tisbn         = {978-1-59593-602-8},\n\tacmid        = 1229432,\n\tkeywords     = {itemset mining, out of core, parallel},\n\tnumpages     = 11\n}\n@article{bug2017context,\n\ttitle        = {Context-based normalization of histological stains using deep convolutional features},\n\tauthor       = {Daniel Bug and Steffen Schneider and Anne Grote and Eva Oswald and Friedrich Feuerhake and Julia Sch{\\\"u}ler and Dorit Merhof},\n\tyear         = 2017,\n\tjournal      = {Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support},\n\tpages        = {135--142}\n}\n@article{buhlmann1999variable,\n\ttitle        = {Variable length {M}arkov chains},\n\tauthor       = {Peter B{\\\"u}hlmann and Abraham J Wyner},\n\tyear         = 1999,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 27,\n\tnumber       = 2,\n\tpages        = {480--513}\n}\n@inproceedings{buhlmann2016magging,\n\ttitle        = {Magging: maximin aggregation for inhomogeneous large-scale data},\n\tauthor       = {Peter B\\\"uhlmann and Nicolai Meinshausen},\n\tyear         = 2016,\n\tbooktitle    = {IEEE}\n}\n@misc{buja05lossfunctions,\n\ttitle        = {Loss Functions for Binary Class Probability Estimation and Classification: Structure and Applications},\n\tauthor       = {Andreas Buja and Werner Stuetzle and Yi Shen},\n\tyear         = 2005\n}\n@book{bump1998automorphic,\n\ttitle        = {Automorphic forms and representations},\n\tauthor       = {Bump, Daniel},\n\tyear         = 1998,\n\tpublisher    = {Cambridge university press},\n\tnumber       = 55\n}\n@inproceedings{bunescu2005shortest,\n\ttitle        = {A shortest path dependency kernel for relation 
extraction},\n\tauthor       = {Razvan C Bunescu and Raymond J Mooney},\n\tyear         = 2005,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {724--731}\n}\n@inproceedings{bunescu2007learning,\n\ttitle        = {Learning to extract relations from the web using minimal supervision},\n\tauthor       = {Razvan Bunescu and Raymond Mooney},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{bunse1993numerical,\n\ttitle        = {Numerical methods for simultaneous diagonalization},\n\tauthor       = {Bunse-Gerstner, A. and Byers, R. and Mehrmann, V.},\n\tyear         = 1993,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 14,\n\tnumber       = 4,\n\tpages        = {927--949}\n}\n@inproceedings{buntine2009estimating,\n\ttitle        = {Estimating Likelihoods for Topic Models},\n\tauthor       = {Wray L. Buntine},\n\tyear         = 2009,\n\tbooktitle    = {Asian Conference on Machine Learning}\n}\n@inproceedings{buolamwini2018gender,\n\ttitle        = {Gender shades: Intersectional accuracy disparities in commercial gender classification},\n\tauthor       = {Joy Buolamwini and Timnit Gebru},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Fairness, Accountability and Transparency},\n\tpages        = {77--91}\n}\n@article{burda2018exploration,\n\ttitle        = {Exploration by random network distillation},\n\tauthor       = {Yuri Burda and Harrison Edwards and Amos Storkey and Oleg Klimov},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.12894}\n}\n@inproceedings{burda2019curiosity,\n\ttitle        = {Large-Scale Study of Curiosity-Driven Learning},\n\tauthor       = {Yuri Burda and Harri Edwards and Deepak Pathak and Amos Storkey and Trevor Darrell and Alexei A. 
Efros},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{burden1985bisection,\n\ttitle        = {Numerical Analysis (3rd ed.)},\n\tauthor       = {Richard L. Burden and J. Douglas Faires},\n\tyear         = 1985,\n\tpublisher    = {PWS Publishers}\n}\n@book{BurdenNumerical,\n\ttitle        = {Numerical Analysis},\n\tauthor       = {R.L. Burden and J.D. Faires},\n\tyear         = 2000,\n\tpublisher    = {Brooks Cole, 7 edition}\n}\n@article{burer01anonlinear,\n\ttitle        = {A Nonlinear Programming Algorithm for Solving Semidefinite Programs via Low-rank Factorization},\n\tauthor       = {S. Burer and R. Monteiro},\n\tyear         = 2001,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 95,\n\tnumber       = 2,\n\tpages        = {329--357}\n}\n@article{burer2003nonlinear,\n\ttitle        = {A nonlinear programming algorithm for solving semidefinite programs via low-rank factorization},\n\tauthor       = {Burer, Samuel and Monteiro, Renato DC},\n\tyear         = 2003,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 95,\n\tnumber       = 2,\n\tpages        = {329--357}\n}\n@article{burer2005local,\n\ttitle        = {Local minima and convergence in low-rank semidefinite programming},\n\tauthor       = {Burer, Samuel and Monteiro, Renato DC},\n\tyear         = 2005,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 103,\n\tnumber       = 3,\n\tpages        = {427--444}\n}\n@inproceedings{burkard2017analysis,\n\ttitle        = {Analysis of Causative Attacks against {SVM}s Learning from Data Streams},\n\tauthor       = {Cody Burkard and Brent Lagesse},\n\tyear         = 2017,\n\tbooktitle    = {International Workshop on Security And Privacy Analytics}\n}\n@article{burke1997question,\n\ttitle        = {Question answering from frequently asked question files: Experiences with the {FAQ} finder 
system},\n\tauthor       = {Robin D Burke and Kristian J Hammond and Vladimir Kulyukin and Steven L Lytinen and Noriko Tomuro and Scott Schoenberg},\n\tyear         = 1997,\n\tjournal      = {AI magazine},\n\tvolume       = 18\n}\n@article{burke2005robust,\n\ttitle        = {A robust gradient sampling algorithm for nonsmooth, nonconvex optimization},\n\tauthor       = {Burke, James V and Lewis, Adrian S and Overton, Michael L},\n\tyear         = 2005,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 15,\n\tnumber       = 3,\n\tpages        = {751--779}\n}\n@article{burke2016mortality,\n\ttitle        = {Sources of variation in under-5 mortality across sub-Saharan Africa: a spatial analysis},\n\tauthor       = {Marshall Burke and Sam Heft-Neal and Eran Bendavid},\n\tyear         = 2016,\n\tjournal      = {Lancet Global Health},\n\tvolume       = 4\n}\n@article{burnetas1997optimal,\n\ttitle        = {Optimal adaptive policies for {Markov} decision processes},\n\tauthor       = {Burnetas, Apostolos N and Katehakis, Michael N},\n\tyear         = 1997,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {222--255}\n}\n@misc{burtsev2018convai2,\n\ttitle        = {The Conversational Intelligence Challenge 2 ({ConvAI2})},\n\tauthor       = {Mikhail Burtsev and Varvara Logacheva and Valentin Malykh and Ryan Lowe and Iulian Serban and Shrimai Prabhumoye and Emily Dinan and Douwe Kiela and Alexander Miller and Kurt Shuster and Arthur Szlam and Jack Urbanek and Jason Weston},\n\tyear         = 2018\n}\n@article{busoniu2008comprehensive,\n\ttitle        = {A comprehensive survey of multiagent reinforcement learning},\n\tauthor       = {Lucian Busoniu and Robert Babuska and Bart De Schutter},\n\tyear         = 2008,\n\tjournal      = {IEEE Trans. 
Systems, Man, and Cybernetics, Part C},\n\tvolume       = 38,\n\tnumber       = 2,\n\tpages        = {156--172}\n}\n@article{buzsaki2014log,\n\ttitle        = {The log-dynamic brain: how skewed distributions affect network operations},\n\tauthor       = {Buzs{\\'a}ki, Gy{\\\"o}rgy and Mizuseki, Kenji},\n\tyear         = 2014,\n\tjournal      = {Nature Reviews Neuroscience}\n}\n@inproceedings{BV08,\n\ttitle        = {Isotropic {PCA} and Affine-Invariant Clustering},\n\tauthor       = {S. C. Brubaker and S. Vempala},\n\tyear         = 2008,\n\tbooktitle    = {FOCS}\n}\n@article{BWY14,\n\ttitle        = {Statistical guarantees for the {EM} algorithm: From population to sample-based analysis},\n\tauthor       = {Sivaraman Balakrishnan and Martin J. Wainwright and Bin Yu},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1408.2156},\n\turl          = {http://arxiv.org/abs/1408.2156},\n\ttimestamp    = {Tue, 03 Mar 4460020 12:24:48 +},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/BalakrishnanWY14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{byers2000utility,\n\ttitle        = {Utility-based decision-making in wireless sensor networks},\n\tauthor       = {Byers, John and Nasser, Gabriel},\n\tyear         = 2000,\n\tbooktitle    = {Mobile and Ad Hoc Networking and Computing, 2000. MobiHOC. 2000 First Annual Workshop on},\n\tpages        = {143--144},\n\torganization = {IEEE}\n}\n@inproceedings{byrd2019effect,\n\ttitle        = {What is the effect of importance weighting in deep learning?},\n\tauthor       = {Byrd, Jonathon and Lipton, Zachary},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {872--881},\n\torganization = {PMLR}\n}\n@article{C,\n\ttitle        = {Upper and lower bounds for the normal distribution function},\n\tauthor       = {John D. 
Cook},\n\tpublisher    = {\\url{https://www.johndcook.com/blog/norm-dist-bounds/}}\n}\n@inproceedings{cadamuro2016debugging,\n\ttitle        = {Debugging machine learning models},\n\tauthor       = {Gabriel Cadamuro and Ran Gilad-Bachrach and Xiaojin Zhu},\n\tyear         = 2016,\n\tbooktitle    = {ICML Workshop on Reliable Machine Learning in the Wild}\n}\n@inproceedings{cafarella2008webtables,\n\ttitle        = {Web{T}ables: exploring the power of tables on the web},\n\tauthor       = {Michael J Cafarella and Alon Halevy and Daisy Zhe Wang and Eugene Wu and Yang Zhang},\n\tyear         = 2008,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tpages        = {538--549}\n}\n@inproceedings{cafarella2009data,\n\ttitle        = {Data integration for the relational web},\n\tauthor       = {Michael J Cafarella and Alon Halevy and Nodira Khoussainova},\n\tyear         = 2009,\n\tbooktitle    = {Very Large Data Bases (VLDB)}\n}\n@article{cai10soft,\n\ttitle        = {A singular value thresholding algorithm for matrix completion},\n\tauthor       = {J.-F. Cai and E. J. Candes and Z. 
Shen},\n\tyear         = 2010,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 20,\n\tpages        = {1956--1982}\n}\n@inproceedings{cai2013large,\n\ttitle        = {Large-scale Semantic Parsing via Schema Matching and Lexicon Extension},\n\tauthor       = {Qingqing Cai and Alexander Yates},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{cai2013sparse,\n\ttitle        = {Sparse PCA: Optimal rates and adaptive estimation},\n\tauthor       = {Cai, T Tony and Ma, Zongming and Wu, Yihong and others},\n\tyear         = 2013,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 41,\n\tnumber       = 6,\n\tpages        = {3074--3110}\n}\n@article{cai2015robust,\n\ttitle        = {Robust and computationally feasible community detection in the presence of arbitrary outlier nodes},\n\tauthor       = {T. Tony Cai and Xiaodong Li},\n\tyear         = 2015,\n\tjournal      = {The Annals of Statistics},\n\tvolume       = 43,\n\tnumber       = 3,\n\tpages        = {1027--1059}\n}\n@article{cai2018crop,\n\ttitle        = {A high-performance and in-season classification system of field-level crop types using time-series Landsat data and a machine learning approach},\n\tauthor       = {Yaping Cai and Kaiyu Guan and Jian Peng and Shaowen Wang and Christopher Seifert and Brian Wardlow and Zhan Li},\n\tyear         = 2018,\n\tjournal      = {Remote Sensing of Environment},\n\tvolume       = 210,\n\tpages        = {74--84}\n}\n@article{cai2019neural,\n\ttitle        = {Neural Temporal-Difference Learning Converges to Global Optima},\n\tauthor       = {Cai, Qi and Yang, Zhuoran and Lee, Jason D and Wang, Zhaoran},\n\tyear         = 2019,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{cai2019provably,\n\ttitle        = {Provably Efficient Exploration in Policy Optimization},\n\tauthor       = {Cai, Qi and 
Yang, Zhuoran and Jin, Chi and Wang, Zhaoran},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1912.05830},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1283--1294},\n\torganization = {PMLR}\n}\n@article{cai2020neural,\n\ttitle        = {Neural Temporal-Difference and Q-Learning Provably Converge to Global Optima},\n\tauthor       = {Qi Cai and Zhuoran Yang and Jason D. Lee and Zhaoran Wang},\n\tyear         = 2020,\n\teprint       = {1905.10027},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{cai2021theory,\n\ttitle        = {A Theory of Label Propagation for Subpopulation Shift},\n\tauthor       = {Cai, Tianle and Gao, Ruiqi and Lee, Jason D and Lei, Qi},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.11203}\n}\n@inproceedings{cakmak2007affordances,\n\ttitle        = {Affordances as a framework for robot control},\n\tauthor       = {Maya Cakmak and Mehmet R Dogar and Emre Ugur and Erol Sahin},\n\tyear         = 2007,\n\tbooktitle    = {International conference on epigenetic robotics}\n}\n@article{caldas2018leaf,\n\ttitle        = {Leaf: A benchmark for federated settings},\n\tauthor       = {Sebastian Caldas and Peter Wu and Tian Li and Jakub Kone{\\v{c}}n{\\`y} and H Brendan McMahan and Virginia Smith and Ameet Talwalkar},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.01097}\n}\n@inproceedings{calders2009building,\n\ttitle        = {Building classifiers with independency constraints},\n\tauthor       = {Toon Calders and Faisal Kamiran and Mykola Pechenizkiy},\n\tyear         = 2009,\n\tbooktitle    = {Data mining workshops, 2009. ICDMW'09. 
IEEE international conference on},\n\tpages        = {13--18}\n}\n@inproceedings{calmon2017optimized,\n\ttitle        = {Optimized pre-processing for discrimination prevention},\n\tauthor       = {Flavio Calmon and Dennis Wei and Bhanukiran Vinzamuri and Karthikeyan Natesan Ramamurthy and Kush R Varshney},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3992--4001}\n}\n@inproceedings{camacho2017non,\n\ttitle        = {Non-markovian rewards expressed in LTL: guiding search via reward shaping},\n\tauthor       = {Camacho, Alberto and Chen, Oscar and Sanner, Scott and McIlraith, Sheila A},\n\tyear         = 2017,\n\tbooktitle    = {Tenth Annual Symposium on Combinatorial Search}\n}\n@inproceedings{camacho2019ltl,\n\ttitle        = {LTL and beyond: Formal languages for reward function specification in reinforcement learning},\n\tauthor       = {Camacho, Alberto and Icarte, R Toro and Klassen, Toryn Q and Valenzano, Richard and McIlraith, Sheila A},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 28th International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {6065--6073}\n}\n@inproceedings{campagna2017almond,\n\ttitle        = {Almond: The Architecture of an Open, Crowdsourced, Privacy-Preserving, Programmable Virtual Assistant},\n\tauthor       = {Giovanni Campagna and Rakesh Ramesh and Silei Xu and Michael Fischer and Monica S. 
Lam},\n\tyear         = 2017,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {341--350}\n}\n@article{campanella2019clinical,\n\ttitle        = {Clinical-grade computational pathology using weakly supervised deep learning on whole slide images},\n\tauthor       = {Gabriele Campanella and Matthew G Hanna and Luke Geneslaw and Allen Miraflor and Vitor Werneck Krauss Silva and Klaus J Busam and Edi Brogi and Victor E Reuter and David S Klimstra and Thomas J Fuchs},\n\tyear         = 2019,\n\tjournal      = {Nature medicine},\n\tvolume       = 25,\n\tnumber       = 8,\n\tpages        = {1301--1309}\n}\n@book{campbell1998historical,\n\ttitle        = {Historical Linguistics: An Introduction},\n\tauthor       = {L. Campbell},\n\tyear         = 1998,\n\tpublisher    = {Edinburgh University Press}\n}\n@article{campbell2017uncovering,\n\ttitle        = {Uncovering genomic trajectories with heterogeneous genetic and environmental backgrounds across single-cells and populations},\n\tauthor       = {Kieran Campbell and Christopher Yau},\n\tyear         = 2017,\n\tjournal      = {bioRxiv}\n}\n@article{campero2020learning,\n\ttitle        = {Learning with {AMIG}o: Adversarially Motivated Intrinsic Goals},\n\tauthor       = {Andres Campero and Roberta Raileanu and Heinrich K{\\\"{u}}ttler and Joshua B. Tenenbaum and Tim Rockt{\\\"{a}}schel and Edward Grefenstette},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.12122}\n}\n@article{Campi02,\n\ttitle        = {Finite Sample Properties of System Identification Methods},\n\tauthor       = {M. C. 
Campi and Erik Weyer},\n\tyear         = 2002,\n\tjournal      = {{IEEE} Transactions on Automatic Control},\n\tvolume       = 47,\n\tnumber       = 8,\n\tpages        = {1329--1334},\n\tdate-added   = {2016-04-02 18:41:57 +0000},\n\tdate-modified = {2016-04-02 18:42:41 +0000}\n}\n@inproceedings{campos2018skip,\n\ttitle        = {Skip {RNN}: Learning to Skip State Updates in Recurrent Neural Networks},\n\tauthor       = {Victor Campos and Brendan Jou and Xavier Giro-i-Nieto and Jordi Torres and Shih-Fu Chang},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{candes11phaselift,\n\ttitle        = {PhaseLift: Exact and Stable Signal Recovery from Magnitude Measurements via Convex Programming},\n\tauthor       = {Emmanuel J. Candes and Thomas Strohmer and Vladislav Voroninski},\n\tyear         = 2011,\n\tjournal      = {arXiv}\n}\n@article{candes2005decoding,\n\ttitle        = {Decoding by linear programming},\n\tauthor       = {Candes, Emmanuel J and Tao, Terence},\n\tyear         = 2005,\n\tmonth        = dec,\n\tjournal      = {IEEE transactions on information theory},\n\tpublisher    = {IEEE},\n\tvolume       = 51,\n\tnumber       = 12,\n\tpages        = {4203--4215},\n\tdoi          = {10.1109/TIT.2005.858979},\n\tissn         = {0018-9448},\n\tkeywords     = {Gaussian processes;convex programming;decoding;error correction codes;indeterminancy;linear codes;linear programming;minimisation;random codes;sparse matrices;Gaussian random matrix;basis pursuit;linear code decoding;linear programming;minimization problem;natural error correcting problem;simple convex optimization problem;sparse solution;uncertainty principle;Decoding;Equations;Error correction;Error correction codes;Information theory;Linear code;Linear programming;Mathematics;Sparse matrices;Vectors;Basis pursuit;Gaussian random matrices;decoding of (random) linear codes;duality in optimization;linear codes;linear programming;principal 
angles;restricted orthonormality;singular values of random matrices;sparse solutions to underdetermined systems}\n}\n@article{candes2006near,\n\ttitle        = {Near-optimal signal recovery from random projections: Universal encoding strategies?},\n\tauthor       = {Candes, Emmanuel J and Tao, Terence},\n\tyear         = 2006,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 52,\n\tnumber       = 12,\n\tpages        = {5406--5425}\n}\n@article{candes2006robust,\n\ttitle        = {Robust uncertainty principles: Exact signal reconstruction from highly incomplete frequency information},\n\tauthor       = {Cand{\\`e}s, Emmanuel J and Romberg, Justin and Tao, Terence},\n\tyear         = 2006,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 52,\n\tnumber       = 2,\n\tpages        = {489--509},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@article{candes2008restricted,\n\ttitle        = {The restricted isometry property and its implications for compressed sensing},\n\tauthor       = {Candes, Emmanuel J},\n\tyear         = 2008,\n\tjournal      = {Comptes Rendus Mathematique},\n\tpublisher    = {Elsevier},\n\tvolume       = 346,\n\tnumber       = 9,\n\tpages        = {589--592}\n}\n@article{candes2009exact,\n\ttitle        = {Exact matrix completion via convex optimization},\n\tauthor       = {Cand{\\`e}s, Emmanuel J and Recht, Benjamin},\n\tyear         = 2009,\n\tjournal      = {Foundations of Computational mathematics},\n\tpublisher    = {Springer},\n\tvolume       = 9,\n\tnumber       = 6,\n\tpages        = {717--772}\n}\n@article{candes2010matrix,\n\ttitle        = {Matrix completion with noise},\n\tauthor       = {Candes, Emmanuel J and Plan, Yaniv},\n\tyear         = 2010,\n\tjournal      = {Proceedings of the IEEE},\n\tvolume       = 98,\n\tnumber       = 6,\n\tpages        = {925--936}\n}\n@article{candes2010power,\n\ttitle       
 = {The power of convex relaxation: Near-optimal matrix completion},\n\tauthor       = {Cand{\\`e}s, Emmanuel J and Tao, Terence},\n\tyear         = 2010,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 56,\n\tnumber       = 5,\n\tpages        = {2053--2080}\n}\n@article{candes2011robust,\n\ttitle        = {Robust principal component analysis?},\n\tauthor       = {Cand{\\`e}s, Emmanuel J and Li, Xiaodong and Ma, Yi and Wright, John},\n\tyear         = 2011,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 58,\n\tnumber       = 3,\n\tpages        = 11\n}\n@article{candes2013phaselift,\n\ttitle        = {Phaselift: Exact and stable signal recovery from magnitude measurements via convex programming},\n\tauthor       = {Candes, Emmanuel J and Strohmer, Thomas and Voroninski, Vladislav},\n\tyear         = 2013,\n\tjournal      = {Communications on Pure and Applied Mathematics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 66,\n\tnumber       = 8,\n\tpages        = {1241--1274}\n}\n@article{candes2015phase,\n\ttitle        = {Phase retrieval via Wirtinger flow: Theory and algorithms},\n\tauthor       = {Candes, Emmanuel J and Li, Xiaodong and Soltanolkotabi, Mahdi},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {IEEE},\n\tvolume       = 61,\n\tnumber       = 4,\n\tpages        = {1985--2007}\n}\n@inproceedings{canetti2019soft,\n\ttitle        = {From soft classifiers to hard decisions: How fair can we be?},\n\tauthor       = {Ran Canetti and Aloni Cohen and Nishanth Dikkala and Govind Ramnarayan and Sarah Scheffler and Adam Smith},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the Conference on Fairness, Accountability, and Transparency},\n\tpages        = {309--318}\n}\n@inproceedings{cao2006adapting,\n\ttitle        = {Adapting ranking {SVM} to document retrieval},\n\tauthor       = {Yunbo 
Cao and Jun Xu and Tie-Yan Liu and Hang Li and Yalou Huang and Hsiao-Wuen Hon},\n\tyear         = 2006,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)}\n}\n@article{cao2013overview,\n\ttitle        = {An overview of recent progress in the study of distributed multi-agent coordination},\n\tauthor       = {Yongcan Cao and Wenwu Yu and Wei Ren and Guanrong Chen},\n\tyear         = 2013,\n\tjournal      = {IEEE Transactions on Industrial informatics},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {427--438}\n}\n@inproceedings{cao2017latent,\n\ttitle        = {Latent Variable Dialogue Models and Their Diversity},\n\tauthor       = {Kris Cao and Stephen Clark},\n\tyear         = 2017,\n\tbooktitle    = {European Association for Computational Linguistics (EACL)}\n}\n@inproceedings{cao2018emergent,\n\ttitle        = {Emergent Communication through Negotiation},\n\tauthor       = {Kris Cao and Angeliki Lazaridou and Marc Lanctot and Joel Z Leibo and Karl Tuyls and Stephen Clark},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{cao2019learning,\n\ttitle        = {Learning imbalanced datasets with label-distribution-aware margin loss},\n\tauthor       = {Cao, Kaidi and Wei, Colin and Gaidon, Adrien and Arechiga, Nikos and Ma, Tengyu},\n\tyear         = 2019,\n\tmonth        = jun,\n\tjournal      = {arXiv e-prints},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 32,\n\tpages        = {1565--1576},\n\tkeywords     = {Computer Science - Machine Learning, Computer Science - Computer Vision and Pattern Recognition, Statistics - Machine Learning},\n\teid          = {arXiv:1906.07413},\n\tarchiveprefix = {arXiv},\n\teprint       = {1906.07413},\n\tprimaryclass = {cs.LG},\n\tadsurl       = {https://ui.adsabs.harvard.edu/abs/2019arXiv190607413C},\n\tadsnote      = {Provided by 
the SAO/NASA Astrophysics Data System}\n}\n@article{cao2020heteroskedastic,\n\ttitle        = {Heteroskedastic and Imbalanced Deep Learning with Adaptive Regularization},\n\tauthor       = {Cao, Kaidi and Chen, Yining and Lu, Junwei and Arechiga, Nikos and Gaidon, Adrien and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.15766},\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=mEdwVCRJuX4}\n}\n@inproceedings{cao2021heteroskedastic,\n\ttitle        = {Heteroskedastic and Imbalanced Deep Learning with Adaptive Regularization},\n\tauthor       = {Kaidi Cao and Yining Chen and Junwei Lu and Nikos Arechiga and Adrien Gaidon and Tengyu Ma},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2006.15766},\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=mEdwVCRJuX4}\n}\n@article{cappe09online,\n\ttitle        = {Online Expectation-Maximization Algorithm for Latent Data Models},\n\tauthor       = {Olivier Capp\\'e and Eric Moulines},\n\tyear         = 2009,\n\tjournal      = {Journal of the Royal Statistics Society: Series B (Statistical Methodology)},\n\tvolume       = 71,\n\tpages        = {593--613}\n}\n@article{cappe2007overview,\n\ttitle        = {An overview of existing methods and recent advances in sequential {M}onte {C}arlo},\n\tauthor       = {Olivier Cappé and Simon J Godsill and Eric Moulines},\n\tyear         = 2007,\n\tjournal      = {Proceedings of the IEEE},\n\tvolume       = 95,\n\tnumber       = 5,\n\tpages        = {899--924}\n}\n@article{caraballo1998new,\n\ttitle        = {New figures of merit for best-first probabilistic chart parsing},\n\tauthor       = {Sharon A. 
Caraballo and Eugene Charniak},\n\tyear         = 1998,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 24,\n\tpages        = {275--298}\n}\n@article{caramanis1992perturbation,\n\ttitle        = {Perturbation analysis for the design of flexible manufacturing system flow controllers},\n\tauthor       = {Caramanis, Michael and Liberopoulos, George},\n\tyear         = 1992,\n\tjournal      = {Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 40,\n\tnumber       = 6,\n\tpages        = {1107--1125}\n}\n@article{carbery2001distributional,\n\ttitle        = {Distributional and $L^q$ norm inequalities for polynomials over convex bodies in $R^n$},\n\tauthor       = {Carbery, Anthony and Wright, James},\n\tyear         = 2001,\n\tjournal      = {Mathematical Research Letters},\n\tpublisher    = {International Press},\n\tvolume       = 8,\n\tnumber       = 3,\n\tpages        = {233--248}\n}\n@inproceedings{card2018calibration,\n\ttitle        = {The Importance of Calibration for Estimating Proportions from Annotations},\n\tauthor       = {Dallas Card and Noah A. Smith},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{cardoso1991super,\n\ttitle        = {Super-symmetric decomposition of the fourth-order cumulant tensor. Blind identification of more sources than sensors},\n\tauthor       = {Cardoso, J.-F.},\n\tyear         = 1991,\n\tbooktitle    = {Acoustics, Speech, and Signal Processing, 1991. ICASSP-91., 1991 International Conference on},\n\tpages        = {3109--3112},\n\torganization = {IEEE}\n}\n@techreport{cardoso1994perturbation,\n\ttitle        = {Perturbation of Joint Diagonalizers},\n\tauthor       = {J. Cardoso},\n\tyear         = 1994,\n\tinstitution  = {Télécom Paris}\n}\n@article{cardoso1996joint,\n\ttitle        = {Jacobi angles for simultaneous diagonalization},\n\tauthor       = {J. Cardoso and A. 
Souloumiac},\n\tyear         = 1996,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {161--164}\n}\n@inproceedings{CardosoComonICA,\n\ttitle        = {Independent component analysis, a survey of some algebraic methods},\n\tauthor       = {J. F. Cardoso and Pierre Comon},\n\tyear         = 1996,\n\tbooktitle    = {IEEE International Symposium on Circuits and Systems},\n\tpages        = {93--96}\n}\n@article{carlini2016defensive,\n\ttitle        = {Defensive distillation is not robust to adversarial examples},\n\tauthor       = {Nicholas Carlini and David Wagner},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{carlini2016hidden,\n\ttitle        = {Hidden voice commands},\n\tauthor       = {Nicholas Carlini and Pratyush Mishra and Tavish Vaidya and Yuankai Zhang and Micah Sherr and Clay Shields and David Wagner and Wenchao Zhou},\n\tyear         = 2016,\n\tbooktitle    = {USENIX Security}\n}\n@article{carlini2017adversarial,\n\ttitle        = {Adversarial Examples Are Not Easily Detected: Bypassing Ten Detection Methods},\n\tauthor       = {Nicholas Carlini and David Wagner},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{carlini2017ground,\n\ttitle        = {Ground-Truth Adversarial Examples},\n\tauthor       = {Nicholas Carlini and Guy Katz and Clark Barrett and David L. 
Dill},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{carlini2017towards,\n\ttitle        = {Towards evaluating the robustness of neural networks},\n\tauthor       = {Nicholas Carlini and David Wagner},\n\tyear         = 2017,\n\tbooktitle    = {IEEE Symposium on Security and Privacy},\n\tpages        = {39--57}\n}\n@inproceedings{carlson2010toward,\n\ttitle        = {Toward an architecture for never-ending language learning},\n\tauthor       = {Andrew Carlson and Justin Betteridge and Bryan Kisiel and Burr Settles and Estevam R Hruschka Jr and Tom M Mitchell},\n\tyear         = 2010,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{carlson2013brain,\n\ttitle        = {Brain-controlled wheelchairs: a robotic architecture},\n\tauthor       = {Tom Carlson and Jose del R Millan},\n\tyear         = 2013,\n\tjournal      = {IEEE Robotics \\& Automation Magazine (RAM)},\n\tvolume       = 20,\n\tpages        = {65--73}\n}\n@inproceedings{carlucci2019domain,\n\ttitle        = {Domain generalization by solving jigsaw puzzles},\n\tauthor       = {Fabio M Carlucci and Antonio D'Innocente and Silvia Bucci and Barbara Caputo and Tatiana Tommasi},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2229--2238}\n}\n@article{carmon2016accelerated,\n\ttitle        = {Accelerated methods for non-convex optimization},\n\tauthor       = {Carmon, Yair and Duchi, John C and Hinder, Oliver and Sidford, Aaron},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.00756}\n}\n@article{carmon2016gradient,\n\ttitle        = {Gradient Descent Efficiently Finds the Cubic-Regularized Non-Convex {N}ewton Step},\n\tauthor       = {Carmon, Yair and Duchi, John C},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.00547}\n}\n@article{carmon2017convex,\n\ttitle        = {Convex Until Proven Guilty: Dimension-Free Acceleration of Gradient 
Descent on Non-Convex Functions},\n\tauthor       = {Carmon, Yair and Duchi, John and Hinder, Oliver and Sidford Aaron},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.02766}\n}\n@inproceedings{carmon2019unlabeled,\n\ttitle        = {Unlabeled Data Improves Adversarial Robustness},\n\tauthor       = {Yair Carmon and Aditi Raghunathan and Ludwig Schmidt and Percy Liang and John C. Duchi},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{CarmonAGD,\n\ttitle        = {Accelerated Methods for Non-Convex Optimization},\n\tauthor       = {Yair Carmon and John C. Duchi and Oliver Hinder and Aaron Sidford},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 1611.00756}\n}\n@inproceedings{caron2018deep,\n\ttitle        = {Deep clustering for unsupervised learning of visual features},\n\tauthor       = {Caron, Mathilde and Bojanowski, Piotr and Joulin, Armand and Douze, Matthijs},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the European Conference on Computer Vision (ECCV)},\n\tpages        = {132--149}\n}\n@inproceedings{caron2019unsupervised,\n\ttitle        = {Unsupervised pre-training of image features on non-curated data},\n\tauthor       = {Caron, Mathilde and Bojanowski, Piotr and Mairal, Julien and Joulin, Armand},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the IEEE/CVF International Conference on Computer Vision},\n\tpages        = {2959--2968}\n}\n@inproceedings{caron2020swav,\n\ttitle        = {Unsupervised Learning of Visual Features by Contrasting Cluster Assignments},\n\tauthor       = {Mathilde Caron and Ishan Misra and Julien Mairal and Priya Goyal and Piotr Bojanowski and Armand Joulin},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tvolume       = 33,\n\tpages        = {9912--9924}\n}\n@article{caron2020unsupervised,\n\ttitle        = {Unsupervised learning of visual 
features by contrasting cluster assignments},\n\tauthor       = {Caron, Mathilde and Misra, Ishan and Mairal, Julien and Goyal, Priya and Bojanowski, Piotr and Joulin, Armand},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.09882},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 33,\n\tpages        = {9912--9924}\n}\n@book{carpenter98type,\n\ttitle        = {Type-Logical Semantics},\n\tauthor       = {Bob Carpenter},\n\tyear         = 1998,\n\tpublisher    = {MIT Press}\n}\n@inproceedings{carpentier2012bandit,\n\ttitle        = {Bandit theory meets compressed sensing for high dimensional stochastic linear bandit},\n\tauthor       = {Carpentier, Alexandra and Munos, R{\\'e}mi},\n\tyear         = 2012,\n\tbooktitle    = {Artificial Intelligence and Statistics},\n\tpages        = {190--198},\n\torganization = {PMLR}\n}\n@article{carpineto2012expansion,\n\ttitle        = {A survey of automatic query expansion in information retrieval},\n\tauthor       = {Claudio Carpineto and Giovanni Romano},\n\tyear         = 2012,\n\tjournal      = {ACM Computing Surveys (CSUR)},\n\tvolume       = 44\n}\n@article{carroll1970analysis,\n\ttitle        = {Analysis of individual differences in multidimensional scaling via an N-way generalization of ``Eckart-Young'' 
decomposition},\n\tauthor       = {Carroll, J Douglas and Chang, Jih-Jie},\n\tyear         = 1970,\n\tjournal      = {Psychometrika},\n\tpublisher    = {Springer},\n\tvolume       = 35,\n\tnumber       = 3,\n\tpages        = {283--319}\n}\n@book{carroll2006measurement,\n\ttitle        = {Measurement error in nonlinear models: a modern perspective},\n\tauthor       = {Raymond J Carroll and David Ruppert and Leonard A Stefanski and Ciprian M Crainiceanu},\n\tyear         = 2006,\n\tpublisher    = {Chapman and Hall/CRC}\n}\n@inproceedings{carroll92dependency,\n\ttitle        = {Two Experiments on Learning Probabilistic Dependency Grammars from Corpora},\n\tauthor       = {Glenn Carroll and Eugene Charniak},\n\tyear         = 1992,\n\tbooktitle    = {Workshop Notes for Statistically-Based NLP Techniques, AAAI},\n\tpages        = {1--13}\n}\n@book{carter2001foundations,\n\ttitle        = {Foundations of mathematical economics},\n\tauthor       = {Carter, Michael},\n\tyear         = 2001,\n\tpublisher    = {MIT press}\n}\n@inproceedings{cartis2009finding,\n\ttitle        = {Finding a point in the relative interior of a polyhedron, with applications to compressed sensing},\n\tauthor       = {Coralia Cartis and Gould Nicholas IM},\n\tyear         = 2009,\n\tbooktitle    = {SPARS'09-Signal Processing with Adaptive Sparse Structured Representations}\n}\n@article{cartisadaptive,\n\ttitle        = {Adaptive cubic regularisation methods for unconstrained optimization. Part I: motivation, convergence and numerical results},\n\tauthor       = {Cartis, Coralia and Gould, Nicholas IM and Toint, Philippe L},\n\tyear         = 2011,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 127,\n\tnumber       = 2,\n\tpages        = {245--295}\n}\n@article{cartisadaptive2,\n\ttitle        = {Adaptive cubic regularisation methods for unconstrained optimization. 
Part II: worst-case function-and derivative-evaluation complexity},\n\tauthor       = {Cartis, Coralia and Gould, Nicholas IM and Toint, Philippe L},\n\tyear         = 2011,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 130,\n\tnumber       = 2,\n\tpages        = {295--319}\n}\n@misc{cartwright2011number,\n\ttitle        = {The number of eigenvalues of a tensor},\n\tauthor       = {D. Cartwright and B. Sturmfels},\n\tyear         = 2013,\n\tjournal      = {Linear Algebra Appl.},\n\tvolume       = 438,\n\tnumber       = 2,\n\tpages        = {942--952}\n}\n@article{cartwright2013number,\n\ttitle        = {The number of eigenvalues of a tensor},\n\tauthor       = {Cartwright, Dustin and Sturmfels, Bernd},\n\tyear         = 2013,\n\tjournal      = {Linear algebra and its applications},\n\tpublisher    = {Elsevier},\n\tvolume       = 438,\n\tnumber       = 2,\n\tpages        = {942--952}\n}\n@article{CartwrightSturmfels2013,\n\ttitle        = {{The number of eigenvalues of a tensor}},\n\tauthor       = {Dustin Cartwright and Bernd Sturmfels},\n\tyear         = 2013,\n\tmonth        = jan,\n\tjournal      = {Linear Algebra and its Applications},\n\tvolume       = 438,\n\tnumber       = 2,\n\tpages        = {942--952}\n}\n@article{caruana2003outputs,\n\ttitle        = {Benefitting from the Variables that Variable Selection Discards},\n\tauthor       = {Rich Caruana and Virginia R. 
de Sa},\n\tyear         = 2003,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 3\n}\n@inproceedings{caruana2015intelligible,\n\ttitle        = {Intelligible models for healthcare: Predicting pneumonia risk and hospital 30-day readmission},\n\tauthor       = {Rich Caruana and Yin Lou and Johannes Gehrke and Paul Koch and Marc Sturm and Noemie Elhadad},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {1721--1730}\n}\n@article{caruana97multitask,\n\ttitle        = {Multitask learning},\n\tauthor       = {Rich Caruana},\n\tyear         = 1997,\n\tjournal      = {Machine Learning},\n\tvolume       = 28,\n\tpages        = {41--75}\n}\n@article{carvalho2010particle,\n\ttitle        = {{Particle Learning and Smoothing}},\n\tauthor       = {Carlos M. Carvalho and Michael S. Johannes and Hedibert F. Lopes and Nicholas G. Polson},\n\tyear         = 2010,\n\tjournal      = {Statistical Science},\n\tvolume       = 25,\n\tpages        = {88--106},\n\tdoi          = {10.1214/10-STS325},\n\tissue        = 2010\n}\n@book{casella1990statistical,\n\ttitle        = {Statistical Inference},\n\tauthor       = {George Casella and Roger L. 
Berger},\n\tyear         = 1990,\n\tpublisher    = {Wadsworth and Brooks}\n}\n@misc{casella1999monte,\n\ttitle        = {{M}onte {C}arlo statistical methods},\n\tauthor       = {Casella, George and Robert, Christian P},\n\tyear         = 1999,\n\tpublisher    = {Springer-Verlag, New York}\n}\n@article{castellano2020assured,\n\ttitle        = {Assured RL: Reinforcement Learning with Almost Sure Constraints},\n\tauthor       = {Castellano, Agustin and Bazerque, Juan and Mallada, Enrique},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.13036}\n}\n@inproceedings{castellon2021calm,\n\ttitle        = {Codified audio language modeling learns useful representations for music information retrieval},\n\tauthor       = {Rodrigo Castellon and Chris Donahue and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Society for Music Information Retrieval (ISMIR)}\n}\n@inproceedings{Catalyst2015,\n\ttitle        = {A Universal Catalyst for First-Order Optimization},\n\tauthor       = {Hongzhou Lin and Julien Mairal and Za{\\\"{\\i}}d Harchaoui},\n\tyear         = 2015,\n\tjournal      = {arXiv},\n\tbooktitle    = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada},\n\tpages        = {3384--3392},\n\tcrossref     = {DBLP:conf/nips/2015}\n}\n@article{cattell1944parallel,\n\ttitle        = {Parallel proportional profiles and other principles for determining the choice of factors by rotation},\n\tauthor       = {Cattell, R. 
B.},\n\tyear         = 1944,\n\tjournal      = {Psychometrika},\n\tpublisher    = {Springer},\n\tvolume       = 9,\n\tnumber       = 4,\n\tpages        = {267--283}\n}\n@article{cauchois2020knowing,\n\ttitle        = {Knowing what you know: valid confidence sets in multiclass and multilabel prediction},\n\tauthor       = {Cauchois, Maxime and Gupta, Suyash and Duchi, John},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.10181}\n}\n@article{cayci2021sample,\n\ttitle        = {Sample Complexity and Overparameterization Bounds for Projection-Free Neural {TD} Learning},\n\tauthor       = {Cayci, Semih and Satpathi, Siddhartha and He, Niao and Srikant, R},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.01391}\n}\n@inproceedings{cb18,\n\ttitle        = {On the Global Convergence of Gradient Descent for Over-parameterized Models using Optimal Transport},\n\tauthor       = {Chizat, Lenaic and Bach, Francis},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.09545},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpublisher    = {arXiv preprint arXiv:1805.09545}\n}\n@inproceedings{CDC2016,\n\ttitle        = {An online primal-dual method for discounted {M}arkov decision processes},\n\tauthor       = {Wang, Mengdi and Chen, Yichen},\n\tyear         = 2016,\n\tbooktitle    = {IEEE Conference of Decisions and Control}\n}\n@inproceedings{CDS,\n\ttitle        = {Atomic decomposition by basis pursuit},\n\tauthor       = {S. Chen and D. Donoho and M. Saunders},\n\tyear         = 1998,\n\tbooktitle    = {SIAM J. 
on Scientific Computing},\n\tpages        = {33--61}\n}\n@article{celeux1992classification,\n\ttitle        = {A classification {EM} algorithm for clustering and two stochastic versions},\n\tauthor       = {Gilles Celeux and Gérard Govaert},\n\tyear         = 1992,\n\tjournal      = {Computational Statistics \\& Data Analysis},\n\tvolume       = 14,\n\tpages        = {315--332}\n}\n@inproceedings{celikyilmaz2014resolving,\n\ttitle        = {Resolving Referring Expressions in Conversational Dialogs for Natural User Interfaces},\n\tauthor       = {Asli \\c{C}elikyilmaz and Zhaleh Feizollahi and Dilek Z. Hakkani-T{\\\"u}r and Ruhi Sarikaya},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{cen2020fast,\n\ttitle        = {Fast global convergence of natural policy gradient methods with entropy regularization},\n\tauthor       = {Cen, Shicong and Cheng, Chen and Chen, Yuxin and Wei, Yuting and Chi, Yuejie},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.06558}\n}\n@article{CeS08,\n\ttitle        = {Relaxed Alternating Projection Methods},\n\tauthor       = {Cegielski, Andrzej and Suchocka, Agnieszka},\n\tyear         = 2008,\n\tjournal      = {SIAM J. 
Optim.},\n\tvolume       = 19,\n\tnumber       = 3,\n\tpages        = {1093--1106},\n\tdoi          = {10.1137/070698750},\n\turl          = {http://dx.doi.org/10.1137/070698750},\n\teprint       = {http://dx.doi.org/10.1137/070698750},\n\tfjournal     = {SIAM Journal on Optimization}\n}\n@book{cesa2006prediction,\n\ttitle        = {Prediction, learning, and games},\n\tauthor       = {Cesa-Bianchi, Nicolo and Lugosi, G{\\'a}bor},\n\tyear         = 2006,\n\tpublisher    = {Cambridge university press},\n\taddress      = {Cambridge},\n\tdoi          = {10.1017/CBO9780511546921},\n\tisbn         = 9780511546921,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Cesa-Bianchi, Lugosi - 2006 - Prediction, Learning, and Games.pdf:pdf},\n\tmendeley-groups = {Books/Optimization}\n}\n@inproceedings{cesa2013online,\n\ttitle        = {Online learning with switching costs and other adaptive adversaries},\n\tauthor       = {Cesa-Bianchi, Nicolo and Dekel, Ofer and Shamir, Ohad},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1160--1168}\n}\n@article{cesabianchi05minimizing,\n\ttitle        = {Minimizing regret with label efficient prediction},\n\tauthor       = {Nicolò Cesa-Bianchi and Gábor Lugosi and Gilles Stoltz},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 51,\n\tpages        = {2152--2162}\n}\n@book{cesabianchi06prediction,\n\ttitle        = {Prediction, learning, and games},\n\tauthor       = {Nicolò Cesa-Bianchi and Gábor Lugosi},\n\tyear         = 2006,\n\tpublisher    = {Cambridge University Press}\n}\n@article{cesabianchi06regret,\n\ttitle        = {Regret Minimization Under Partial Monitoring},\n\tauthor       = {Nicolò Cesa-Bianchi and Gábor Lugosi and Gilles Stoltz},\n\tyear         = 2006,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 31,\n\tpages        = 
{562--580}\n}\n@article{cgcb14,\n\ttitle        = {Empirical evaluation of gated recurrent neural networks on sequence modeling},\n\tauthor       = {Chung, Junyoung and Gulcehre, Caglar and Cho, KyungHyun and Bengio, Yoshua},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.3555}\n}\n@article{CGLM08,\n\ttitle        = {Symmetric tensors and symmetric tensor rank},\n\tauthor       = {P. Comon and G. Golub and L.-H. Lim and B. Mourrain},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Matrix Analysis Appl.},\n\tvolume       = 30,\n\tnumber       = 3,\n\tpages        = {1254--1279}\n}\n@inproceedings{chaganty13regression,\n\ttitle        = {Spectral Experts for Estimating Mixtures of Linear Regressions},\n\tauthor       = {Arun Chaganty and Percy Liang},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{chaganty2013spectral,\n\ttitle        = {Spectral Experts for Estimating Mixtures of Linear Regressions.},\n\tauthor       = {Chaganty, Arun Tejasvi and Liang, Percy},\n\tyear         = 2013,\n\tbooktitle    = {ICML (3)},\n\tpages        = {1040--1048}\n}\n@inproceedings{chaganty2014graphical,\n\ttitle        = {Estimating Latent-Variable Graphical Models using Moments and Likelihoods},\n\tauthor       = {Arun Chaganty and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{chaganty2016perspectives,\n\ttitle        = {How Much is 131 Million Dollars? 
{P}utting Numbers in Perspective with Compositional Descriptions},\n\tauthor       = {Arun Tejasvi Chaganty and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{chaganty2017unbiased,\n\ttitle        = {Importance sampling for unbiased on-demand evaluation of knowledge base population},\n\tauthor       = {Arun Chaganty and Ashwin Paranjape and Percy Liang and Chris Manning},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{chaganty2018evaluation,\n\ttitle        = {The price of debiasing automatic metrics in natural language evaluation},\n\tauthor       = {Arun Chaganty and Stephen Mussmann and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{chai2004test,\n\ttitle        = {Test-cost sensitive naive {B}ayes classification},\n\tauthor       = {Xiaoyong Chai and Lin Deng and Qiang Yang and Charles X Ling},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Data Mining},\n\tpages        = {51--58}\n}\n@inproceedings{chai2005performance,\n\ttitle        = {Performance animation from low-dimensional control signals},\n\tauthor       = {Chai, Jinxiang and Hodgins, Jessica K.},\n\tyear         = 2005,\n\tbooktitle    = {ACM SIGGRAPH 2005 Papers},\n\tlocation     = {Los Angeles, California},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGGRAPH '05},\n\tpages        = {686--696},\n\tdoi          = {http://doi.acm.org/10.1145/1186822.1073248},\n\tacmid        = 1073248,\n\tkeywords     = {\n\t\tdimensionality reduction, lazy learning, local modeling, motion capture\n\n\t\tdata, online control of human motion, performance animation, vision-based\n\n\t\tinterface\n\t},\n\tnumpages     = 11,\n\towner        = {leili},\n\ttimestamp    = 
{2011.07.28}\n}\n@inproceedings{chaidaroon2017variational,\n\ttitle        = {Variational Deep Semantic Hashing for Text Documents},\n\tauthor       = {Suthee Chaidaroon and Yi Fang},\n\tyear         = 2017,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)},\n\tpages        = {75--84}\n}\n@inproceedings{chalkidis2020legal,\n\ttitle        = {{LEGAL-BERT}: ``Preparing the Muppets for Court''},\n\tauthor       = {Ilias Chalkidis and Manos Fergadiotis and Prodromos Malakasiotis and Nikolaos Aletras and Ion Androutsopoulos},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2898--2904}\n}\n@article{chaloner95bayesian,\n\ttitle        = {{B}ayesian Experimental Design: A Review},\n\tauthor       = {Kathryn Chaloner and Isabella Verdinelli},\n\tyear         = 1995,\n\tjournal      = {Statistical Science},\n\tvolume       = 10,\n\tpages        = {273--304}\n}\n@inproceedings{chambers2008narrative,\n\ttitle        = {Unsupervised Learning of Narrative Event Chains},\n\tauthor       = {Nathanael Chambers and Dan Jurafsky},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)}\n}\n@article{chambolle2011first,\n\ttitle        = {A first-order primal-dual algorithm for convex problems with applications to imaging},\n\tauthor       = {Chambolle, Antonin and Pock, Thomas},\n\tyear         = 2011,\n\tjournal      = {Journal of Mathematical Imaging and Vision},\n\tpublisher    = {Springer},\n\tvolume       = 40,\n\tnumber       = 1,\n\tpages        = {120--145},\n\tdoi          = {10.1007/s10851-010-0251-1},\n\tisbn         = 1085101002,\n\tissn         = {09249907},\n\tabstract     = {In this paper we study a first-order primal-dual algorithm for non-smooth convex optimization problems with known saddle-point structure. 
We prove convergence to a saddle-point with rate O(1/N) in finite dimensions for the complete class of problems. We further show accelerations of the proposed algorithm to yield improved rates on problems with some degree of smoothness. In particular we show that we can achieve O(1/N 2) convergence on problems, where the primal or the dual objective is uniformly convex, and we can show linear convergence, i.e. O($\\omega$ N ) for some $\\omega$∈(0,1), on smooth problems. The wide applicability of the proposed algorithm is demonstrated on several imaging problems such as image denoising, image deconvolution, image inpainting, motion estimation and multi-label image segmentation.},\n\tannote       = {Gives the full-gradient based accelerated algorithm for the saddle point problem.},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Chambolle, Pock - 2011 - A first-order primal-dual algorithm for convex problems with applications to imaging.pdf:pdf},\n\tkeywords     = {Convex optimization,Dual approaches,Image,Inverse problems,Reconstruction,Total variation},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@inproceedings{chandak2019learning,\n\ttitle        = {Learning action representations for reinforcement learning},\n\tauthor       = {Yash Chandak and Georgios Theocharous and James Kostas and Scott Jordan and Philip Thomas},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {941--950}\n}\n@inproceedings{chandrasekar1996motivations,\n\ttitle        = {Motivations and methods for text simplification},\n\tauthor       = {Raman Chandrasekar and Christine Doran and Bangalore Srinivas},\n\tyear         = 1996,\n\tbooktitle    = {Proceedings of the 16th conference on Computational linguistics-Volume 2},\n\tpages        = {1041--1044}\n}\n@article{chandrasekaran2011rank,\n\ttitle        = {Rank-sparsity incoherence for matrix decomposition},\n\tauthor       = {Venkat Chandrasekaran and 
Sujay Sanghavi and Pablo A. Parrilo and Alan S. Willsky},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {572--596}\n}\n@article{chandrasekaran2013,\n\ttitle        = {Computational and statistical tradeoffs via convex relaxation},\n\tauthor       = {Chandrasekaran, Venkat and Jordan, Michael I},\n\tyear         = 2013,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 110,\n\tnumber       = 13,\n\tpages        = {E1181--E1190}\n}\n@article{chandrasekaran2013computational,\n\ttitle        = {Computational and statistical tradeoffs via convex relaxation},\n\tauthor       = {Venkat Chandrasekaran and Michael I Jordan},\n\tyear         = 2013,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tvolume       = 110,\n\tnumber       = 13,\n\tpages        = {E1181--E1190}\n}\n@inproceedings{chandrasekaran2014finding,\n\ttitle        = {Finding a most biased coin with fewest flips},\n\tauthor       = {Karthekeyan Chandrasekaran and Richard Karp},\n\tyear         = 2014,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {394--407}\n}\n@inproceedings{chang07constraint,\n\ttitle        = {Guiding Semi-Supervision with Constraint-Driven Learning},\n\tauthor       = {Ming-Wei Chang and Lev Ratinov and Dan Roth},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {280--287}\n}\n@inproceedings{chang2005toward,\n\ttitle        = {Toward Large Scale Integration: Building a {M}eta{Q}uerier over Databases on the Web},\n\tauthor       = {Kevin Chen-Chuan Chang and Bin He and Zhen Zhang},\n\tyear         = 2005,\n\tbooktitle    = {Conference on Innovative Data Systems Research (CIDR)},\n\tpages        = {44--55}\n}\n@incollection{chang2007psvm,\n\ttitle        = {PSVM: Parallelizing Support Vector Machines on 
Distributed Computers},\n\tauthor       = {\n\t\tEdward Chang and Kaihua Zhu and Hao Wang and Hongjie Bai and Jian\n\n\t\tLi and Zhihuan Qiu and Hang Cui and Chang, Edward Y. and Zhu, Kaihua\n\n\t\tand Wang, Hao and Bai, Hongjie and Li, Jian and Qiu, Zhihuan\n\t},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 20,\n\tabstract     = {\n\t\tSupport Vector Machines ({SVMs}) suffer from a widely recognized scalability\n\n\t\tproblem in both memory use and computational time. To improve scalability,\n\n\t\twe have developed a parallel {SVM} algorithm ({PSVM}), which reduces\n\n\t\tmemory use through performing a row-based, approximate matrix factorization,\n\n\t\tand which loads only essential data to each machine to perform parallel\n\n\t\tcomputation. Let n denote the number of training instances, p the\n\n\t\treduced matrix dimension after factorization (p is significantly\n\n\t\tsmaller than n), and m the number of machines. {PSVM} reduces the\n\n\t\tmemory requirement from O(n2) to O(np=m), and improves computation\n\n\t\ttime to O(np2=m). Empirical study shows {PSVM} to be effective. 
{PSVM}\n\n\t\tOpen Source is available for download at http://code.google.com/p/psvm/.\n\t},\n\tciteulike-article-id = 3152638,\n\tkeywords     = {nips, parallel-computing, svm},\n\tposted-at    = {2008-08-25 04:44:30},\n\tpriority     = 2\n}\n@inproceedings{chang2008importance,\n\ttitle        = {Importance of Semantic Representation: Dataless Classification},\n\tauthor       = {Ming-Wei Chang and Lev-Arie Ratinov and Dan Roth and Vivek Srikumar},\n\tyear         = 2008,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{chang2010discriminative,\n\ttitle        = {Discriminative Learning over Constrained Latent Representations},\n\tauthor       = {Ming-Wei Chang and Dan Goldwasser and Dan Roth and Vivek Srikumar},\n\tyear         = 2010,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{chang2010structured,\n\ttitle        = {Structured output learning with indirect supervision},\n\tauthor       = {Ming-Wei Chang and Vivek Srikumar and Dan Goldwasser and Dan Roth},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {199--206}\n}\n@inproceedings{chang2012sutime,\n\ttitle        = {{SUT}ime: A library for recognizing and normalizing time expressions},\n\tauthor       = {Angel X Chang and Christopher Manning},\n\tyear         = 2012,\n\tbooktitle    = {Language Resources and Evaluation (LREC)},\n\tpages        = {3735--3740}\n}\n@inproceedings{chang2014scene,\n\ttitle        = {Learning Spatial Knowledge for Text to 3{D} Scene Generation},\n\tauthor       = {Angel X Chang and Manolis Savva and Christopher D Manning},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{chang2014tensor,\n\ttitle        = {Typed tensor decomposition of knowledge bases for relation extraction},\n\tauthor       = {Kai-Wei Chang and Wen-Tau Yih and 
Bishan Yang and Christopher Meek},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1568--1579}\n}\n@article{chang2015learning,\n\ttitle        = {Learning to search better than your teacher},\n\tauthor       = {Kai-Wei Chang and Akshay Krishnamurthy and Alekh Agarwal and Hal {Daum{\\'e} III} and John Langford},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{chang2015whitney,\n\ttitle        = {The {W}hitney extension theorem in high dimensions},\n\tauthor       = {Chang, Alan},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1508.01779}\n}\n@inproceedings{chang2017active,\n\ttitle        = {Active bias: Training more accurate neural networks by emphasizing high variance samples},\n\tauthor       = {Haw-Shiuan Chang and Erik Learned-Miller and Andrew McCallum},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1002--1012}\n}\n@inproceedings{chang2017affordable,\n\ttitle        = {Affordable On-Line Dialogue Policy Learning},\n\tauthor       = {Cheng Chang and Runzhe Yang and Lu Chen and Xiang Zhou and Kai Yu},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {223--231}\n}\n@article{Chang96,\n\ttitle        = {Full reconstruction of {M}arkov models on evolutionary trees: Identifiability and consistency},\n\tauthor       = {Joseph T. 
Chang},\n\tyear         = 1996,\n\tjournal      = {Mathematical Biosciences},\n\tvolume       = 137,\n\tpages        = {51--73}\n}\n@article{ChanHansen1990computing,\n\ttitle        = {{Computing truncated singular value decomposition least squares solutions by rank revealing QR-factorizations}},\n\tauthor       = {Chan, Tony F and Hansen, Per Christian},\n\tyear         = 1990,\n\tjournal      = {SIAM Journal on Scientific and Statistical Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 11,\n\tnumber       = 3,\n\tpages        = {519--530}\n}\n@inproceedings{chao2011towards,\n\ttitle        = {Towards grounding concepts for transfer in goal learning from demonstration},\n\tauthor       = {Crystal Chao and Maya Cakmak and Andrea L Thomaz},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Development and Learning (ICDL)},\n\tpages        = {1--6}\n}\n@book{chapelle2006semisupervised,\n\ttitle        = {Semi-Supervised Learning},\n\tauthor       = {O. Chapelle and A. Zien and B. 
Sch{\\\"o}lkopf},\n\tyear         = 2006,\n\tpublisher    = {MIT Press}\n}\n@inproceedings{chaplot2018gated,\n\ttitle        = {Gated-Attention Architectures for Task-Oriented Language Grounding},\n\tauthor       = {Devendra Singh Chaplot and Kanthashree Mysore Sathyendra and Rama Kumar Pasumarthi and Dheeraj Rajagopal and Ruslan Salakhutdinov},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{charikar2002similarity,\n\ttitle        = {Similarity estimation techniques from rounding algorithms},\n\tauthor       = {Moses S Charikar},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the thirty-fourth annual ACM symposium on Theory of computing},\n\tpages        = {380--388}\n}\n@inproceedings{charikar2017learning,\n\ttitle        = {Learning from Untrusted Data},\n\tauthor       = {Moses Charikar and Jacob Steinhardt and Gregory Valiant},\n\tyear         = 2017,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@inproceedings{CharikarLLM10,\n\ttitle        = {Vertex Sparsifiers and Abstract Rounding Algorithms},\n\tauthor       = {Moses Charikar and Tom Leighton and Shi Li and Ankur Moitra},\n\tyear         = 2010,\n\tbooktitle    = {51st Annual {IEEE} Symposium on Foundations of Computer Science, {FOCS} 2010, October 23-26, 2010, Las Vegas, Nevada, {USA}},\n\tpages        = {265--274},\n\tdoi          = {10.1109/FOCS.2010.32},\n\turl          = {http://doi.ieeecomputersociety.org/10.1109/FOCS.2010.32},\n\ttimestamp    = {Mon, 03 Nov 2014 22:22:11 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/focs/CharikarLLM10},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{charniak00maxent,\n\ttitle        = {A maximum-entropy-inspired parser},\n\tauthor       = {E. 
Charniak},\n\tyear         = 2000,\n\tbooktitle    = {Applied Natural Language Processing and North American Association for Computational Linguistics (ANLP/NAACL)},\n\tpages        = {132--139}\n}\n@inproceedings{charniak96treebank,\n\ttitle        = {Tree-bank Grammars},\n\tauthor       = {E. Charniak},\n\tyear         = 1996,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1031--1036}\n}\n@article{chatterjee1986influential,\n\ttitle        = {Influential observations, high leverage points, and outliers in linear regression},\n\tauthor       = {Samprit Chatterjee and Ali S Hadi},\n\tyear         = 1986,\n\tjournal      = {Statistical Science},\n\tpages        = {379--393}\n}\n@article{chatziafratis2019depth,\n\ttitle        = {Depth-width trade-offs for relu networks via sharkovsky's theorem},\n\tauthor       = {Chatziafratis, Vaggos and Nagarajan, Sai Ganesh and Panageas, Ioannis and Wang, Xiao},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.04378}\n}\n@inproceedings{chatziafratis2020better,\n\ttitle        = {Better depth-width trade-offs for neural networks through the lens of dynamical systems},\n\tauthor       = {Chatziafratis, Vaggos and Nagarajan, Sai Ganesh and Panageas, Ioannis},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1469--1478},\n\torganization = {PMLR}\n}\n@inproceedings{chaudhari2018stochastic,\n\ttitle        = {Stochastic gradient descent performs variational inference, converges to limit cycles for deep networks},\n\tauthor       = {Chaudhari, Pratik and Soatto, Stefano},\n\tyear         = 2018,\n\tbooktitle    = {2018 Information Theory and Applications Workshop (ITA)},\n\tpages        = {1--10},\n\torganization = {IEEE}\n}\n@article{chaudhari2019entropy,\n\ttitle        = {Entropy-sgd: Biasing gradient descent into wide valleys},\n\tauthor       = {Chaudhari, Pratik and Choromanska, Anna and 
Soatto, Stefano and LeCun, Yann and Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer and Sagun, Levent and Zecchina, Riccardo},\n\tyear         = 2019,\n\tjournal      = {Journal of Statistical Mechanics: Theory and Experiment},\n\tpublisher    = {IOP Publishing},\n\tvolume       = 2019,\n\tnumber       = 12,\n\tpages        = 124018\n}\n@inproceedings{chaudhuri2009multi,\n\ttitle        = {Multi-view clustering via canonical correlation analysis},\n\tauthor       = {Chaudhuri, Kamalika and Kakade, Sham M and Livescu, Karen and Sridharan, Karthik},\n\tyear         = 2009,\n\tbooktitle    = {ICML},\n\tpages        = {129--136}\n}\n@inproceedings{chaves07model,\n\ttitle        = {Dynamic Model Checking of Discourse Representation Structures with Pluralities},\n\tauthor       = {Rui Pedro Chaves},\n\tyear         = 2007,\n\tbooktitle    = {International Workshop on Computational Semantics}\n}\n@article{chawla2002smote,\n\ttitle        = {SMOTE: synthetic minority over-sampling technique},\n\tauthor       = {Nitesh V Chawla and Kevin W Bowyer and Lawrence O Hall and W Philip Kegelmeyer},\n\tyear         = 2002,\n\tjournal      = {Journal of artificial intelligence research},\n\tvolume       = 16,\n\tpages        = {321--357}\n}\n@article{chawla2004imbalanced,\n\ttitle        = {Editorial: Special Issue on Learning from Imbalanced Data Sets},\n\tauthor       = {Nitesh V. Chawla and Nathalie Japkowicz and Aleksander R. 
Kolcz},\n\tyear         = 2004,\n\tjournal      = {ACM SIGKDD Explorations Newsletter},\n\tvolume       = 6,\n\tnumber       = 1\n}\n@article{ChebyshevMethod-Axelsson1985,\n\ttitle        = {A survey of preconditioned iterative methods for linear systems of algebraic equations},\n\tauthor       = {Axelsson, Owe},\n\tyear         = 1985,\n\tjournal      = {BIT Numerical Mathematics},\n\tpublisher    = {Springer},\n\tvolume       = 25,\n\tnumber       = 1,\n\tpages        = {165--187}\n}\n@article{chechik2008max,\n\ttitle        = {Max-margin classification of data with absent features},\n\tauthor       = {Gal Chechik and Geremy Heitz and Gal Elidan and Pieter Abbeel and Daphne Koller},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 9,\n\tpages        = {1--21}\n}\n@article{chegireddy1987algorithms,\n\ttitle        = {Algorithms for finding {k}-best perfect matchings},\n\tauthor       = {Chandra R. Chegireddy and Horst W. Hamacher},\n\tyear         = 1987,\n\tjournal      = {Discrete applied mathematics},\n\tvolume       = 18,\n\tnumber       = 2,\n\tpages        = {155--165}\n}\n@article{chelba2013one,\n\ttitle        = {One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling},\n\tauthor       = {Ciprian Chelba and Tomas Mikolov and Mike Schuster and Qi Ge and Thorsten Brants and Phillipp Koehn and Tony Robinson},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.3005}\n}\n@inproceedings{chen08sportscast,\n\ttitle        = {Learning to Sportscast: A Test of Grounded Language Acquisition},\n\tauthor       = {David L. Chen and Raymond J. Mooney},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {128--135}\n}\n@inproceedings{chen11navigate,\n\ttitle        = {Learning to Interpret Natural Language Navigation Instructions from Observations},\n\tauthor       = {David L. Chen and Raymond J. 
Mooney},\n\tyear         = 2011,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {859--865}\n}\n@inproceedings{chen12lexicon,\n\ttitle        = {Fast Online Lexicon Learning for Grounded Language Acquisition},\n\tauthor       = {David L. Chen},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{chen2008energy,\n\ttitle        = {\n\t\tEnergy-aware server provisioning and load dispatching for connection-intensive\n\n\t\tinternet services\n\t},\n\tauthor       = {\n\t\tChen, Gong and He, Wenbo and Liu, Jie and Nath, Suman and Rigas,\n\n\t\tLeonidas and Xiao, Lin and Zhao, Feng\n\t},\n\tyear         = 2008,\n\tbooktitle    = {\n\t\tProceedings of the 5th USENIX Symposium on Networked Systems Design\n\n\t\tand Implementation\n\t},\n\tlocation     = {San Francisco, California},\n\tpublisher    = {USENIX Association},\n\taddress      = {Berkeley, CA, USA},\n\tseries       = {NSDI'08},\n\tpages        = {337--350},\n\tisbn         = {111-999-5555-22-1},\n\tacmid        = 1387613,\n\tnumpages     = 14\n}\n@article{chen2009settling,\n\ttitle        = {Settling the complexity of computing two-player {N}ash equilibria},\n\tauthor       = {Chen, Xi and Deng, Xiaotie and Teng, Shang-Hua},\n\tyear         = 2009,\n\tjournal      = {J. ACM},\n\tpublisher    = {ACM},\n\tvolume       = 56,\n\tnumber       = 3,\n\tpages        = 14,\n\tfjournal     = {Journal of the ACM}\n}\n@article{chen2010training,\n\ttitle        = {Training a multilingual sportscaster: Using perceptual context to learn language},\n\tauthor       = {D. L. Chen and J. Kim and R. J. 
Mooney},\n\tyear         = 2010,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 37,\n\tnumber       = 1,\n\tpages        = {397--436}\n}\n@article{chen2013completing,\n\ttitle        = {Completing any low-rank matrix, provably},\n\tauthor       = {Chen, Yudong and Bhojanapalli, Srinadh and Sanghavi, Sujay and Ward, Rachel},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1306.2979}\n}\n@article{chen2013robots,\n\ttitle        = {Robots for humanity: using assistive robotics to empower people with disabilities},\n\tauthor       = {T. L. Chen and M. Ciocarlie and S. Cousins and Phillip M. Grice and Kelsey P. Hawkins and K. Hsiao and Charles C. Kemp and C. King and Daniel A. Lazewatsky and A. Leeper and H. Nguyen and A. Paepcke and C. Pantofaru and W. Smart and L. Takayama},\n\tyear         = 2013,\n\tjournal      = {IEEE Robotics \\& Automation Magazine (RAM)},\n\tvolume       = 20,\n\tpages        = {30--39}\n}\n@article{chen2013robust,\n\ttitle        = {Robust High Dimensional Sparse Regression and Matching Pursuit},\n\tauthor       = {Yudong Chen and Constantine Caramanis and Shie Mannor},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@article{chen2014clustering,\n\ttitle        = {Clustering partially observed graphs via convex optimization},\n\tauthor       = {Yudong Chen and Ali Jalali and Sujay Sanghavi and Huan Xu},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 15,\n\tpages        = {2213--2238}\n}\n@article{chen2014improved,\n\ttitle        = {Improved graph clustering},\n\tauthor       = {Yudong Chen and Sujay Sanghavi and Huan Xu},\n\tyear         = 2014,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 60,\n\tnumber       = 10,\n\tpages        = {6440--6455}\n}\n@inproceedings{chen2014nndep,\n\ttitle        = {A Fast and Accurate Dependency Parser using Neural Networks},\n\tauthor       = {Danqi 
Chen and Christopher D. Manning},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{chen2014optimal,\n\ttitle        = {Optimal primal-dual methods for a class of saddle point problems},\n\tauthor       = {Chen, Yunmei and Lan, Guanghui and Ouyang, Yuyuan},\n\tyear         = 2014,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 24,\n\tnumber       = 4,\n\tpages        = {1779--1814}\n}\n@article{chen2014statistical,\n\ttitle        = {Statistical-Computational Tradeoffs in Planted Problems and Submatrix Localization with a Growing Number of Clusters and Submatrices},\n\tauthor       = {Yudong Chen and Jiaming Xu},\n\tyear         = 2014,\n\tjournal      = {arXiv}\n}\n@inproceedings{chen2015event,\n\ttitle        = {Event extraction via dynamic multi-pooling convolutional neural networks},\n\tauthor       = {Yubo Chen and Liheng Xu and Kang Liu and Daojian Zeng and Jun Zhao},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{chen2015fast,\n\ttitle        = {Fast low-rank estimation by projected gradient descent: General statistical and algorithmic guarantees},\n\tauthor       = {Chen, Yudong and Wainwright, Martin J},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1509.03025}\n}\n@inproceedings{chen2015solving,\n\ttitle        = {Solving random quadratic systems of equations is nearly as easy as solving linear systems},\n\tauthor       = {Chen, Yuxin and Candes, Emmanuel},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {739--747}\n}\n@article{chen2016dna,\n\ttitle        = {{DNA} methylation-based measures of biological age: meta-analysis predicting time to death},\n\tauthor       = {Brian H Chen and Riccardo E Marioni and Elena Colicino and Marjolein J Peters and Cavin K Ward-Caviness and Pei-Chien Tsai and Nicholas S 
Roetker and Allan C Just and Ellen W Demerath and Weihua Guan and others},\n\tyear         = 2016,\n\tjournal      = {Aging (Albany NY)},\n\tvolume       = 8,\n\tnumber       = 9\n}\n@article{chen2016enhancing,\n\ttitle        = {Enhancing and Combining Sequential and Tree {LSTM} for Natural Language Inference},\n\tauthor       = {Chen, Qian and Zhu, Xiaodan and Ling, Zhenhua and Wei, Si and Jiang, Hui},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{chen2016infogan,\n\ttitle        = {{InfoGAN}: Interpretable representation learning by information maximizing generative adversarial nets},\n\tauthor       = {Xi Chen and Yan Duan and Rein Houthooft and John Schulman and Ilya Sutskever and Pieter Abbeel},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{chen2016stochastic,\n\ttitle        = {Stochastic Primal-Dual Methods and Sample Complexity of Reinforcement Learning},\n\tauthor       = {Chen, Yichen and Wang, Mengdi},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.02516}\n}\n@inproceedings{chen2016thorough,\n\ttitle        = {A Thorough Examination of the {CNN} / {D}aily {M}ail Reading Comprehension Task},\n\tauthor       = {Danqi Chen and Jason Bolton and Christopher D. 
Manning},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{chen2017ead,\n\ttitle        = {{EAD}: Elastic-Net Attacks to Deep Neural Networks via Adversarial Examples},\n\tauthor       = {Pin-Yu Chen and Yash Sharma and Huan Zhang and Jinfeng Yi and Cho-Jui Hsieh},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{chen2017lower,\n\ttitle        = {Lower Bound On the Computational Complexity of Discounted Markov Decision Problems},\n\tauthor       = {Chen, Yichen and Wang, Mengdi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07312}\n}\n@inproceedings{chen2017no,\n\ttitle        = {No more discrimination: Cross city adaptation of road scene segmenters},\n\tauthor       = {Yi-Hsin Chen and Wei-Yu Chen and Yu-Ting Chen and Bo-Cheng Tsai and Yu-Chiang Frank Wang and Min Sun},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {1992--2001}\n}\n@inproceedings{chen2017reading,\n\ttitle        = {Reading {W}ikipedia to Answer Open-Domain Questions},\n\tauthor       = {Danqi Chen and Adam Fisch and Jason Weston and Antoine Bordes},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{chen2017targeted,\n\ttitle        = {Targeted backdoor attacks on deep learning systems using data poisoning},\n\tauthor       = {Xinyun Chen and Chang Liu and Bo Li and Kimberly Lu and Dawn Song},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.05526}\n}\n@article{chen2018adversarial,\n\ttitle        = {Adversarial Deep Averaging Networks for Cross-Lingual Sentiment Classification},\n\tauthor       = {Xilun Chen and Yu Sun and Ben Athiwaratkun and Claire Cardie and Kilian Weinberger},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1606.01614}\n}\n@article{chen2018closing,\n\ttitle        = {Closing the generalization gap of adaptive gradient 
methods in training deep neural networks},\n\tauthor       = {Chen, Jinghui and Gu, Quanquan},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.06763}\n}\n@inproceedings{chen2018my,\n\ttitle        = {Why Is My Classifier Discriminatory?},\n\tauthor       = {Irene Chen and Fredrik D Johansson and David Sontag},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3539--3550}\n}\n@inproceedings{chen2018planning,\n\ttitle        = {Planning with Trust for Human-Robot Collaboration},\n\tauthor       = {Min Chen and Stefanos Nikolaidis and Harold Soh and David Hsu and Siddhartha Srinivasa},\n\tyear         = 2018,\n\tbooktitle    = {ACM/IEEE International Conference on Human Robot Interaction (HRI)}\n}\n@inproceedings{Chen2018RecurrentNN,\n\ttitle        = {Recurrent Neural Networks as Weighted Language Recognizers},\n\tauthor       = {Yining Chen and Sorcha Gilroy and A. Maletti and Jonathan May and Kevin Knight},\n\tyear         = 2018,\n\tbooktitle    = {NAACL-HLT}\n}\n@article{chen2018statistical,\n\ttitle        = {Statistical inference for model parameters in stochastic gradient descent},\n\tauthor       = {Chen, Xi and Lee, Jason D and Tong, Xin T and Zhang, Yichen},\n\tyear         = 2020,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 48,\n\tnumber       = 1,\n\tpages        = {251--273}\n}\n@article{chen2019can,\n\ttitle        = {Can {AI} help reduce disparities in general medical and mental health care?},\n\tauthor       = {Irene Y Chen and Peter Szolovits and Marzyeh Ghassemi},\n\tyear         = 2019,\n\tjournal      = {AMA Journal of Ethics},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {167--179}\n}\n@inproceedings{chen2019evaluating,\n\ttitle        = {Evaluating Question Answering Evaluation},\n\tauthor       = {Anthony Chen and Gabriel Stanovsky and Sameer Singh and Matt 
Gardner},\n\tyear         = 2019,\n\tbooktitle    = {Workshop on Machine Reading for Question Answering (MRQA)}\n}\n@article{chen2019generalization,\n\ttitle        = {On generalization bounds of a family of recurrent neural networks},\n\tauthor       = {Chen, Minshuo and Li, Xingguo and Zhao, Tuo},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.12947}\n}\n@inproceedings{chen2019information,\n\ttitle        = {Information-Theoretic Considerations in Batch Reinforcement Learning},\n\tauthor       = {Chen, Jinglin and Jiang, Nan},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1042--1051}\n}\n@article{chen2019invariance,\n\ttitle        = {Invariance reduces Variance: Understanding Data Augmentation in Deep Learning and Beyond},\n\tauthor       = {Shuxiao Chen and Edgar Dobriban and Jane H Lee},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.10905}\n}\n@inproceedings{chen2019towards,\n\ttitle        = {Towards Understanding Limitations of Pixel Discretization Against Adversarial Attacks},\n\tauthor       = {Jiefeng Chen and Xi Wu and Vaibhav Rastogi and Yingyu Liang and Somesh Jha},\n\tyear         = 2019,\n\tbooktitle    = {IEEE European Symposium on Security and Privacy (EuroS\\&P)}\n}\n@inproceedings{chen2019understanding,\n\ttitle        = {Understanding Dataset Design Choices for Multi-hop Reasoning},\n\tauthor       = {Jifan Chen and Greg Durrett},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{chen2020active,\n\ttitle        = {Active Online Domain Adaptation},\n\tauthor       = {Chen, Yining and Luo, Haipeng and Ma, Tengyu and Zhang, Chicheng},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.14481}\n}\n@article{chen2020big,\n\ttitle        = {Big self-supervised models are strong semi-supervised learners},\n\tauthor       = {Chen, Ting and Kornblith, Simon and 
Swersky, Kevin and Norouzi, Mohammad and Hinton, Geoffrey},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.10029}\n}\n@article{chen2020concept,\n\ttitle        = {Concept Whitening for Interpretable Image Recognition},\n\tauthor       = {Zhi Chen and Yijie Bei and Cynthia Rudin},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.01650}\n}\n@article{chen2020distributed,\n\ttitle        = {Distributed Estimation for Principal Component Analysis: a Gap-free Approach},\n\tauthor       = {Chen, Xi and Lee, Jason D and Li, He and Yang, Yun},\n\tyear         = 2021,\n\tjournal      = {Journal of the American Statistical Association}\n}\n@article{chen2020ethical,\n\ttitle        = {Ethical Machine Learning in Health},\n\tauthor       = {Irene Y Chen and Emma Pierson and Sherri Rose and Shalmali Joshi and Kadija Ferryman and Marzyeh Ghassemi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.10576}\n}\n@article{chen2020exploring,\n\ttitle        = {Exploring Simple Siamese Representation Learning},\n\tauthor       = {Chen, Xinlei and He, Kaiming},\n\tyear         = 2020,\n\tmonth        = {June},\n\tjournal      = {arXiv preprint arXiv:2011.10566},\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {15750--15758}\n}\n@article{chen2020improved,\n\ttitle        = {Improved baselines with momentum contrastive learning},\n\tauthor       = {Chen, Xinlei and Fan, Haoqi and Girshick, Ross and He, Kaiming},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.04297}\n}\n@inproceedings{chen2020more,\n\ttitle        = {More Data Can Expand the Generalization Gap Between Adversarially Robust and Standard Models},\n\tauthor       = {Lin Chen and Yifei Min and Mingrui Zhang and Amin Karbasi},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{chen2020self,\n\ttitle        = 
{Self-training Avoids Using Spurious Features Under Domain Shift},\n\tauthor       = {Chen, Yining and Wei, Colin and Kumar, Ananya and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.10032}\n}\n@inproceedings{chen2020selftraining,\n\ttitle        = {Self-Training Avoids Using Spurious Features Under Domain Shift},\n\tauthor       = {Yining Chen and Colin Wei and Ananya Kumar and Tengyu Ma},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{chen2020simclr,\n\ttitle        = {A simple framework for contrastive learning of visual representations},\n\tauthor       = {Ting Chen and Simon Kornblith and Mohammad Norouzi and Geoffrey Hinton},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1597--1607}\n}\n@inproceedings{chen2020simple,\n\ttitle        = {A simple framework for contrastive learning of visual representations},\n\tauthor       = {Chen, Ting and Kornblith, Simon and Norouzi, Mohammad and Hinton, Geoffrey},\n\tyear         = 2020,\n\tmonth        = {13--18 Jul},\n\tbooktitle    = {International conference on machine learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 119,\n\tpages        = {1597--1607},\n\torganization = {PMLR}\n}\n@article{chen2020stationary,\n\ttitle        = {On Stationary-Point Hitting Time and Ergodicity of Stochastic Gradient Langevin Dynamics.},\n\tauthor       = {Chen, Xi and Du, Simon S and Tong, Xin T},\n\tyear         = 2020,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 21,\n\tnumber       = 68,\n\tpages        = {1--41}\n}\n@article{chen2020towards,\n\ttitle        = {Towards Understanding Hierarchical Learning: Benefits of Neural Representations},\n\tauthor       = {Chen, Minshuo and Bai, Yu and Lee, Jason D and Zhao, Tuo and Wang, Huan and Xiong, Caiming and 
Socher, Richard},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{chen2020uniter,\n\ttitle        = {Uniter: Universal image-text representation learning},\n\tauthor       = {Yen-Chun Chen and Linjie Li and Licheng Yu and Ahmed El Kholy and Faisal Ahmed and Zhe Gan and Yu Cheng and Jingjing Liu},\n\tyear         = 2020,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {104--120}\n}\n@inproceedings{chen2021active,\n\ttitle        = {Active Online Learning with Hidden Shifting Domains},\n\tauthor       = {Chen, Yining and Luo, Haipeng and Ma, Tengyu and Zhang, Chicheng},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {2053--2061},\n\torganization = {PMLR}\n}\n@article{chen2021adaprompt,\n\ttitle        = {AdaPrompt: Adaptive Prompt-based Finetuning for Relation Extraction},\n\tauthor       = {Chen, Xiang and Xie, Xin and Zhang, Ningyu and Yan, Jiahuan and Deng, Shumin and Tan, Chuanqi and Huang, Fei and Si, Luo and Chen, Huajun},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.07650}\n}\n@article{chen2021decisiontransformer,\n\ttitle        = {Decision Transformer: Reinforcement Learning via Sequence Modeling},\n\tauthor       = {Lili Chen and Kevin Lu and Aravind Rajeswaran and Kimin Lee and Aditya Grover and M. Laskin and P. Abbeel and A. Srinivas and Igor Mordatch},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.01345}\n}\n@inproceedings{chen2021generalizable,\n\ttitle        = {Learning Generalizable Robotic Reward Functions from \"In-The-Wild\" Human Videos},\n\tauthor       = {Annie S. 
Chen and Suraj Nair and Chelsea Finn},\n\tyear         = 2021,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{chen2021improved,\n\ttitle        = {Improved Corruption Robust Algorithms for Episodic Reinforcement Learning},\n\tauthor       = {Chen, Yifang and Du, Simon S and Jamieson, Kevin},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.06875}\n}\n@inproceedings{chen96smoothing,\n\ttitle        = {An Empirical Study of Smoothing Techniques for Language Modeling},\n\tauthor       = {Stanley F. Chen and Joshua Goodman},\n\tyear         = 1996,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{cheng2013feedback,\n\ttitle        = {Feedback-driven multiclass active learning for data streams},\n\tauthor       = {Yu Cheng and Zhengzhang Chen and Lu Liu and Jiang Wang and Ankit Agrawal and Alok Choudhary},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)},\n\tpages        = {1311--1320}\n}\n@inproceedings{cheng2015flock,\n\ttitle        = {Flock: Hybrid {Crowd-Machine} Learning Classifiers},\n\tauthor       = {Justin Cheng and Michael S Bernstein},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 18th {ACM} Conference on Computer Supported Cooperative Work \\& Social Computing},\n\tpages        = {600--611}\n}\n@inproceedings{cheng2016long,\n\ttitle        = {Long short-term memory-networks for machine reading},\n\tauthor       = {Jianpeng Cheng and Li Dong and Mirella Lapata},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{cheng2016sslnlp,\n\ttitle        = {Semi-supervised learning for neural machine translation},\n\tauthor       = {Y. Cheng and W. Xu and Z. He and W. He and H. Wu and M. Sun and Y. 
Liu},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{cheng2017learning,\n\ttitle        = {Learning Structured Natural Language Representations for Semantic Parsing},\n\tauthor       = {Jianpeng Cheng and Siva Reddy and Vijay Saraswat and Mirella Lapata},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{cheng2019end,\n\ttitle        = {End-to-end safe reinforcement learning through barrier functions for safety-critical continuous control tasks},\n\tauthor       = {Cheng, Richard and Orosz, G{\\'a}bor and Murray, Richard M and Burdick, Joel W},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the AAAI Conference on Artificial Intelligence},\n\tvolume       = 33,\n\tnumber       = {01},\n\tpages        = {3387--3395}\n}\n@inproceedings{cheng2019robust,\n\ttitle        = {Robust Neural Machine Translation with Doubly Adversarial Inputs},\n\tauthor       = {Yong Cheng and Lu Jiang and Wolfgang Macherey},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{cheng2019stochastic,\n\ttitle        = {Stochastic Gradient and Langevin Processes},\n\tauthor       = {Cheng, Xiang and Yin, Dong and Bartlett, Peter L and Jordan, Michael I},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.03215}\n}\n@inproceedings{cheng2020seq,\n\ttitle        = {{Seq2Sick}: Evaluating the Robustness of Sequence-to-Sequence Models with Adversarial Examples},\n\tauthor       = {Minhao Cheng and Jinfeng Yi and Huan Zhang and Pin-Yu Chen and Cho-Jui Hsieh},\n\tyear         = 2020,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{chentanez2005intrinsically,\n\ttitle        = {Intrinsically motivated reinforcement learning},\n\tauthor       = {N. Chentanez and A. G. Barto and S. P. 
Singh},\n\tyear         = 2005,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1281--1288}\n}\n@article{chevalier2013composition,\n\ttitle        = {Using concrete scales: A practical framework for effective visual depiction of complex measures},\n\tauthor       = {Fanny Chevalier and Romain Vuillemot and Guia Gali},\n\tyear         = 2013,\n\tjournal      = {IEEE Transactions on Visualization and Computer Graphics},\n\tvolume       = 19,\n\tpages        = {2426--2435}\n}\n@inproceedings{chevalierboisvert2019babyai,\n\ttitle        = {BabyAI: A Platform to Study the Sample Efficiency of Grounded Language Learning},\n\tauthor       = {Maxime Chevalier-Boisvert and Dzmitry Bahdanau and Salem Lahlou and Lucas Willems and Chitwan Saharia and Thien Huu Nguyen and Yoshua Bengio},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{chi2019nonconvex,\n\ttitle        = {Nonconvex optimization meets low-rank matrix factorization: An overview},\n\tauthor       = {Chi, Yuejie and Lu, Yue M and Chen, Yuxin},\n\tyear         = 2019,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tpublisher    = {IEEE},\n\tvolume       = 67,\n\tnumber       = 20,\n\tpages        = {5239--5269}\n}\n@article{chi99pcfg,\n\ttitle        = {Statistical Properties of Probabilistic Context-Free Grammars},\n\tauthor       = {Zhiyi Chi},\n\tyear         = 1999,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 25\n}\n@misc{chiachieri2013dictionary,\n\ttitle        = {Dictionary of Numbers},\n\tauthor       = {Glen Chiacchieri},\n\tyear         = 2013,\n\thowpublished = {\\url{http://www.dictionaryofnumbers.com/}}\n}\n@inproceedings{chiang2005hierarchical,\n\ttitle        = {A Hierarchical Phrase-Based Model for Statistical Machine Translation},\n\tauthor       = {David Chiang},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational 
Linguistics (ACL)},\n\tpages        = {263--270}\n}\n@article{chiappa2017recurrent,\n\ttitle        = {Recurrent environment simulators},\n\tauthor       = {Silvia Chiappa and S{\\'e}bastien Racaniere and Daan Wierstra and Shakir Mohamed},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.02254}\n}\n@inproceedings{chiappa2019path,\n\ttitle        = {Path-specific counterfactual fairness},\n\tauthor       = {Silvia Chiappa},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 33,\n\tpages        = {7801--7808}\n}\n@inproceedings{chin1987bayesian,\n\ttitle        = {{B}ayesian Belief Network Inference Using Simulation},\n\tauthor       = {Homer Chin and Gregory Cooper},\n\tyear         = 1987,\n\tbooktitle    = {Uncertainty in Artificial Intelligence 3 Annual Conference on Uncertainty in Artificial Intelligence (UAI-87)},\n\tpublisher    = {Elsevier Science},\n\taddress      = {Amsterdam, NL},\n\tpages        = {129--147}\n}\n@inproceedings{chin1987stochastic,\n\ttitle        = {Stochastic Simulation of {B}ayesian Belief Networks},\n\tauthor       = {Homer Chin and Gregory Cooper},\n\tyear         = 1987,\n\tbooktitle    = {Proceedings of the Third Conference Annual Conference on Uncertainty in Artificial Intelligence (UAI-87)},\n\tpublisher    = {Elsevier Science},\n\taddress      = {New York, NY},\n\tpages        = {106--113}\n}\n@inproceedings{chin2015stochastic,\n\ttitle        = {Stochastic block model and community detection in the sparse graphs: A spectral algorithm with optimal rate of recovery},\n\tauthor       = {Peter Chin and Anup Rao and Van Vu},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{ching2018opportunities,\n\ttitle        = {Opportunities and obstacles for deep learning in biology and medicine},\n\tauthor       = {Travers Ching and Daniel S Himmelstein and Brett K Beaulieu-Jones and Alexandr A Kalinin and 
Brian T Do and Gregory P Way and Enrico Ferrero and Paul-Michael Agapow and Michael Zietz and Michael M Hoffman and others},\n\tyear         = 2018,\n\tjournal      = {Journal of The Royal Society Interface},\n\tvolume       = 15,\n\tnumber       = 141\n}\n@inproceedings{chiu2020scaling,\n\ttitle        = {Scaling Hidden {Markov} Language Models},\n\tauthor       = {Justin T. Chiu and Alexander M. Rush},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{chizat2018note,\n\ttitle        = {A note on lazy training in supervised differentiable programming},\n\tauthor       = {Chizat, Lenaic and Bach, Francis},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.07956},\n\tvolume       = 8\n}\n@inproceedings{cho2009kernel,\n\ttitle        = {Kernel methods for deep learning},\n\tauthor       = {Cho, Youngmin and Saul, Lawrence K},\n\tyear         = 2009,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {342--350}\n}\n@article{cho2014gru,\n\ttitle        = {On the properties of neural machine translation: Encoder-decoder approaches},\n\tauthor       = {Kyunghyun Cho and Bart van Merri{\\\"e}nboer and Dzmitry Bahdanau and Yoshua Bengio},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1409.1259}\n}\n@inproceedings{cho2014statmt,\n\ttitle        = {Learning Phrase Representations using {RNN} Encoder-Decoder for Statistical Machine Translation},\n\tauthor       = {Kyunghyun Cho and Bart van Merrienboer and Caglar Gulcehre and Dzmitry Bahdanau and Fethi Bougares and Holger Schwenk and Yoshua Bengio},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1724--1734}\n}\n@inproceedings{cho2021unifying,\n\ttitle        = {Unifying vision-and-language tasks via text generation},\n\tauthor       = {Jaemin Cho and Jie Lei and Hao Tan and Mohit Bansal},\n\tyear         = 
2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{choe2016parsing,\n\ttitle        = {Parsing as Language Modeling},\n\tauthor       = {Do Kook Choe and Eugene Charniak},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{choey1997nonlinear,\n\ttitle        = {Nonlinear trading models through sharpe ratio maximization},\n\tauthor       = {Choey, Mark and Weigend, Andreas S},\n\tyear         = 1997,\n\tjournal      = {International Journal of Neural Systems},\n\tpublisher    = {World Scientific},\n\tvolume       = 8,\n\tnumber       = {04},\n\tpages        = {417--431}\n}\n@article{choi2007low,\n\ttitle        = {Low-power filtering via minimum power soft error cancellation},\n\tauthor       = {Jun Won Choi and Byonghyo Shim and Andrew C Singer and Nam Ik Cho},\n\tyear         = 2007,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 55,\n\tnumber       = 10,\n\tpages        = {5084--5096}\n}\n@article{choi2011inverse,\n\ttitle        = {Inverse reinforcement learning in partially observable environments},\n\tauthor       = {Jaedeug Choi and Kee-Eung Kim},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 12,\n\tpages        = {691--730}\n}\n@inproceedings{choi2017coarse,\n\ttitle        = {Coarse-to-Fine Question Answering for Long Documents},\n\tauthor       = {Eunsol Choi and Daniel Hewlett and Alexandre Lacoste and Illia Polosukhin and Jakob Uszkoreit and Jonathan Berant},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{choi2018contingency,\n\ttitle        = {Contingency-Aware Exploration in Reinforcement Learning},\n\tauthor       = {Jongwook Choi and Yijie Guo and Marcin Moczulski and Junhyuk Oh and Neal Wu and Mohammad Norouzi and Honglak Lee},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1811.01483}\n}\n@inproceedings{choi2018quac,\n\ttitle        = {{QuAC}: Question Answering in Context},\n\tauthor       = {Eunsol Choi and He He and Mohit Iyyer and Mark Yatskar and Wen-tau Yih and Yejin Choi and Percy Liang and Luke Zettlemoyer},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{choi2020reinforcement,\n\ttitle        = {Reinforcement learning for safety-critical control under model uncertainty, using control lyapunov functions and control barrier functions},\n\tauthor       = {Choi, Jason and Castaneda, Fernando and Tomlin, Claire J and Sreenath, Koushil},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.07584}\n}\n@manual{chollet2015keras,\n\ttitle        = {Keras},\n\tauthor       = {Fran\\c{c}ois Chollet},\n\tyear         = 2015,\n\thowpublished = {\\url{https://github.com/fchollet/keras}}\n}\n@article{chomsky56models,\n\ttitle        = {Three models for the description of language},\n\tauthor       = {Noam Chomsky},\n\tyear         = 1956,\n\tjournal      = {IRE Transactions on Information Theory},\n\tvolume       = 2,\n\tpages        = {113--124}\n}\n@inproceedings{choromanska2015loss,\n\ttitle        = {The Loss Surfaces of Multilayer Networks.},\n\tauthor       = {Choromanska, Anna and Henaff, Mikael and Mathieu, Michael and Arous, G{\\'e}rard Ben and LeCun, Yann},\n\tyear         = 2015,\n\tbooktitle    = {AISTATS}\n}\n@inproceedings{choromanska2015open,\n\ttitle        = {Open problem: The landscape of the loss surfaces of multilayer networks},\n\tauthor       = {Choromanska, Anna and LeCun, Yann and Arous, G{\\'e}rard Ben},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1756--1760}\n}\n@article{choudhary2020advancing,\n\ttitle        = {Advancing Medical Imaging Informatics by Deep Learning-Based Domain Adaptation},\n\tauthor       = {Choudhary, Anirudh and Tong, Li and Zhu, Yuanda and Wang, May 
D},\n\tyear         = 2020,\n\tjournal      = {Yearbook of medical informatics},\n\tpublisher    = {Thieme Medical Publishers},\n\tvolume       = 29,\n\tnumber       = 1,\n\tpages        = 129\n}\n@article{chouldechova2017,\n\ttitle        = {A study of bias in recidivism prediciton instruments},\n\tauthor       = {Alexandra Chouldechova},\n\tyear         = 2017,\n\tjournal      = {Big Data},\n\tpages        = {153--163}\n}\n@inproceedings{chouldechova2018case,\n\ttitle        = {A case study of algorithm-assisted decision making in child maltreatment hotline screening decisions},\n\tauthor       = {Alexandra Chouldechova and Diana Benavides-Prado and Oleksandr Fialko and Rhema Vaithianathan},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Fairness, Accountability and Transparency},\n\tpages        = {134--148}\n}\n@article{chouldechova2018frontiers,\n\ttitle        = {The frontiers of fairness in machine learning},\n\tauthor       = {Alexandra Chouldechova and Aaron Roth},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.08810}\n}\n@inproceedings{chow1957optimum,\n\ttitle        = {An optimum character recognition system using decision functions},\n\tauthor       = {C. K. 
Chow},\n\tyear         = 1957,\n\tbooktitle    = {IRE Transactions on Electronic Computers}\n}\n@article{chow1970optimum,\n\ttitle        = {On optimum recognition error and reject tradeoff},\n\tauthor       = {Chao K Chow},\n\tyear         = 1970,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 16,\n\tnumber       = 1,\n\tpages        = {41--46}\n}\n@article{chow1989complexity,\n\ttitle        = {The complexity of dynamic programming},\n\tauthor       = {Chow, Chee-Seng and Tsitsiklis, John N},\n\tyear         = 1989,\n\tjournal      = {Journal of complexity},\n\tpublisher    = {Elsevier},\n\tvolume       = 5,\n\tnumber       = 4,\n\tpages        = {466--488}\n}\n@inproceedings{chow2015risk,\n\ttitle        = {Risk-sensitive and robust decision-making: a cvar optimization approach},\n\tauthor       = {Chow, Yinlam and Tamar, Aviv and Mannor, Shie and Pavone, Marco},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1522--1530}\n}\n@article{chow2018lyapunov,\n\ttitle        = {A lyapunov-based approach to safe reinforcement learning},\n\tauthor       = {Chow, Yinlam and Nachum, Ofir and Duenez-Guzman, Edgar and Ghavamzadeh, Mohammad},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.07708},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{chow2019lyapunov,\n\ttitle        = {Lyapunov-based safe policy optimization for continuous control},\n\tauthor       = {Chow, Yinlam and Nachum, Ofir and Faust, Aleksandra and Duenez-Guzman, Edgar and Ghavamzadeh, Mohammad},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.10031}\n}\n@misc{christian2018translate,\n\ttitle        = {Why Is {G}oogle Translate Spitting Out Sinister Religious Prophecies?},\n\tauthor       = {Jon Christian},\n\tyear         = 2018,\n\thowpublished = 
{\\url{https://www.vice.com/en_us/article/j5npeg/why-is-google-translate-spitting-out-sinister-religious-prophecies}}\n}\n@article{christiano2014provably,\n\ttitle        = {Provably Manipulation-Resistant Reputation Systems},\n\tauthor       = {Paul Christiano},\n\tyear         = 2014,\n\tjournal      = {arXiv}\n}\n@article{christiano2016robust,\n\ttitle        = {Robust Collaborative Online Learning},\n\tauthor       = {Paul Christiano},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{christiano2017deep,\n\ttitle        = {Deep Reinforcement Learning from Human Preferences},\n\tauthor       = {Paul Christiano and Jan Leike and Tom B. Brown and Miljan Martic and Shane Legg and Dario Amodei},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{christie2018fmow,\n\ttitle        = {Functional Map of the World},\n\tauthor       = {Gordon Christie and Neil Fendley and James Wilson and Ryan Mukherjee},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{christmann2004robustness,\n\ttitle        = {On robustness properties of convex risk minimization methods for pattern recognition},\n\tauthor       = {Andreas Christmann and Ingo Steinwart},\n\tyear         = 2004,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 5,\n\tnumber       = {0},\n\tpages        = {1007--1034}\n}\n@article{chronopoulou2019transfer,\n\ttitle        = {An Embarrassingly Simple Approach for Transfer Learning from Pretrained Language Models},\n\tauthor       = {Alexandra Chronopoulou and Christos Baziotis and Alexandros Potamianos},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.10547}\n}\n@inproceedings{chu2006map,\n\ttitle        = {Map-Reduce for Machine Learning on Multicore},\n\tauthor       = {\n\t\tCheng-Tao Chu and Sang Kyun Kim and Yi-An Lin and YuanYuan Yu and\n\n\t\tGary R. 
Bradski and Andrew Y. Ng and Kunle Olukotun\n\t},\n\tyear         = 2006,\n\tbooktitle    = {{NIPS} 19},\n\tpublisher    = {MIT Press},\n\tpages        = {281--288},\n\teditor       = {Sch\\\"{o}lkopf, Bernhard and Platt, John C. and Hoffman, Thomas},\n\tciteulike-article-id = 2308503,\n\tkeywords     = {mapreduce, ml, parallel},\n\towner        = {leili},\n\tpriority     = {0},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{chu2011unbiased,\n\ttitle        = {Unbiased online active learning in data streams},\n\tauthor       = {Wei Chu and Martin Zinkevich and Lihong Li and Achint Thomas and Belle Tseng},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {195--203}\n}\n@inproceedings{chu2013haptic,\n\ttitle        = {Using Robotic Exploratory Procedures to Learn the Meaning of Haptic Adjectives},\n\tauthor       = {V. Chu and I. McMahon and L. Riano and C. McDonald and Q. He and J. Perez-Tejada and M. Arrigo and N. Fitter and J. Nappo and T. Darrell and others},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@article{chua2018deep,\n\ttitle        = {Deep reinforcement learning in a handful of trials using probabilistic dynamics models},\n\tauthor       = {Chua, Kurtland and Calandra, Roberto and McAllister, Rowan and Levine, Sergey},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.12114}\n}\n@article{chuang2020debiased,\n\ttitle        = {Debiased contrastive learning},\n\tauthor       = {Chuang, Ching-Yao and Robinson, Joshua and Yen-Chen, Lin and Torralba, Antonio and Jegelka, Stefanie},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.00224}\n}\n@inproceedings{Chudak2005,\n\ttitle        = {{Improved Approximation Schemes for Linear Programming Relaxations of Combinatorial Optimization Problems}},\n\tauthor       = {Chudak, Fabi\\'{a}n A. 
and Eleut\\'{e}rio, V\\^{a}nia},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 11th International IPCO Conference on Integer Programming and Combinatorial Optimization},\n\tpages        = {81--96},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Chudak, Eleut\\'{e}rio - 2005 - Improved Approximation Schemes for Linear Programming Relaxations of Combinatorial Optimization Problems.pdf:pdf},\n\tmendeley-groups = {Optimization/Multiplicative Weight/LP}\n}\n@article{chung1954stochastic,\n\ttitle        = {On a Stochastic Approximation Method},\n\tauthor       = {K. L. Chung},\n\tyear         = 1954,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 25,\n\tnumber       = 3,\n\tpages        = {463--483}\n}\n@book{chung1997spectral,\n\ttitle        = {Spectral graph theory},\n\tauthor       = {Chung, Fan RK and Graham, Fan Chung},\n\tyear         = 1997,\n\tpublisher    = {American Mathematical Soc.},\n\tnumber       = 92\n}\n@article{chung2006concentration,\n\ttitle        = {Concentration inequalities and martingale inequalities: a survey},\n\tauthor       = {Chung, Fan and Lu, Linyuan},\n\tyear         = 2006,\n\tjournal      = {Internet Mathematics},\n\tpublisher    = {Taylor \\& Francis},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {79--127}\n}\n@inproceedings{chung2018supervised,\n\ttitle        = {Supervised and unsupervised transfer learning for question answering},\n\tauthor       = {Yu-An Chung and Hung-Yi Lee and James Glass},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{chung2018voxceleb2,\n\ttitle        = {VoxCeleb2: Deep Speaker Recognition},\n\tauthor       = {Joon Son Chung and Arsha Nagrani and Andrew Zisserman},\n\tyear         = 2018,\n\tjournal      = {Proc. 
Interspeech},\n\tpages        = {1086--1090}\n}\n@article{church1990word,\n\ttitle        = {Word association norms, mutual information, and lexicography},\n\tauthor       = {Church, Kenneth Ward and Hanks, Patrick},\n\tyear         = 1990,\n\tjournal      = {Computational linguistics}\n}\n@article{cichocki2009fast,\n\ttitle        = {Fast local algorithms for large scale nonnegative matrix and tensor factorizations},\n\tauthor       = {Cichocki, Andrzej and Anh-Huy, PHAN},\n\tyear         = 2009,\n\tjournal      = {IEICE transactions on fundamentals of electronics, communications and computer sciences},\n\tpublisher    = {The Institute of Electronics, Information and Communication Engineers},\n\tvolume       = 92,\n\tnumber       = 3,\n\tpages        = {708--721}\n}\n@inproceedings{cideron2019selfeducated,\n\ttitle        = {Self-Educated Language Agent with Hindsight Experience Replay for Instruction Following},\n\tauthor       = {Geoffrey Cideron and Mathieu Seurin and Florian Strub and Olivier Pietquin},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{cidon2016cliffhanger,\n\ttitle        = {Cliffhanger: Scaling performance cliffs in web memory caches},\n\tauthor       = {Asaf Cidon and Assaf Eisenman and Mohammad Alizadeh and Sachin Katti},\n\tyear         = 2016,\n\tbooktitle    = {13th $\\{$USENIX$\\}$ Symposium on Networked Systems Design and Implementation ($\\{$NSDI$\\}$ 16)},\n\tpages        = {379--392}\n}\n@article{cifka2018,\n\ttitle        = {Eval all, trust a few, do wrong to none: Comparing sentence generation models},\n\tauthor       = {Ondřej Cífka and Aliaksei Severyn and Enrique Alfonseca and Katja Filippova},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.07972}\n}\n@article{ciga2020self,\n\ttitle        = {Self supervised contrastive learning for digital histopathology},\n\tauthor       = {Ozan Ciga and Anne L Martel and Tony Xu},\n\tyear         
= 2020,\n\tjournal      = {arXiv preprint arXiv:2011.13971}\n}\n@article{ciocarlie2009hand,\n\ttitle        = {Hand posture subspaces for dexterous robotic grasping},\n\tauthor       = {Matei T Ciocarlie and Peter K Allen},\n\tyear         = 2009,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 28,\n\tpages        = {851--867}\n}\n@article{ciresan2011high,\n\ttitle        = {High-Performance Neural Networks for Visual Object Classification},\n\tauthor       = {Dan C. Ciresan and Ueli Meier and Jonathan Masci and Luca M. Gambardella and Jurgen Schmidhuber},\n\tyear         = 2011,\n\tjournal      = {arXiv}\n}\n@inproceedings{cisse2017parseval,\n\ttitle        = {Parseval networks: Improving robustness to adversarial examples},\n\tauthor       = {Moustapha Cisse and Piotr Bojanowski and Edouard Grave and Yann Dauphin and Nicolas Usunier},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {854--863}\n}\n@article{CKKRS06,\n\ttitle        = {On the Hardness of Approximating Multicut and Sparsest-Cut},\n\tauthor       = {Chawla, Shuchi and Krauthgamer, Robert and Kumar, Ravi and Rabani, Yuval and Sivakumar, D.},\n\tyear         = 2006,\n\tmonth        = jun,\n\tjournal      = {Computational Complexity},\n\tpublisher    = {Birkhauser Verlag},\n\tvolume       = 15,\n\tnumber       = 2,\n\tpages        = {94--114},\n\tnumpages     = 21\n}\n@inproceedings{CKLS09,\n\ttitle        = {Multi-View Clustering via Canonical Correlation Analysis},\n\tauthor       = {K. Chaudhuri and S. M. Kakade and K. Livescu and K. Sridharan},\n\tyear         = 2009,\n\tbooktitle    = {ICML}\n}\n@inproceedings{CKMST2011,\n\ttitle        = {{Electrical flows, laplacian systems, and faster approximation of maximum flow in undirected graphs}},\n\tauthor       = {Christiano, Paul and Kelner, Jonathan A. and Madry, Aleksander and Spielman, Daniel A. 
and Teng, Shang-Hua},\n\tyear         = 2011,\n\tmonth        = oct,\n\tbooktitle    = {Proceedings of the 43rd annual ACM symposium on Theory of computing - STOC '11},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 273,\n\tdoi          = {10.1145/1993636.1993674},\n\tisbn         = 9781450306911,\n\tabstract     = {We introduce a new approach to computing an approximately maximum s-t flow in a capacitated, undirected graph. This flow is computed by solving a sequence of electrical flow problems. Each electrical flow is given by the solution of a system of linear equations in a Laplacian matrix, and thus may be approximately computed in nearly-linear time. Using this approach, we develop the fastest known algorithm for computing approximately maximum s-t flows. For a graph having n vertices and m edges, our algorithm computes a (1-$\\backslash$epsilon)-approximately maximum s-t flow in time $\\backslash$tilde\\{O\\}(mn\\^{}\\{1/3\\} $\\backslash$epsilon\\^{}\\{-11/3\\}). A dual version of our approach computes a (1+$\\backslash$epsilon)-approximately minimum s-t cut in time $\\backslash$tilde\\{O\\}(m+n\\^{}\\{4/3\\}$\\backslash$eps\\^{}\\{-8/3\\}), which is the fastest known algorithm for this problem as well. Previously, the best dependence on m and n was achieved by the algorithm of Goldberg and Rao (J. ACM 1998), which can be used to compute approximately maximum s-t flows in time $\\backslash$tilde\\{O\\}(m$\\backslash$sqrt\\{n\\}$\\backslash$epsilon\\^{}\\{-1\\}), and approximately minimum s-t cuts in time $\\backslash$tilde\\{O\\}(m+n\\^{}\\{3/2\\}$\\backslash$epsilon\\^{}\\{-3\\}).},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1010.2921},\n\teprint       = {1010.2921},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Christiano et al. 
- 2011 - Electrical flows, laplacian systems, and faster approximation of maximum flow in undirected graphs.pdf:pdf},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@inproceedings{clark01induction,\n\ttitle        = {Unsupervised induction of stochastic context free grammars with distributional clustering},\n\tauthor       = {Alexander Clark},\n\tyear         = 2001,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@article{clark05fsa,\n\ttitle        = {{PAC}-learnability of Probabilistic Deterministic Finite State Automata},\n\tauthor       = {Alexander Clark and Franck Thollard},\n\tyear         = 2005,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 5,\n\tpages        = {473--497}\n}\n@inproceedings{clark1975bridging,\n\ttitle        = {Bridging},\n\tauthor       = {Herbert H Clark},\n\tyear         = 1975,\n\tbooktitle    = {Workshop on theoretical issues in natural language processing},\n\tpages        = {169--174}\n}\n@article{clark1986collaborative,\n\ttitle        = {Referring as a Collaborative Process},\n\tauthor       = {Herbert H. Clark and Deanna Wilkes-Gibbs},\n\tyear         = 1986,\n\tjournal      = {Cognition},\n\tvolume       = 22\n}\n@book{clark1991grounding,\n\ttitle        = {Grounding in Communication},\n\tauthor       = {Herbert H. Clark and Susan E. Brennan},\n\tyear         = 1991,\n\tpublisher    = {Perspectives on Socially Shared Cognition},\n\tpages        = {127--149}\n}\n@article{clark1996using,\n\ttitle        = {Using language},\n\tauthor       = {Herbert H Clark},\n\tyear         = 1996,\n\tjournal      = {Cambridge University Press: Cambridge},\n\tvolume       = 952,\n\tpages        = {274--296}\n}\n@inproceedings{clark2015coref,\n\ttitle        = {Entity-Centric Coreference Resolution with Model Stacking},\n\tauthor       = {Kevin Clark and Christopher D. 
Manning},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{clark2016deep,\n\ttitle        = {Deep reinforcement learning for mention-ranking coreference models},\n\tauthor       = {Kevin Clark and Christopher D Manning},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.08667}\n}\n@article{clark2016my,\n\ttitle        = {My Computer is an Honor Student but how Intelligent is it? Standardized Tests as a Measure of {AI}},\n\tauthor       = {Peter Clark and Oren Etzioni},\n\tyear         = 2016,\n\tjournal      = {AI Magazine},\n\tvolume       = 37,\n\tnumber       = 1,\n\tpages        = {5--12}\n}\n@inproceedings{clark2018simple,\n\ttitle        = {Simple and effective multi-paragraph reading comprehension},\n\tauthor       = {Christopher Clark and Matt Gardner},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{clark2019aristo,\n\ttitle        = {From `{F}' to `{A}' on the {N.Y.} {R}egents Science Exams: An Overview of the {A}risto Project},\n\tauthor       = {Peter Clark and Oren Etzioni and Daniel Khashabi and Tushar Khot and Bhavana Dalvi Mishra and Kyle Richardson and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord and Niket Tandon and Sumithra Bhakthavatsalam and Dirk Groeneveld and Michal Guerquin and Michael Schmitz},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.01958}\n}\n@inproceedings{clark2019dont,\n\ttitle        = {Don't Take the Easy Way Out: Ensemble Based Methods for Avoiding Known Dataset Biases},\n\tauthor       = {Christopher Clark and Mark Yatskar and Luke Zettlemoyer},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{clark2020electra,\n\ttitle        = {Electra: Pre-training text encoders as discriminators rather than generators},\n\tauthor       = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V and Manning, 
Christopher D},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.10555}\n}\n@article{clark2020tydi,\n\ttitle        = {TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages},\n\tauthor       = {Jonathan H Clark and Eunsol Choi and Michael Collins and Dan Garrette and Tom Kwiatkowski and Vitaly Nikolaev and Jennimaria Palomaki},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.05002}\n}\n@inproceedings{clarke10world,\n\ttitle        = {Driving Semantic Parsing from the World's Response},\n\tauthor       = {James Clarke and Dan Goldwasser and Ming-Wei Chang and Dan Roth},\n\tyear         = 2010,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {18--27}\n}\n@article{clarke2005phantom,\n\ttitle        = {The phantom menace: Omitted variable bias in econometric research},\n\tauthor       = {Kevin A Clarke},\n\tyear         = 2005,\n\tjournal      = {Conflict management and peace science},\n\tvolume       = 22,\n\tnumber       = 4,\n\tpages        = {341--352}\n}\n@inproceedings{clarke2008diversity,\n\ttitle        = {Novelty and diversity in information retrieval evaluation},\n\tauthor       = {Charles L. A. Clarke and Maheedhar Kolla and Gordon V. 
Cormack and Olga Vechtomova and Azin Ashkan and Stefan Büttcher and Ian MacKinnon},\n\tyear         = 2008,\n\tbooktitle    = {ACM SIGIR}\n}\n@article{clarkson2010coresets,\n\ttitle        = {Coresets, sparse greedy approximation, and the Frank-Wolfe algorithm},\n\tauthor       = {Clarkson, Kenneth L},\n\tyear         = 2010,\n\tjournal      = {ACM Transactions on Algorithms (TALG)},\n\tpublisher    = {ACM},\n\tvolume       = 6,\n\tnumber       = 4,\n\tpages        = 63\n}\n@article{clarkson2012sublinear,\n\ttitle        = {Sublinear optimization for machine learning},\n\tauthor       = {Clarkson, Kenneth L and Hazan, Elad and Woodruff, David P},\n\tyear         = 2012,\n\tmonth        = oct,\n\tjournal      = {J. ACM},\n\tpublisher    = {ACM},\n\tvolume       = 59,\n\tnumber       = 5,\n\tpages        = 23,\n\tdoi          = {10.1145/2371656.2371658},\n\tissn         = {00045411},\n\tfjournal     = {Journal of the ACM},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Clarkson, Hazan, Woodruff - 2012 - Sublinear optimization for machine learning.pdf:pdf},\n\tmendeley-groups = {Algorithms/Computational Geometry}\n}\n@inproceedings{clarkson2013fast,\n\ttitle        = {The Fast Cauchy Transform and faster robust linear regression},\n\tauthor       = {Clarkson, Kenneth L and Drineas, Petros and Magdon-Ismail, Malik and Mahoney, Michael W and Meng, Xiangrui and Woodruff, David P},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the Twenty-Fourth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {466--477},\n\torganization = {SIAM}\n}\n@inproceedings{ClarksonWoodruf2013-SVD,\n\ttitle        = {{Low rank approximation and regression in input sparsity time}},\n\tauthor       = {Clarkson, Kenneth L. 
and Woodruff, David P.},\n\tyear         = 2013,\n\tbooktitle    = {STOC},\n\tpages        = {81--90}\n}\n@inproceedings{clavera2019model,\n\ttitle        = {Model-Augmented Actor-Critic: Backpropagating through Paths},\n\tauthor       = {Clavera, Ignasi and Fu, Yao and Abbeel, Pieter},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@inproceedings{cleverdon1962report,\n\ttitle        = {Report on the testing and analysis of an investigation into the comparative efficiency of indexing systems},\n\tauthor       = {Cyril W. Cleverdon},\n\tyear         = 1962,\n\tbooktitle    = {{ASLIB}}\n}\n@inproceedings{cleverdon1967cranfield,\n\ttitle        = {The Cranfield tests on index language devices},\n\tauthor       = {Cyril W. Cleverdon},\n\tyear         = 1967,\n\tbooktitle    = {{ASLIB}}\n}\n@inproceedings{clinchant2013aggregating,\n\ttitle        = {Aggregating Continuous Word Embeddings for Information Retrieval},\n\tauthor       = {Stéphane Clinchant and Florent Perronnin},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {100--109}\n}\n@inproceedings{clough2019global,\n\ttitle        = {Global and local interpretability for cardiac {MRI} classification},\n\tauthor       = {James R Clough and Ilkay Oksuz and Esther Puyol-Ant{\\'o}n and Bram Ruijsink and Andrew P King and Julia A Schnabel},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Medical Image Computing and Computer-Assisted Intervention},\n\tpages        = {656--664}\n}\n@misc{cmumotion,\n\ttitle        = {Motion capture database},\n\tauthor       = {CMU},\n\turl          = {http://mocap.cs.cmu.edu},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@misc{cmumulti,\n\ttitle        = {Multi-Modal Activity Database},\n\tauthor       = {CMU},\n\turl          = {http://kitchen.cs.cmu.edu}\n}\n@inproceedings{coates2011analysis,\n\ttitle        = {An analysis of 
single-layer networks in unsupervised feature learning},\n\tauthor       = {Coates, Adam and Ng, Andrew Y and Lee, Honglak},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {215--223}\n}\n@inproceedings{coates2011stl10,\n\ttitle        = {An analysis of single-layer networks in unsupervised feature learning},\n\tauthor       = {Adam Coates and Andrew Ng and Honglak Lee},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},\n\tvolume       = 15,\n\tpages        = {215--223}\n}\n@article{coates2012features,\n\ttitle        = {Learning Feature Representations with {K}-Means},\n\tauthor       = {Adam Coates and Andrew Y. Ng},\n\tyear         = 2012,\n\tjournal      = {Neural Networks: Tricks of the Trade - Second Edition},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {561--580}\n}\n@article{CoatesNgLee11,\n\ttitle        = {An Analysis of Single-Layer Networks in Unsupervised Feature Learning},\n\tauthor       = {A. Coates and H. Lee and A. Y. 
Ng},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research - Proceedings Track},\n\tvolume       = 15,\n\tpages        = {215--223}\n}\n@inproceedings{cobbe2020leveraging,\n\ttitle        = {Leveraging Procedural Generation to Benchmark Reinforcement Learning},\n\tauthor       = {Karl Cobbe and Christopher Hesse and Jacob Hilton and John Schulman},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{codella2019skin,\n\ttitle        = {Skin lesion analysis toward melanoma detection 2018: A challenge hosted by the international skin imaging collaboration (isic)},\n\tauthor       = {Noel Codella and Veronica Rotemberg and Philipp Tschandl and M Emre Celebi and Stephen Dusza and David Gutman and Brian Helba and Aadi Kalloo and Konstantinos Liopyris and Michael Marchetti and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.03368}\n}\n@inproceedings{cogill2015primal,\n\ttitle        = {Primal-dual algorithms for discounted {M}arkov decision processes},\n\tauthor       = {Cogill, Randy},\n\tyear         = 2015,\n\tbooktitle    = {Control Conference (ECC), 2015 European},\n\tpages        = {260--265},\n\torganization = {IEEE}\n}\n@article{cogill2016analysis,\n\ttitle        = {An Analysis of Primal-Dual Algorithms for Discounted Markov Decision Processes},\n\tauthor       = {Cogill, Randy},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1601.04175}\n}\n@inproceedings{cohan2016revisiting,\n\ttitle        = {Revisiting Summarization Evaluation for Scientific Articles},\n\tauthor       = {Arman Cohan and Nazli Goharian},\n\tyear         = 2016,\n\tbooktitle    = {Language Resources and Evaluation Conference (LREC)}\n}\n@inproceedings{cohen12pcfg,\n\ttitle        = {Spectral Learning of Latent-Variable {PCFG}s},\n\tauthor       = {Shay B. Cohen and Karl Stratos and Michael Collins and Dean P. 
Foster and Lyle Ungar},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{cohen1993nonnegative,\n\ttitle        = {Nonnegative Ranks, Decompositions, and Factorizations of Nonnegative Matrices},\n\tauthor       = {Cohen, Joel E and Rothblum, Uriel G},\n\tyear         = 1993,\n\tjournal      = {Linear Algebra and its Applications}\n}\n@inproceedings{cohen2002flexible,\n\ttitle        = {A flexible learning system for wrapping tables and lists in {HTML} documents},\n\tauthor       = {William W Cohen and Matthew Hurst and Lee S Jensen},\n\tyear         = 2002,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {232--241}\n}\n@proceedings{Cohen2008,\n\ttitle        = {Machine Learning, Proceedings of the Twenty-Fifth International Conference (ICML 2008), Helsinki, Finland, June 5-9, 2008},\n\tyear         = 2008,\n\tbooktitle    = {ICML},\n\tpublisher    = {ACM},\n\tseries       = {ACM International Conference Proceeding Series},\n\tvolume       = 307,\n\teditor       = {William W. Cohen and Andrew McCallum and Sam T. Roweis},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{cohen2012spectral,\n\ttitle        = {Spectral learning of latent-variable {PCFGs}},\n\tauthor       = {Cohen, Shay B. and Stratos, Karl and Collins, Michael and Foster, Dean P. 
and Ungar, Lyle},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Long Papers-Volume 1}\n}\n@inproceedings{cohen2013experiments,\n\ttitle        = {Experiments with Spectral Learning of Latent-Variable PCFGs},\n\tauthor       = {Shay B Cohen and Karl Stratos and Michael Collins and Dean P Foster and Lyle H Ungar},\n\tyear         = 2013,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {148--157}\n}\n@inproceedings{cohen2013pcfg,\n\ttitle        = {Approximate {PCFG} Parsing Using Tensor Decomposition},\n\tauthor       = {Shay B Cohen and Giorgio Satta and Michael Collins},\n\tyear         = 2013,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {487--496}\n}\n@inproceedings{Cohen2015dimensionality,\n\ttitle        = {Dimensionality reduction for k-means clustering and low rank approximation},\n\tauthor       = {Cohen, Michael B. 
and Elder, Sam and Musco, Cameron and Musco, Christopher and Persu, Madalina},\n\tyear         = 2015,\n\tbooktitle    = {STOC},\n\tpages        = {163--172},\n\torganization = {ACM}\n}\n@article{cohen2015ridge,\n\ttitle        = {Ridge Leverage Scores for Low-Rank Approximation},\n\tauthor       = {Cohen, Michael B and Musco, Cameron and Musco, Christopher},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.07263}\n}\n@inproceedings{cohen2015uniform,\n\ttitle        = {Uniform sampling for matrix approximation},\n\tauthor       = {Cohen, Michael B and Lee, Yin Tat and Musco, Cameron and Musco, Christopher and Peng, Richard and Sidford, Aaron},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 2015 Conference on Innovations in Theoretical Computer Science},\n\tpages        = {181--190}\n}\n@inproceedings{cohen2016convolutional,\n\ttitle        = {Convolutional rectifier networks as generalized tensor decompositions},\n\tauthor       = {Cohen, Nadav and Shashua, Amnon},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {955--963},\n\torganization = {PMLR}\n}\n@inproceedings{cohen2016expressive,\n\ttitle        = {On the expressive power of deep learning: A tensor analysis},\n\tauthor       = {Cohen, Nadav and Sharir, Or and Shashua, Amnon},\n\tyear         = 2016,\n\tbooktitle    = {Conference on learning theory},\n\tpages        = {698--728},\n\torganization = {PMLR}\n}\n@inproceedings{cohen2017inducing,\n\ttitle        = {Inducing Regular Grammars Using Recurrent Neural Networks},\n\tauthor       = {Mor Cohen and Avi Caciularu and Idan Rejwan and Jonathan Berant},\n\tyear         = 2017,\n\tbooktitle    = {Workshop on Learning and Reasoning: Principles \\& Applications to Everyday Spatial and Temporal Knowledge}\n}\n@inproceedings{cohen2019certified,\n\ttitle        = {Certified adversarial robustness via randomized smoothing},\n\tauthor       = {Jeremy M Cohen and Elan 
Rosenfeld and J Zico Kolter},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@misc{cohen2021gradient,\n\ttitle        = {Gradient Descent on Neural Networks Typically Occurs at the Edge of Stability},\n\tauthor       = {Jeremy M. Cohen and Simran Kaur and Yuanzhi Li and J. Zico Kolter and Ameet Talwalkar},\n\tyear         = 2021,\n\teprint       = {2103.00065},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{CohenKPPRSV17,\n\ttitle        = {Almost-linear-time algorithms for Markov chains and new spectral primitives for directed graphs},\n\tauthor       = {Michael B. Cohen and Jonathan A. Kelner and John Peebles and Richard Peng and Anup B. Rao and Aaron Sidford and Adrian Vladu},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 49th Annual {ACM} {SIGACT} Symposium on Theory of Computing, {STOC} 2017, Montreal, QC, Canada, June 19-23, 2017},\n\tpages        = {410--419}\n}\n@inproceedings{cohn09ptsg,\n\ttitle        = {Inducing Compact but Accurate Tree-Substitution Grammars},\n\tauthor       = {Trevor Cohn and Sharon Goldwater and Phil Blunsom},\n\tyear         = 2009,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {548--556}\n}\n@inproceedings{cohn1994active,\n\ttitle        = {Active Learning with Statistical Models},\n\tauthor       = {David A. Cohn and Zoubin Ghahramani and Michael I. 
Jordan},\n\tyear         = 1994,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{cohn2008sentence,\n\ttitle        = {Sentence Compression Beyond Word Deletion},\n\tauthor       = {Trevor Cohn and Mirella Lapata},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {137--144}\n}\n@article{coja2002coloring,\n\ttitle        = {Coloring {K}-colorable semirandom graphs in polynomial expected time via semidefinite programming},\n\tauthor       = {Amin Coja-Oghlan},\n\tyear         = 2002,\n\tjournal      = {Mathematical Foundations of Computer Science},\n\tpages        = {201--211}\n}\n@article{coja2002finding,\n\ttitle        = {Finding sparse induced subgraphs of semirandom graphs},\n\tauthor       = {Amin Coja-Oghlan},\n\tyear         = 2002,\n\tjournal      = {Randomization and Approximation Techniques in Computer Science},\n\tpages        = {139--148}\n}\n@article{coja2004coloring,\n\ttitle        = {Coloring semirandom graphs optimally},\n\tauthor       = {Amin Coja-Oghlan},\n\tyear         = 2004,\n\tjournal      = {Automata, Languages and Programming},\n\tpages        = {71--100}\n}\n@article{coja2007solving,\n\ttitle        = {Solving {NP}-hard semirandom graph problems in polynomial expected time},\n\tauthor       = {Amin Coja-Oghlan},\n\tyear         = 2007,\n\tjournal      = {Journal of Algorithms},\n\tvolume       = 62,\n\tnumber       = 1,\n\tpages        = {19--46}\n}\n@inproceedings{colas2020language,\n\ttitle        = {Language as a Cognitive Tool to Imagine Goals in Curiosity Driven Exploration},\n\tauthor       = {C{\\'{e}}dric Colas and Tristan Karch and Nicolas Lair and Jean-Michel Dussoux and Cl{\\'{e}}ment Moulin-Frier and Peter F. 
Dominey and Pierre-Yves Oudeyer},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{colby1971artificial,\n\ttitle        = {Artificial paranoia},\n\tauthor       = {Kenneth Mark Colby and Sylvia Weber and Franklin Dennis Hilf},\n\tyear         = 1971,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {1--25}\n}\n@article{cole2021does,\n\ttitle        = {When Does Contrastive Visual Representation Learning Work?},\n\tauthor       = {Cole, Elijah and Yang, Xuan and Wilber, Kimberly and Mac Aodha, Oisin and Belongie, Serge},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2105.05837}\n}\n@inproceedings{coleman2020selection,\n\ttitle        = {Selection via Proxy: Efficient Data Selection for Deep Learning},\n\tauthor       = {Cody Coleman and Christopher Yeh and Stephen Mussmann and Baharan Mirzasoleiman and Peter Bailis and Percy Liang and Jure Leskovec and Matei Zaharia},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{collado1997estimating,\n\ttitle        = {Estimating dynamic models from time series of independent cross-sections},\n\tauthor       = {M Dolores Collado},\n\tyear         = 1997,\n\tjournal      = {Journal of Econometrics},\n\tvolume       = 82,\n\tnumber       = 1,\n\tpages        = {37--62}\n}\n@inproceedings{collins01theory,\n\ttitle        = {Parameter Estimation for Statistical Parsing Models: Theory and Practice of Distribution-Free Methods},\n\tauthor       = {Michael Collins},\n\tyear         = 2001,\n\tbooktitle    = {International Workshop on Parsing Technologies}\n}\n@inproceedings{collins02perceptron,\n\ttitle        = {Discriminative Training Methods for Hidden {M}arkov Models: Theory and Experiments with {P}erceptron Algorithms},\n\tauthor       = {Michael Collins},\n\tyear         = 2002,\n\tbooktitle    = {Empirical Methods in 
Natural Language Processing (EMNLP)}\n}\n@article{collins08exponentiated,\n\ttitle        = {Exponentiated Gradient Algorithms for Conditional Random Fields and Max-Margin {M}arkov Networks},\n\tauthor       = {M. Collins and Amir Globerson and Terry Koo and Xavier Carreras and Peter Bartlett},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 9\n}\n@article{collins2005discriminative,\n\ttitle        = {Discriminative reranking for natural language parsing},\n\tauthor       = {Michael Collins and Terry Koo},\n\tyear         = 2005,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 31,\n\tnumber       = 1,\n\tpages        = {25--70}\n}\n@inproceedings{collins2008towards,\n\ttitle        = {Towards scalable dataset construction: An active learning approach},\n\tauthor       = {Brendan Collins and Jia Deng and Kai Li and Li Fei-Fei},\n\tyear         = 2008,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {86--98}\n}\n@article{collins2008transforming,\n\ttitle        = {Transforming environmental health protection},\n\tauthor       = {Francis S Collins and George M Gray and John R Bucher},\n\tyear         = 2008,\n\tjournal      = {Science},\n\tvolume       = 319,\n\tnumber       = 5865\n}\n@phdthesis{collins99thesis,\n\ttitle        = {Head-Driven Statistical Models for Natural Language Parsing},\n\tauthor       = {M. 
Collins},\n\tyear         = 1999,\n\tschool       = {University of Pennsylvania}\n}\n@inproceedings{collins99unsupervised,\n\ttitle        = {Unsupervised Models for Named Entity Classification},\n\tauthor       = {Michael Collins and Yoram Singer},\n\tyear         = 1999,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{collobert11scratch,\n\ttitle        = {Natural Language Processing (almost) from Scratch},\n\tauthor       = {Ronan Collobert and Jason Weston and Leon Bottou and Michael Karlen and Koray Kavukcuoglu and Pavel Kuksa},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 12,\n\tpages        = {2493--2537}\n}\n@inproceedings{collobert2002parallel,\n\ttitle        = {A {P}arallel {M}ixture of {SVM}s for {V}ery {L}arge {S}cale {P}roblems},\n\tauthor       = {Collobert, R. and Bengio, S. and Bengio, Y.},\n\tyear         = 2002,\n\tbooktitle    = {{NIPS}},\n\tpublisher    = {MIT Press},\n\teditor       = {Dietterich, T. G. and Becker, S. and Ghahramani, Z.},\n\tdetails      = {http://infoscience.epfl.ch/record/82786},\n\tdocumenturl  = {ftp://ftp.idiap.ch/pub/reports/2002/collobert_2002_nips.pdf},\n\tkeywords     = {learning},\n\toai-id       = {oai:infoscience.epfl.ch:82786},\n\toai-set      = {conf},\n\tunit         = {LIDIAP}\n}\n@inproceedings{collobert2008unified,\n\ttitle        = {A Unified Architecture for Natural Language Processing: Deep Neural Networks with Multitask Learning},\n\tauthor       = {Collobert, Ronan and Weston, Jason},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 25th International Conference on Machine Learning},\n\tpages        = {160--167}\n}\n@inproceedings{colohan2006tolerating,\n\ttitle        = {Tolerating Dependences Between Large Speculative Threads Via Sub-Threads},\n\tauthor       = {\n\t\tColohan, Christopher B. 
and Ailamaki, Anastassia and Steffan, J.\n\n\t\tGregory and Mowry, Todd C.\n\t},\n\tyear         = 2006,\n\tbooktitle    = {\n\t\tProceedings of the 33rd annual international symposium on Computer\n\n\t\tArchitecture\n\t},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {ISCA '06},\n\tpages        = {216--226},\n\tdoi          = {http://dx.doi.org/10.1109/ISCA.2006.43},\n\tisbn         = {0-7695-2608-X},\n\tacmid        = 1136504,\n\tnumpages     = 11\n}\n@article{comfort1969test,\n\ttitle        = {Test-battery to measure ageing-rate in man},\n\tauthor       = {Alex Comfort},\n\tyear         = 1969,\n\tjournal      = {The Lancet},\n\tvolume       = 294,\n\tnumber       = 7635,\n\tpages        = {1411--1415}\n}\n@book{Comon:book,\n\ttitle        = {Handbook of Blind Source Separation: Independent Component Analysis and Applications},\n\tauthor       = {Comon, P. and Jutten, C.},\n\tyear         = 2010,\n\tpublisher    = {Elsevier},\n\tseries       = {Academic Press}\n}\n@article{comon2002tensor,\n\ttitle        = {Tensor decompositions},\n\tauthor       = {P. Comon},\n\tyear         = 2002,\n\tjournal      = {Mathematics in Signal Processing V},\n\tpublisher    = {Oxford, UK: Clarendon},\n\tpages        = {1--24}\n}\n@article{comon2009tensor,\n\ttitle        = {Tensor decompositions, alternating least squares and other tales},\n\tauthor       = {P. Comon and X. Luciani and A. De Almeida},\n\tyear         = 2009,\n\tjournal      = {Journal of Chemometrics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 23,\n\tnumber       = {7-8},\n\tpages        = {393--405}\n}\n@article{Comon94,\n\ttitle        = {Independent Component Analysis, a new concept?},\n\tauthor       = {P. 
Comon},\n\tyear         = 1994,\n\tjournal      = {Signal Processing},\n\tvolume       = 36,\n\tnumber       = 3,\n\tpages        = {287--314}\n}\n@book{ComonJuttenICA,\n\ttitle        = {Handbook of Blind Source Separation: Independent Component Analysis and Applications},\n\tauthor       = {P. Comon and C. Jutten},\n\tyear         = 2010,\n\tpublisher    = {Academic Press. Elsevier}\n}\n@inproceedings{ConcentrationProjections,\n\ttitle        = {A concentration theorem for projections},\n\tauthor       = {Sanjoy Dasgupta and Daniel Hsu and Nakul Verma},\n\tyear         = 2006,\n\tbooktitle    = {Twenty-Second Conference on Uncertainty in Artificial Intelligence}\n}\n@article{condon1992complexity,\n\ttitle        = {The complexity of stochastic games},\n\tauthor       = {Condon, Anne},\n\tyear         = 1992,\n\tjournal      = {Information and Computation},\n\tpublisher    = {Elsevier},\n\tvolume       = 96,\n\tnumber       = 2,\n\tpages        = {203--224}\n}\n@article{condon2001algorithms,\n\ttitle        = {Algorithms for graph partitioning on the planted partition model},\n\tauthor       = {Anne Condon and Richard M. 
Karp},\n\tyear         = 2001,\n\tjournal      = {Random Structures and Algorithms},\n\tpages        = {116--140}\n}\n@inproceedings{cong2005parallel,\n\ttitle        = {Parallel mining of closed sequential patterns},\n\tauthor       = {Cong, Shengnan and Han, Jiawei and Padua, David},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the eleventh ACM SIGKDD international conference on\n\n\t\tKnowledge discovery in data mining\n\t},\n\tlocation     = {Chicago, Illinois, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '05},\n\tpages        = {562--567},\n\tdoi          = {http://doi.acm.org/10.1145/1081870.1081937},\n\tisbn         = {1-59593-135-X},\n\tacmid        = 1081937,\n\tkeywords     = {load balancing, parallel algorithms, sampling},\n\tnumpages     = 6\n}\n@article{conneau2017supervised,\n\ttitle        = {Supervised learning of universal sentence representations from natural language inference data},\n\tauthor       = {Alexis Conneau and Douwe Kiela and Holger Schwenk and Loic Barrault and Antoine Bordes},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.02364}\n}\n@article{conneau2017word,\n\ttitle        = {Word Translation Without Parallel Data},\n\tauthor       = {Alexis Conneau and Guillaume Lample and Marc'Aurelio Ranzato and Ludovic Denoyer and Herve Jegou},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{conneau2018senteval,\n\ttitle        = {SentEval: An Evaluation Toolkit for Universal Sentence Representations},\n\tauthor       = {Alexis Conneau and Douwe Kiela},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.05449}\n}\n@inproceedings{conneau2018xnli,\n\ttitle        = {XNLI: Evaluating Cross-lingual Sentence Representations},\n\tauthor       = {Alexis Conneau and Ruty Rinott and Guillaume Lample and Adina Williams and Samuel Bowman and Holger Schwenk and Veselin Stoyanov},\n\tyear         = 2018,\n\tbooktitle    = {Empirical 
Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2475--2485}\n}\n@inproceedings{conneau2019cross,\n\ttitle        = {Cross-lingual language model pretraining},\n\tauthor       = {Alexis Conneau and Guillaume Lample},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {7059--7069}\n}\n@inproceedings{conroy2008mind,\n\ttitle        = {Mind the Gap : Dangers of Divorcing Evaluations of Summary Content from Linguistic Quality},\n\tauthor       = {John M Conroy and Hoa Trang Dang},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {145--152}\n}\n@inproceedings{conti2014interface,\n\ttitle        = {Interface design and control strategies for a robot assisted ultrasonic examination system},\n\tauthor       = {Fran{\\c{c}}ois Conti and Jaeheung Park and Oussama Khatib},\n\tyear         = 2014,\n\tbooktitle    = {Experimental Robotics},\n\tpages        = {97--113}\n}\n@article{conti2017improving,\n\ttitle        = {Improving exploration in evolution strategies for deep reinforcement learning via a population of novelty-seeking agents},\n\tauthor       = {Conti, Edoardo and Madhavan, Vashisht and Such, Felipe Petroski and Lehman, Joel and Stanley, Kenneth O and Clune, Jeff},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.06560}\n}\n@article{cook1977detection,\n\ttitle        = {Detection of influential observation in linear regression},\n\tauthor       = {R Dennis Cook},\n\tyear         = 1977,\n\tjournal      = {Technometrics},\n\tvolume       = 19,\n\tpages        = {15--18}\n}\n@article{cook1980influence,\n\ttitle        = {Characterizations of an empirical influence function for detecting influential cases in regression},\n\tauthor       = {R Dennis Cook and Sanford Weisberg},\n\tyear         = 1980,\n\tjournal      = {Technometrics},\n\tvolume       = 22,\n\tpages        = 
{495--508}\n}\n@book{cook1982residuals,\n\ttitle        = {Residuals and influence in regression},\n\tauthor       = {R Dennis Cook and Sanford Weisberg},\n\tyear         = 1982,\n\tpublisher    = {New York: Chapman and Hall}\n}\n@article{cook1986assessment,\n\ttitle        = {Assessment of local influence},\n\tauthor       = {R Dennis Cook},\n\tyear         = 1986,\n\tjournal      = {Journal of the Royal Statistical Society. Series B (Methodological)},\n\tpages        = {133--169}\n}\n@article{cook2011assessing,\n\ttitle        = {Assessing {G}oogle flu trends performance in the {U}nited {S}tates during the 2009 influenza virus {A} ({H1N1}) pandemic},\n\tauthor       = {Samantha Cook and Corrie Conrad and Ashley L. Fowlkes and Matthew H. Mohebbi},\n\tyear         = 2011,\n\tjournal      = {{P}lo{S} one},\n\tvolume       = 6,\n\tnumber       = 8\n}\n@phdthesis{cooper75thesis,\n\ttitle        = {Montague's semantic theory and transformational syntax},\n\tauthor       = {Robin Cooper},\n\tyear         = 1975,\n\tschool       = {University of Massachusetts at Amherst}\n}\n@article{copestake05mrs,\n\ttitle        = {Minimal Recursion Semantics: An introduction},\n\tauthor       = {Ann Copestake and Dan Flickinger and Ivan Sag and Carl Pollard},\n\tyear         = 2005,\n\tjournal      = {Journal of Research on Language and Computation},\n\tvolume       = 3,\n\tpages        = {281--332}\n}\n@inproceedings{CORAL,\n\ttitle        = {Return of Frustratingly Easy Domain Adaptation},\n\tauthor       = {Sun, Baochen and Feng, Jiashi and Saenko, Kate},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence},\n\tlocation     = {Phoenix, Arizona},\n\tpublisher    = {AAAI Press},\n\tseries       = {AAAI'16},\n\tpages        = {2058–2065},\n\tnumpages     = 8\n}\n@inproceedings{corbett2017algorithmic,\n\ttitle        = {Algorithmic decision making and the cost of fairness},\n\tauthor       = {Corbett-Davies, Sam and 
Pierson, Emma and Feller, Avi and Goel, Sharad and Huq, Aziz},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},\n\tpages        = {797--806},\n\torganization = {ACM}\n}\n@article{corbett2018measure,\n\ttitle        = {The measure and mismeasure of fairness: A critical review of fair machine learning},\n\tauthor       = {Sam Corbett-Davies and Sharad Goel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.00023}\n}\n@article{corcoran1998perfect,\n\ttitle        = {Perfect sampling of {H}arris recurrent {M}arkov chains},\n\tauthor       = {JN Corcoran and RL Tweedie},\n\tyear         = 1998,\n\tjournal      = {preprint}\n}\n@article{cordella1995method,\n\ttitle        = {A method for improving classification reliability of multilayer perceptrons},\n\tauthor       = {Luigi Pietro Cordella and Claudio De Stefano and Francesco Tortorella and Mario Vento},\n\tyear         = 1995,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 6,\n\tnumber       = 5,\n\tpages        = {1140--1147}\n}\n@article{cordts2016cityscapes,\n\ttitle        = {The Cityscapes Dataset for Semantic Urban Scene Understanding},\n\tauthor       = {Cordts, Marius and Omran, Mohamed and Ramos, Sebastian and Rehfeld, Timo and Enzweiler, Markus and Benenson, Rodrigo and Franke, Uwe and Roth, Stefan and Schiele, Bernt},\n\tyear         = 2016,\n\tmonth        = jun,\n\tjournal      = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpublisher    = {IEEE},\n\tpages        = {3213--3223},\n\tdoi          = {10.1109/cvpr.2016.350},\n\tisbn         = 9781467388511,\n\turl          = {http://dx.doi.org/10.1109/CVPR.2016.350}\n}\n@inproceedings{coreyes2019guiding,\n\ttitle        = {Guiding Policies with Language via Meta-Learning},\n\tauthor       = 
{John D. Co-Reyes and Abhishek Gupta and Suvansh Sanjeev and Nick Altieri and John DeNero and Pieter Abbeel and Sergey Levine},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{corless1995singular,\n\ttitle        = {The singular value decomposition for polynomial systems},\n\tauthor       = {Robert M Corless and Patrizia M Gianni and Barry M Trager and Stephen M Watt},\n\tyear         = 1995,\n\tbooktitle    = {International Symposium on Symbolic and Algebraic Computation},\n\tpages        = {195--207}\n}\n@inproceedings{corless1997reordered,\n\ttitle        = {A reordered {S}chur factorization method for zero-dimensional polynomial systems with multiple roots},\n\tauthor       = {Corless, R. M. and Gianni, P. M. and Trager, B. M.},\n\tyear         = 1997,\n\tbooktitle    = {Proceedings of the 1997 International Symposium on Symbolic and Algebraic Computation},\n\tpages        = {133--140},\n\torganization = {ACM}\n}\n@article{corless2009symmetries,\n\ttitle        = {Using symmetries in the eigenvalue method for polynomial systems},\n\tauthor       = {Robert M Corless and Karin Gatermann and Ilias S Kotsireas},\n\tyear         = 2009,\n\tjournal      = {Journal of Symbolic Computation},\n\tvolume       = 44,\n\tnumber       = 11,\n\tpages        = {1536--1550}\n}\n@inproceedings{cormack1998efficient,\n\ttitle        = {Efficient Construction of Large Test Collections},\n\tauthor       = {Gordon V Cormack and Christopher R Palmer and Charles L A Clarke},\n\tyear         = 1998,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)}\n}\n@book{cormen2001introduction,\n\ttitle        = {Introduction to Algorithms},\n\tauthor       = {Cormen, Thomas H. and Stein, Clifford and Rivest, Ronald L. 
and Leiserson, Charles E.},\n\tyear         = 2001,\n\tpublisher    = {McGraw-Hill Higher Education},\n\tisbn         = {0070131511},\n\tedition      = {2nd}\n}\n@inproceedings{corney2016million,\n\ttitle        = {What do a million news articles look like?},\n\tauthor       = {David Corney and Dyaa Albakour and Miguel Martinez-Alvarez and Samir Moussa},\n\tyear         = 2016,\n\tbooktitle    = {NewsIR@ ECIR},\n\tpages        = {42--47}\n}\n@inproceedings{coronato2019reinforcement,\n\ttitle        = {A reinforcement learning based intelligent system for the healthcare treatment assistance of patients with disabilities},\n\tauthor       = {Coronato, Antonio and Naeem, Muddasar},\n\tyear         = 2019,\n\tbooktitle    = {International Symposium on Pervasive Systems, Algorithms and Networks},\n\tpages        = {15--28},\n\torganization = {Springer}\n}\n@article{corso2020survey,\n\ttitle        = {A Survey of Algorithms for Black-Box Safety Validation},\n\tauthor       = {Anthony Corso and Robert J. Moss and Mark Koren and R. Lee and Mykel J. 
Kochenderfer},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.02979}\n}\n@article{cortes1995svm,\n\ttitle        = {Support-vector networks},\n\tauthor       = {Corinna Cortes and Vladimir Vapnik},\n\tyear         = 1995,\n\tjournal      = {Machine Learning},\n\tvolume       = 20,\n\tnumber       = 3,\n\tpages        = {273--297}\n}\n@inproceedings{cortes2011domain,\n\ttitle        = {Domain adaptation in regression},\n\tauthor       = {Cortes, Corinna and Mohri, Mehryar},\n\tyear         = 2011,\n\tbooktitle    = {Algorithmic Learning Theory},\n\tpages        = {308--323},\n\torganization = {Springer}\n}\n@article{cortes2014domain,\n\ttitle        = {Domain adaptation and sample bias correction theory and algorithm for regression},\n\tauthor       = {Cortes, Corinna and Mohri, Mehryar},\n\tyear         = 2014,\n\tjournal      = {Theoretical Computer Science},\n\tpublisher    = {Elsevier},\n\tvolume       = 519,\n\tpages        = {103--126}\n}\n@inproceedings{cortes2015adaptation,\n\ttitle        = {Adaptation algorithm and theory based on generalized discrepancy},\n\tauthor       = {Cortes, Corinna and Mohri, Mehryar and Mu{\\~n}oz Medina, Andr{\\'e}s},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},\n\tpages        = {169--178},\n\torganization = {ACM}\n}\n@article{cortes2019adaptation,\n\ttitle        = {Adaptation based on generalized discrepancy},\n\tauthor       = {Cortes, Corinna and Mohri, Mehryar and Medina, Andr{\\'e}s Munoz},\n\tyear         = 2019,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {1--30}\n}\n@article{cortez2008using,\n\ttitle        = {Using data mining to predict secondary school student performance},\n\tauthor       = {Paulo Cortez and Alice Maria Gon{\\c{c}}alves Silva},\n\tyear         = 2008,\n\tjournal      = {Proceedings of 5th FUture BUsiness TEChnology Conference}\n}\n@article{CosinePowerSum,\n\ttitle        = {A Note on Cosine Power Sums},\n\tauthor       = {Mircea Merca},\n\tyear         = 2012,\n\tmonth        = may,\n\tjournal      = {Journal of Integer Sequences},\n\tvolume       = 15,\n\tpages        = {12.5.3}\n}\n@article{cote2012chernoff,\n\ttitle        = {A Chernoff-type lower bound for the {G}aussian {Q}-function},\n\tauthor       = {François D Côté and Ioannis N Psaromiligkos and Warren J Gross},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1202.6483}\n}\n@inproceedings{coulom2006efficient,\n\ttitle        = {Efficient selectivity and backup operators in {M}onte-{C}arlo tree search},\n\tauthor       = {Remi Coulom},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computers and Games},\n\tpages        = {72--83}\n}\n@article{coulom2007computing,\n\ttitle        = {Computing elo ratings of move patterns in the game of go},\n\tauthor       = {Rémi Coulom},\n\tyear         = 2007,\n\tjournal      = {Computer Games Workshop}\n}\n@inproceedings{cour2007solving,\n\ttitle        = {Solving {M}arkov random fields with spectral relaxation},\n\tauthor       = {T. Cour and J. 
Shi},\n\tyear         = 2007,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {75--82}\n}\n@article{courtiol2019deep,\n\ttitle        = {Deep learning-based classification of mesothelioma improves prediction of patient outcome},\n\tauthor       = {Pierre Courtiol and Charles Maussion and Matahi Moarii and Elodie Pronier and Samuel Pilcer and Meriem Sefta and Pierre Manceron and Sylvain Toldo and Mikhail Zaslavskiy and Nolwenn Le Stang and others},\n\tyear         = 2019,\n\tjournal      = {Nature medicine},\n\tvolume       = 25,\n\tnumber       = 10,\n\tpages        = {1519--1525}\n}\n@inproceedings{cousot77abstract,\n\ttitle        = {Abstract interpretation: a unified lattice model for static analysis of programs by construction or approximation of fixpoints},\n\tauthor       = {Patrick Cousot and Radhia Cousot},\n\tyear         = 1977,\n\tbooktitle    = {Principles of Programming Languages (POPL)},\n\tpages        = {238--252}\n}\n@book{cover2012elements,\n\ttitle        = {Elements of information theory},\n\tauthor       = {Cover, Thomas M. 
and Thomas, Joy A.},\n\tyear         = 2012,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@article{covertype,\n\ttitle        = {Comparative accuracies of artificial neural networks and discriminant analysis in predicting forest cover types from cartographic variables},\n\tauthor       = {Blackard, Jock A and Dean, Denis J},\n\tyear         = 1999,\n\tjournal      = {Computers and electronics in agriculture},\n\tpublisher    = {Elsevier},\n\tvolume       = 24,\n\tnumber       = 3,\n\tpages        = {131--151}\n}\n@article{cowles1996markov,\n\ttitle        = {{M}arkov chain {M}onte {C}arlo convergence diagnostics: a comparative review},\n\tauthor       = {Mary Kathryn Cowles and Bradley P Carlin},\n\tyear         = 1996,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 91,\n\tnumber       = 434,\n\tpages        = {883--904}\n}\n@article{cox04pseudo,\n\ttitle        = {A note on pseudolikelihood constructed from marginal densities},\n\tauthor       = {David R. 
Cox and Nancy Reid},\n\tyear         = 2004,\n\tjournal      = {Biometrika},\n\tvolume       = 91,\n\tpages        = {729--737}\n}\n@incollection{cozman2006risks,\n\ttitle        = {Risks of Semi-Supervised Learning: How Unlabeled Data Can Degrade Performance of Generative Classifiers},\n\tauthor       = {Fabio Cozman and Ira Cohen},\n\tyear         = 2006,\n\tbooktitle    = {Semi-Supervised Learning}\n}\n@article{cps18,\n\ttitle        = {Dynamical Isometry and a Mean Field Theory of {RNNs}: Gating Enables Signal Propagation in Recurrent Neural Networks},\n\tauthor       = {Chen, Minmin and Pennington, Jeffrey and Schoenholz, Samuel S.},\n\tyear         = 2018,\n\tjournal      = {arXiv:1806.05394},\n\turl          = {http://arxiv.org/abs/1806.05394}\n}\n@inproceedings{CQ-adaptive-sampling,\n\ttitle        = {Stochastic Dual Coordinate Ascent with Adaptive Probabilities},\n\tauthor       = {Dominik Csiba and Zheng Qu and Peter Richt{\\'{a}}rik},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning, {ICML} 2015, Lille, France, 6-11 July 2015},\n\tpages        = {674--683}\n}\n@inproceedings{CR08,\n\ttitle        = {Learning Mixtures of Product Distributions using Correlations and Independence},\n\tauthor       = {K. Chaudhuri and S. 
Rao},\n\tyear         = 2008,\n\tbooktitle    = {COLT}\n}\n@article{craig1933tchebychef,\n\ttitle        = {On the Tchebychef inequality of Bernstein},\n\tauthor       = {Craig, Cecil C},\n\tyear         = 1933,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpublisher    = {JSTOR},\n\tvolume       = 4,\n\tnumber       = 2,\n\tpages        = {94--102}\n}\n@book{craig2003introduction,\n\ttitle        = {Introduction to Aerodynamics},\n\tauthor       = {Gale Craig},\n\tyear         = 2003,\n\tpublisher    = {Regenerative Press},\n\taddress      = {Anderson, IN},\n\tvolume       = 1,\n\tedition      = {1st}\n}\n@inproceedings{crammar07multiple,\n\ttitle        = {Learning from Multiple Sources},\n\tauthor       = {Koby Crammer and Michael Kearns and Jennifer Wortman},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{crammer06passive,\n\ttitle        = {Online Passive-Aggressive Algorithms},\n\tauthor       = {Koby Crammer and Ofer Dekel and Joseph Keshet and Shai Shalev-Shwartz and Yoram Singer},\n\tyear         = 2006,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 7,\n\tpages        = {551--585}\n}\n@article{crammer2002learnability,\n\ttitle        = {On the learnability and design of output codes for multiclass problems},\n\tauthor       = {Koby Crammer and Yoram Singer},\n\tyear         = 2002,\n\tjournal      = {Machine learning},\n\tvolume       = 47,\n\tnumber       = 2,\n\tpages        = {201--233}\n}\n@inproceedings{craven1999constructing,\n\ttitle        = {Constructing biological knowledge bases by extracting information from text sources},\n\tauthor       = {Mark Craven and Johan Kumlien and others},\n\tyear         = 1999,\n\tbooktitle    = {ISMB},\n\tpages        = {77--86}\n}\n@article{craven78gcv,\n\ttitle        = {Smoothing noisy data with spline functions. 
Estimating the correct degree of smoothing by the method of generalized cross-validation},\n\tauthor       = {P. Craven and G. Wahba},\n\tyear         = 1978,\n\tjournal      = {Numerische Mathematik},\n\tvolume       = 31,\n\tnumber       = 4,\n\tpages        = {377--403}\n}\n@phdthesis{craven96trepan,\n\ttitle        = {Extracting comprehensible models from trained neural networks},\n\tauthor       = {Mark W. Craven},\n\tyear         = 1996,\n\tschool       = {University of Wisconsin at Madison}\n}\n@article{crawford1990learning,\n\ttitle        = {Learning how to cooperate: Optimal play in repeated coordination games},\n\tauthor       = {Vincent P Crawford and Hans Haller},\n\tyear         = 1990,\n\tjournal      = {Econometrica: Journal of the Econometric Society},\n\tpages        = {571--595}\n}\n@article{creager2019flexibly,\n\ttitle        = {Flexibly fair representation learning by disentanglement},\n\tauthor       = {Elliot Creager and David Madras and J{\\\"o}rn-Henrik Jacobsen and Marissa A Weis and Kevin Swersky and Toniann Pitassi and Richard Zemel},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.02589}\n}\n@inproceedings{creager2021environment,\n\ttitle        = {Environment inference for invariant learning},\n\tauthor       = {Elliot Creager and J{\\\"o}rn-Henrik Jacobsen and Richard Zemel},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2189--2200}\n}\n@article{creel2021algorithmic,\n\ttitle        = {The Algorithmic Leviathan: Arbitrariness, Fairness, and Opportunity in Algorithmic Decision Making Systems},\n\tauthor       = {Kathleen Creel and Deborah Hellman},\n\tyear         = 2021,\n\tjournal      = {Virginia Public Law and Legal Theory Research Paper},\n\tvolume       = 13\n}\n@inproceedings{crescenzi2001roadrunner,\n\ttitle        = {Roadrunner: Towards automatic data extraction from large web sites},\n\tauthor       = {Valter Crescenzi and 
Giansalvatore Mecca and Paolo Merialdo and others},\n\tyear         = 2001,\n\tbooktitle    = {VLDB},\n\tvolume       = 1,\n\tpages        = {109--118}\n}\n@inproceedings{cretu2008casting,\n\ttitle        = {Casting out demons: Sanitizing training data for anomaly sensors},\n\tauthor       = {Gabriela F. Cretu and Angelos Stavrou and Michael E. Locasto and Salvatore J. Stolfo and Angelos D. Keromytis},\n\tyear         = 2008,\n\tbooktitle    = {IEEE Symposium on Security and Privacy},\n\tpages        = {81--95}\n}\n@article{crowson2017calibration,\n\ttitle        = {Assessing Calibration of Prognostic Risk Scores},\n\tauthor       = {Cynthia S. Crowson and Elizabeth J. Atkinson and Terry M. Therneau},\n\tyear         = 2017,\n\tjournal      = {Statistical Methods in Medical Research},\n\tvolume       = 25,\n\tpages        = {1692--1706}\n}\n@inproceedings{CRT,\n\ttitle        = {Stable signal recovery from incomplete and inaccurate measurements},\n\tauthor       = {E. Candes and J. Romberg and T. Tao},\n\tyear         = 2006,\n\tbooktitle    = {Communications of Pure and Applied Math},\n\tpages        = {1207--1223}\n}\n@article{CS93,\n\ttitle        = {Blind beamforming for non {G}aussian signals},\n\tauthor       = {J.-F. Cardoso and A. Souloumiac},\n\tyear         = 1993,\n\tjournal      = {IEE Proceedings-F},\n\tvolume       = 140,\n\tnumber       = 6,\n\tpages        = {362--370}\n}\n@inproceedings{CSCFU12,\n\ttitle        = {Spectral Learning of Latent-Variable {PCFG}s},\n\tauthor       = {S. B. Cohen and K. Stratos and M. Collins and D. P. Foster and L. 
Ungar},\n\tyear         = 2012,\n\tbooktitle    = {ACL}\n}\n@article{csiszar04info,\n\ttitle        = {Information Theory and Statistics: A Tutorial},\n\tauthor       = {Imre Csisz\\'ar and Paul Shields},\n\tyear         = 2004,\n\tjournal      = {Foundations and Trends in Communications and Information Theory},\n\tvolume       = 1,\n\tpages        = {417--528}\n}\n@inproceedings{css-missing-data,\n\ttitle        = {Column Subset Selection with Missing Data via Active Sampling},\n\tauthor       = {Wang, Yining and Singh, Aarti},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics}\n}\n@inproceedings{CT,\n\ttitle        = {Decoding by linear programming},\n\tauthor       = {E. Candes and T. Tao},\n\tyear         = 2005,\n\tbooktitle    = {IEEE Trans. on Information Theory},\n\tpages        = {4203--4215}\n}\n@inproceedings{cuayahuitl2015strategic,\n\ttitle        = {Strategic Dialogue Management via Deep Reinforcement Learning},\n\tauthor       = {Heriberto Cuayáhuitl and Simon Keizer and Oliver Lemon},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{cubuk2017intriguing,\n\ttitle        = {Intriguing properties of adversarial examples},\n\tauthor       = {Ekin D Cubuk and Barret Zoph and Samuel S Schoenholz and Quoc V Le},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.02846}\n}\n@inproceedings{cubuk2019autoaugment,\n\ttitle        = {Autoaugment: Learning augmentation policies from data},\n\tauthor       = {Ekin D Cubuk and Barret Zoph and Dandelion Mane and Vijay Vasudevan and Quoc V Le},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{cubuk2020randaugment,\n\ttitle        = {Randaugment: Practical automated data augmentation with a reduced search space},\n\tauthor       = {Cubuk, Ekin D and Zoph, Barret and Shlens, Jonathon and Le, Quoc V},\n\tyear        
 = 2020,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},\n\tpages        = {702--703}\n}\n@article{cuccarese2020functional,\n\ttitle        = {Functional immune mapping with deep-learning enabled phenomics applied to immunomodulatory and {COVID-19} drug discovery},\n\tauthor       = {Michael F Cuccarese and Berton A Earnshaw and Katie Heiser and Ben Fogelson and Chadwick T Davis and Peter F McLean and Hannah B Gordon and Kathleen-Rose Skelly and Fiona L Weathersby and Vlad Rodic and others},\n\tyear         = 2020,\n\tjournal      = {bioRxiv}\n}\n@inproceedings{cui2005dependency,\n\ttitle        = {Question answering passage retrieval using dependency relations},\n\tauthor       = {Hang Cui and Renxu Sun and Keya Li and Min-Yen Kan and Tat-Seng Chua},\n\tyear         = 2005,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {400--407}\n}\n@inproceedings{cui2018large,\n\ttitle        = {Large scale fine-grained categorization and domain-specific transfer learning},\n\tauthor       = {Yin Cui and Yang Song and Chen Sun and Andrew Howard and Serge Belongie},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4109--4118}\n}\n@inproceedings{cui2019class,\n\ttitle        = {Class-balanced loss based on effective number of samples},\n\tauthor       = {Cui, Yin and Jia, Menglin and Lin, Tsung-Yi and Song, Yang and Belongie, Serge},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},\n\tpages        = {9268--9277}\n}\n@article{cule2010logconcave,\n\ttitle        = {Maximum likelihood estimation of a multi-dimensional log-concave density},\n\tauthor       = {Madeleine Cule and Richard Samworth and Michael Stewart},\n\tyear         = 2010,\n\tjournal      = {Journal of the Royal Statistical Society},\n\tvolume       = 73,\n\tpages  
      = {545--603}\n}\n@inproceedings{culotta2005reducing,\n\ttitle        = {Reducing labeling effort for structured prediction tasks},\n\tauthor       = {Aron Culotta and Andrew McCallum},\n\tyear         = 2005,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {746--751}\n}\n@inproceedings{culy2003limits,\n\ttitle        = {The Limits of n-gram Translation Evaluation Metrics},\n\tauthor       = {Christopher Culy and Susanne Z Riehemann},\n\tyear         = 2003,\n\tbooktitle    = {MT Summit IX},\n\tpages        = {71--78}\n}\n@article{culy96null,\n\ttitle        = {Null objects in {E}nglish recipes},\n\tauthor       = {Christopher Culy},\n\tyear         = 1996,\n\tjournal      = {Language Variation and Change},\n\tvolume       = 8,\n\tpages        = {91--124}\n}\n@inproceedings{cur-missing,\n\ttitle        = {{CUR} Algorithm for Partially Observed Matrices},\n\tauthor       = {Xu, Miao and Jin, Rong and Zhou, Zhi-Hua},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{curran2003language,\n\ttitle        = {Language independent {NER} using a maximum entropy tagger},\n\tauthor       = {James R Curran and Stephen Clark},\n\tyear         = 2003,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {164--167}\n}\n@article{curtis2014trust,\n\ttitle        = {A trust region algorithm with a worst-case iteration complexity of {O}({$\\epsilon^{-3/2}$}) for nonconvex optimization},\n\tauthor       = {Curtis, Frank E and Robinson, Daniel P and Samadi, Mohammadreza},\n\tyear         = 2014,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tpages        = {1--32}\n}\n@article{curtis2016trust,\n\ttitle        = {A trust region algorithm with a worst-case iteration complexity of {O}({$\\epsilon^{-3/2}$}) for nonconvex 
optimization},\n\tauthor       = {Curtis, Frank E and Robinson, Daniel P and Samadi, Mohammadreza},\n\tyear         = 2016,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tpages        = {1--32}\n}\n@book{curto1996solution,\n\ttitle        = {Solution of the truncated complex moment problem for flat data},\n\tauthor       = {Ra{\\'u}l E Curto and Lawrence A Fialkow},\n\tyear         = 1996,\n\tpublisher    = {American Mathematical Society},\n\tvolume       = 568\n}\n@book{curto1998flat,\n\ttitle        = {Flat extensions of positive moment matrices: Recursively generated relations},\n\tauthor       = {Ra{\\'u}l E Curto and Lawrence A Fialkow},\n\tyear         = 1998,\n\tpublisher    = {American Mathematical Society},\n\tvolume       = 648\n}\n@article{curto2000truncated,\n\ttitle        = {The truncated complex {K}-moment problem},\n\tauthor       = {Ra{\\'u}l Curto and Lawrence Fialkow},\n\tyear         = 2000,\n\tjournal      = {Transactions of the American mathematical society},\n\tvolume       = 352,\n\tnumber       = 6,\n\tpages        = {2825--2855}\n}\n@article{curto2005truncated,\n\ttitle        = {Truncated {K}-moment problems in several variables},\n\tauthor       = {Ra{\\'u}l E Curto and Lawrence A Fialkow},\n\tyear         = 2005,\n\tjournal      = {arXiv preprint arXiv:math/0507067}\n}\n@article{cvbb14,\n\ttitle        = {On the properties of neural machine translation: Encoder-decoder approaches},\n\tauthor       = {Cho, Kyunghyun and Van Merri{\\\"e}nboer, Bart and Bahdanau, Dzmitry and Bengio, Yoshua},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1409.1259}\n}\n@inproceedings{cvgbbsb14,\n\ttitle        = {Learning phrase representations using RNN encoder-decoder for statistical machine translation},\n\tauthor       = {Cho, Kyunghyun and Van Merri{\\\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},\n\tyear         = 
2014,\n\tjournal      = {arXiv preprint arXiv:1406.1078},\n\tbooktitle    = {EMNLP},\n\tpages        = {1724--1734}\n}\n@article{cybenko1989approximation,\n\ttitle        = {Approximation by superpositions of a sigmoidal function},\n\tauthor       = {Cybenko, George},\n\tyear         = 1989,\n\tjournal      = {Mathematics of control, signals and systems},\n\tpublisher    = {Springer},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {303--314}\n}\n@book{cypher93pbd,\n\ttitle        = {Watch what {I} do: Programming by demonstration},\n\tauthor       = {A. Cypher},\n\tyear         = 1993,\n\tpublisher    = {MIT Press}\n}\n@inproceedings{d16,\n\ttitle        = {Complexity theoretic limitations on learning halfspaces},\n\tauthor       = {Daniely, Amit},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the forty-eighth annual ACM symposium on Theory of Computing (STOC)},\n\tpages        = {105--117},\n\torganization = {ACM}\n}\n@inproceedings{d17,\n\ttitle        = {{SGD} learns the conjugate kernel class of the network},\n\tauthor       = {Daniely, Amit},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpages        = {2422--2430}\n}\n@article{d1963probabilistic,\n\ttitle        = {A probabilistic production and inventory problem},\n\tauthor       = {d'Epenoux, F},\n\tyear         = 1963,\n\tjournal      = {Management Science},\n\tpublisher    = {INFORMS},\n\tvolume       = 10,\n\tnumber       = 1,\n\tpages        = {98--108}\n}\n@inproceedings{d63036efc9d24f07b8908864667e28aa,\n\ttitle        = {A lower bound for the smallest eigenvalue of the Laplacian},\n\tauthor       = {Jeff Cheeger},\n\tyear         = 1969,\n\tbooktitle    = {Proceedings of the Princeton conference in honor of Professor S. 
Bochner},\n\tpages        = {195--199},\n\tlanguage     = {English (US)}\n}\n@article{dafoe2020open,\n\ttitle        = {Open Problems in Cooperative {AI}},\n\tauthor       = {Allan Dafoe and Edward Hughes and Yoram Bachrach and Tantum Collins and Kevin R McKee and Joel Z Leibo and Kate Larson and Thore Graepel},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.08630}\n}\n@incollection{dagan2006pascal,\n\ttitle        = {The {PASCAL} recognising textual entailment challenge},\n\tauthor       = {Ido Dagan and Oren Glickman and Bernardo Magnini},\n\tyear         = 2006,\n\tbooktitle    = {Machine learning challenges. evaluating predictive uncertainty, visual object classification, and recognising tectual entailment},\n\tpages        = {177--190}\n}\n@book{dagan2013rte,\n\ttitle        = {Recognizing Textual Entailment: Models and Applications},\n\tauthor       = {Ido Dagan and Dan Roth and Mark Sammons and Fabio Massimo Zanzotto},\n\tyear         = 2013,\n\tpublisher    = {Morgan and Claypool Publishers}\n}\n@techreport{dahl03splitmerge,\n\ttitle        = {An Improved Merge-Split Sampler for Conjugate {D}irichlet Process Mixture Models},\n\tauthor       = {D. B. Dahl},\n\tyear         = 2003,\n\tinstitution  = {Department of Statistics, University of Wisconsin (U. Wisconsin)}\n}\n@techreport{dahl03univariate,\n\ttitle        = {Modal Clustering in a Univariate Class of Product Partition Models},\n\tauthor       = {D. B. Dahl},\n\tyear         = 2003,\n\tinstitution  = {Department of Statistics, University of Wisconsin (U. 
Wisconsin)}\n}\n@inproceedings{dahl1994expanding,\n\ttitle        = {Expanding the scope of the {ATIS} task: The {ATIS-3} corpus},\n\tauthor       = {Deborah A Dahl and Madeleine Bates and Michael Brown and William Fisher and Kate Hunicke-Smith and David Pallett and Christine Pao and Alexander Rudnicky and Elizabeth Shriberg},\n\tyear         = 1994,\n\tbooktitle    = {Workshop on Human Language Technology},\n\tpages        = {43--48}\n}\n@inproceedings{dai17learning,\n\ttitle        = {Learning from Conditional Distributions via Dual Embeddings},\n\tauthor       = {Bo Dai and Niao He and Yunpeng Pan and Byron Boots and Le Song},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {1458--1467}\n}\n@inproceedings{dai18boosting,\n\ttitle        = {Boosting the Actor with Dual Critic},\n\tauthor       = {Bo Dai and Albert Shaw and Niao He and Lihong Li and Le Song},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 6th International Conference on Learning Representations (ICLR)},\n\tnote         = {arXiv:1712.10282}\n}\n@inproceedings{dai2010decision,\n\ttitle        = {Decision-theoretic control of crowd-sourced workflows},\n\tauthor       = {Peng Dai and Mausam and Daniel S Weld},\n\tyear         = 2010,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{dai2015semi,\n\ttitle        = {Semi-supervised sequence learning},\n\tauthor       = {Andrew M. Dai and Quoc V. 
Le},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{dai2018towards,\n\ttitle        = {Towards Theoretical Understanding of Large Batch Training in Stochastic Gradient Descent},\n\tauthor       = {Dai, Xiaowu and Zhu, Yuhua},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.00542}\n}\n@article{dalal2018safe,\n\ttitle        = {Safe exploration in continuous action spaces},\n\tauthor       = {Dalal, Gal and Dvijotham, Krishnamurthy and Vecerik, Matej and Hester, Todd and Paduraru, Cosmin and Tassa, Yuval},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.08757}\n}\n@article{dalalyan06second,\n\ttitle        = {Penalized maximum likelihood and semiparametric second-order efficiency},\n\tauthor       = {A. S. Dalalyan and G. K. Golubev and A. B. Tsybakov},\n\tyear         = 2006,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 34,\n\tnumber       = 1,\n\tpages        = {169--201}\n}\n@article{dalalyan2017theoretical,\n\ttitle        = {Theoretical guarantees for approximate sampling from smooth and log-concave densities},\n\tauthor       = {Dalalyan, Arnak S},\n\tyear         = 2017,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 79,\n\tnumber       = 3,\n\tpages        = {651--676}\n}\n@inproceedings{dale03navigation,\n\ttitle        = {CORAL: using natural language generation for navigational assistance},\n\tauthor       = {Robert Dale and Sabine Geldof and Jean-Philippe Prost},\n\tyear         = 2003,\n\tbooktitle    = {Australasian computer science conference},\n\tpages        = {35--44}\n}\n@inproceedings{dalvi2004adversarial,\n\ttitle        = {Adversarial Classification},\n\tauthor       = {Nilesh Dalvi and Pedro Domingos and Mausam and Sumit Sanghai and Deepak Verma},\n\tyear         = 2004,\n\tbooktitle    = {International 
Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@article{dalvi2011automatic,\n\ttitle        = {Automatic wrappers for large scale web extraction},\n\tauthor       = {Nilesh Dalvi and Ravi Kumar and Mohamed Soliman},\n\tyear         = 2011,\n\tjournal      = {Proceedings of the VLDB Endowment},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = {219--230}\n}\n@inproceedings{dalvi2012websets,\n\ttitle        = {WebSets: Extracting sets of entities from the web using unsupervised information extraction},\n\tauthor       = {Bhavana Dalvi and William Cohen and Jamie Callan},\n\tyear         = 2012,\n\tbooktitle    = {Web Search and Data Mining (WSDM)},\n\tpages        = {243--252}\n}\n@inproceedings{dalvi2013aggregating,\n\ttitle        = {Aggregating crowdsourced binary ratings},\n\tauthor       = {Nilesh Dalvi and Anirban Dasgupta and Ravi Kumar and Vibhor Rastogi},\n\tyear         = 2013,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {285--294}\n}\n@inproceedings{damen2018kitchens,\n\ttitle        = {Scaling Egocentric Vision: The {EPIC}-{KITCHENS} Dataset},\n\tauthor       = {Dima Damen and Hazel Doughty and Giovanni Maria Farinella and Sanja Fidler and Antonino Furnari and Evangelos Kazakos and Davide Moltisanti and Jonathan Munro and Toby Perrett and Will Price and Michael Wray},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@misc{damian2021label,\n\ttitle        = {Label Noise SGD Provably Prefers Flat Global Minimizers},\n\tauthor       = {Alex Damian and Tengyu Ma and Jason Lee},\n\tyear         = 2021,\n\teprint       = {2106.06530},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{damour2020fairness,\n\ttitle        = {Fairness is not static: deeper understanding of long term fairness via simulation studies},\n\tauthor       = {Alexander D'Amour and Hansa Srinivasan and James Atwood and Pallavi Baljekar and D Sculley and Yoni Halpern},\n\tyear         
= 2020,\n\tbooktitle    = {Proceedings of the 2020 Conference on Fairness, Accountability, and Transparency},\n\tpages        = {525--534}\n}\n@article{damour2020underspecification,\n\ttitle        = {Underspecification Presents Challenges for Credibility in Modern Machine Learning},\n\tauthor       = {Alexander D'Amour and Katherine Heller and Dan Moldovan and Ben Adlam and Babak Alipanahi and Alex Beutel and Christina Chen and Jonathan Deaton and Jacob Eisenstein and Matthew D Hoffman and others},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.03395}\n}\n@article{dan2020scaling,\n\ttitle        = {Scaling out-of-distribution detection for real-world settings},\n\tauthor       = {Dan Hendrycks and Steven Basart and Mantas Mazeika and Mohammadreza Mostajabi and Jacob Steinhardt and Dawn Song},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1911.11132}\n}\n@inproceedings{dang2006overview,\n\ttitle        = {Overview of {DUC} 2006},\n\tauthor       = {Hoa Trang Dang},\n\tyear         = 2006,\n\tbooktitle    = {Document Understanding Conference}\n}\n@article{dang2014randomized,\n\ttitle        = {Randomized first-order methods for saddle point optimization},\n\tauthor       = {Dang, Cong and Lan, Guanghui},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1409.8625}\n}\n@article{dang2016kbp,\n\ttitle        = {Cold Start Knowledge Base Population at {TAC} {KBP} 2016},\n\tauthor       = {Hoa Trang Dang},\n\tyear         = 2016,\n\tjournal      = {Text Analytics Conference}\n}\n@inproceedings{dani2007price,\n\ttitle        = {The price of bandit information for online optimization},\n\tauthor       = {Dani, Varsha and Kakade, Sham M and Hayes, Thomas P},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {345--352}\n}\n@inproceedings{dani2008stochastic,\n\ttitle        = {Stochastic linear optimization under bandit feedback},\n\tauthor       = {Dani, 
Varsha and Hayes, Thomas P and Kakade, Sham M},\n\tyear         = 2008,\n\tbooktitle    = {Conference on Learning Theory}\n}\n@article{daniel2016hierarchical,\n\ttitle        = {Hierarchical relative entropy policy search},\n\tauthor       = {C. Daniel and G. Neumann and O. Kroemer and J. Peters},\n\tyear         = 2016,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 17,\n\tpages        = {3190--3239}\n}\n@article{daniels2010monotone,\n\ttitle        = {Monotone and partially monotone neural networks},\n\tauthor       = {Hennie Daniels and Marina Velikova},\n\tyear         = 2010,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 21,\n\tnumber       = 6,\n\tpages        = {906--917}\n}\n@inproceedings{daniely2016toward,\n\ttitle        = {Toward deeper understanding of neural networks: The power of initialization and a dual view on expressivity},\n\tauthor       = {Daniely, Amit and Frostig, Roy and Singer, Yoram},\n\tyear         = 2016,\n\tbooktitle    = {Advances In Neural Information Processing Systems},\n\tpages        = {2253--2261}\n}\n@article{dann2014policy,\n\ttitle        = {Policy evaluation with temporal differences: a survey and comparison.},\n\tauthor       = {Dann, Christoph and Neumann, Gerhard and Peters, Jan},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {809--883}\n}\n@inproceedings{dann2015sample,\n\ttitle        = {Sample complexity of episodic fixed-horizon reinforcement learning},\n\tauthor       = {Dann, Christoph and Brunskill, Emma},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2818--2826}\n}\n@inproceedings{dann2017unifying,\n\ttitle        = {Unifying {PAC} and Regret: Uniform {PAC} Bounds for Episodic Reinforcement Learning},\n\tauthor       = {Dann, Christoph and Lattimore, Tor and Brunskill, Emma},\n\tyear    
     = 2017,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n\tlocation     = {Long Beach, California, USA},\n\tpublisher    = {Curran Associates Inc.},\n\taddress      = {Red Hook, NY, USA},\n\tseries       = {NIPS'17},\n\tpages        = {5717--5727},\n\tisbn         = 9781510860964,\n\tnumpages     = 11\n}\n@inproceedings{dann2018oracle,\n\ttitle        = {On Oracle-Efficient {PAC}-{RL} with Rich Observations},\n\tauthor       = {Christoph Dann and Nan Jiang and Akshay Krishnamurthy and Alekh Agarwal and John Langford and Robert E. Schapire},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{dann2018polynomial,\n\ttitle        = {On Polynomial Time PAC Reinforcement Learning with Rich Observations},\n\tauthor       = {Dann, Christoph and Jiang, Nan and Krishnamurthy, Akshay and Agarwal, Alekh and Langford, John and Schapire, Robert E},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.00606}\n}\n@inproceedings{dann2019policy,\n\ttitle        = {Policy Certificates: Towards Accountable Reinforcement Learning},\n\tauthor       = {Dann, Christoph and Li, Lihong and Wei, Wei and Brunskill, Emma},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 36th International Conference on Machine Learning},\n\tpages        = {1507--1516}\n}\n@article{danskin1966theory,\n\ttitle        = {The theory of max-min with applications},\n\tauthor       = {John M Danskin},\n\tyear         = 1966,\n\tjournal      = {SIAM Journal on Applied Math}\n}\n@book{dantzig2016linear,\n\ttitle        = {Linear Programming and Extensions},\n\tauthor       = {Dantzig, George},\n\tyear         = 2016,\n\tpublisher    = {Princeton University Press, Princeton, NJ}\n}\n@inproceedings{dao2019kernel,\n\ttitle        = {A kernel theory of modern data augmentation},\n\tauthor       = 
{Dao, Tri and Gu, Albert and Ratner, Alexander and Smith, Virginia and De Sa, Chris and R{\\'e}, Christopher},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1528--1537},\n\torganization = {PMLR}\n}\n@inproceedings{das2009paraphrase,\n\ttitle        = {Paraphrase identification as probabilistic quasi-synchronous recognition},\n\tauthor       = {Dipanjan Das and Noah A Smith},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {468--476}\n}\n@inproceedings{das2017learning,\n\ttitle        = {Learning Cooperative Visual Dialog Agents with Deep Reinforcement Learning},\n\tauthor       = {Abhishek Das and Satwik Kottur and Jos{\\'e} MF Moura and Stefan Lee and Dhruv Batra},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{das2017question,\n\ttitle        = {Question Answering on Knowledge Bases and Text using Universal Schema and Memory Networks},\n\tauthor       = {Rajarshi Das and Manzil Zaheer and Siva Reddy and Andrew McCallum},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{das2017visdial,\n\ttitle        = {Visual Dialog},\n\tauthor       = {Abhishek Das and Satwik Kottur and Khushi Gupta and Avi Singh and Deshraj Yadav and Jos{\\'e} M.F. 
Moura and Devi Parikh and Dhruv Batra},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{das2018go,\n\ttitle        = {Go for a Walk and Arrive at the Answer: Reasoning Over Paths in Knowledge Bases using Reinforcement Learning},\n\tauthor       = {Rajarshi Das and Shehzaad Dhuliawala and Manzil Zaheer and Luke Vilnis and Ishan Durugkar and Akshay Krishnamurthy and Alex Smola and Andrew McCallum},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{das2018modular,\n\ttitle        = {Neural Modular Control for Embodied Question Answering},\n\tauthor       = {Abhishek Das and Georgia Gkioxari and Stefan Lee and Devi Parikh and Dhruv Batra},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@inproceedings{das2019multi,\n\ttitle        = {Multi-step Retriever-Reader Interaction for Scalable Open-domain Question Answering},\n\tauthor       = {Rajarshi Das and Shehzaad Dhuliawala and Manzil Zaheer and Andrew McCallum},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{Das99,\n\ttitle        = {Learning Mixtures of {G}aussians},\n\tauthor       = {S. Dasgupta},\n\tyear         = 1999,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{dasari2019robonet,\n\ttitle        = {RoboNet: Large-Scale Multi-Robot Learning},\n\tauthor       = {Sudeep Dasari and F. Ebert and Stephen Tian and Suraj Nair and Bernadette Bucher and K. 
Schmeckpeper and Siddharth Singh and Sergey Levine and Chelsea Finn},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@inproceedings{Dasgupta:GaussianMixture,\n\ttitle        = {Learning mixtures of Gaussians},\n\tauthor       = {Sanjoy Dasgupta},\n\tyear         = 1999,\n\tbooktitle    = {FOCS}\n}\n@article{dasgupta07em,\n\ttitle        = {A Probabilistic Analysis of {EM} for Mixtures of Separated, Spherical {G}aussians},\n\tauthor       = {Sanjoy Dasgupta and Leonard Schulman},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 8\n}\n@inproceedings{dasgupta1999learning,\n\ttitle        = {Learning mixtures of Gaussians},\n\tauthor       = {Dasgupta, Sanjoy},\n\tyear         = 1999,\n\tbooktitle    = {Foundations of Computer Science, 1999. 40th Annual Symposium on},\n\tpages        = {634--644},\n\torganization = {IEEE}\n}\n@article{dasgupta2002pac,\n\ttitle        = {PAC generalization bounds for co-training},\n\tauthor       = {Dasgupta, Sanjoy and Littman, Michael L and McAllester, David},\n\tyear         = 2002,\n\tjournal      = {Advances in neural information processing systems},\n\tpublisher    = {MIT; 1998},\n\tvolume       = 1,\n\tpages        = {375--382}\n}\n@article{dasgupta2003elementary,\n\ttitle        = {An elementary proof of a theorem of Johnson and Lindenstrauss},\n\tauthor       = {Dasgupta, Sanjoy and Gupta, Anupam},\n\tyear         = 2003,\n\tjournal      = {Random Structures \\& Algorithms},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {60--65}\n}\n@inproceedings{dasgupta2009mine,\n\ttitle        = {Mine the easy, classify the hard: a semi-supervised approach to automatic sentiment classification},\n\tauthor       = {Sajib Dasgupta and Vincent Ng},\n\tyear         = 2009,\n\tbooktitle    = {Conference on Natural Language Processing (KONVENS)},\n\tpages        = 
{701--709}\n}\n@inproceedings{dasgupta2012consistency,\n\ttitle        = {Consistency of nearest neighbor classification under selective sampling},\n\tauthor       = {Dasgupta, Sanjoy},\n\tyear         = 2012,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {18--1}\n}\n@inproceedings{dasgupta2013crowdsourced,\n\ttitle        = {Crowdsourced judgement elicitation with endogenous proficiency},\n\tauthor       = {Anirban Dasgupta and Arpita Ghosh},\n\tyear         = 2013,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {319--330}\n}\n@inproceedings{dasgupta99mixgauss,\n\ttitle        = {Learning mixtures of {G}aussians},\n\tauthor       = {S. Dasgupta},\n\tyear         = 1999,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@inproceedings{Daskalakis2011,\n\ttitle        = {{Near-optimal no-regret algorithms for zero-sum games}},\n\tauthor       = {Daskalakis, Constantinos and Deckelbaum, Alan and Kim, Anthony},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the Twenty-Second Annual ACM-SIAM Symposium on Discrete Algorithms - SODA '11},\n\tpages        = {235--254},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Daskalakis, Deckelbaum, Kim - 2011 - Near-optimal no-regret algorithms for zero-sum games.pdf:pdf},\n\tmendeley-groups = {Game Theory/Zero-sum Games}\n}\n@article{daskalakis2016ten,\n\ttitle        = {Ten steps of {EM} suffice for mixtures of two Gaussians},\n\tauthor       = {Daskalakis, Constantinos and Tzamos, Christos and Zampetakis, Manolis},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.00368}\n}\n@article{daspremont2008smooth,\n\ttitle        = {Smooth optimization with approximate gradient},\n\tauthor       = {Alexandre d'Aspremont},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 19,\n\tnumber       = 3,\n\tpages        = 
{1171--1183}\n}\n@misc{dataset2017era5,\n\ttitle        = {{ERA5}: Fifth generation of {ECMWF} atmospheric reanalyses of the global climate},\n\tauthor       = {C3S},\n\tyear         = 2017\n}\n@inproceedings{dathathri2020sdp,\n\ttitle        = {Enabling certification of verification-agnostic networks via memory-efficient semidefinite programming},\n\tauthor       = {Sumanth Dathathri and Krishnamurthy Dvijotham and Alexey Kurakin and Aditi Raghunathan and Jonathan Uesato and Rudy Bunel and Shreya Shankar and Jacob Steinhardt and Ian Goodfellow and Percy Liang and Pushmeet Kohli},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{datta2016algorithmic,\n\ttitle        = {Algorithmic transparency via quantitative input influence: Theory and experiments with learning systems},\n\tauthor       = {Anupam Datta and Shayak Sen and Yair Zick},\n\tyear         = 2016,\n\tbooktitle    = {Security and Privacy (SP), 2016 IEEE Symposium on},\n\tpages        = {598--617}\n}\n@book{dattorro05convexoptimization,\n\ttitle        = {Convex Optimization and Euclidean Distance Geometry},\n\tauthor       = {Jon Dattorro},\n\tyear         = 2005,\n\tpublisher    = {Meboo}\n}\n@inproceedings{daume04alignment,\n\ttitle        = {Phrase-Based {HMM} Approach to Document/Abstract Alignment},\n\tauthor       = {Hal {Daum{\\'e} III} and Daniel Marcu},\n\tyear         = 2004,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{daume05clustering,\n\ttitle        = {A {B}ayesian Model for Supervised Clustering with the {D}irichlet Process Prior},\n\tauthor       = {Hal {Daum{\\'e} III} and Daniel Marcu},\n\tyear         = 2005,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 1,\n\tpages        = {1--48}\n}\n@inproceedings{daume06sum,\n\ttitle        = {{B}ayesian Query-Focused Summarization},\n\tauthor       = {Hal {Daum{\\'e} III} and Daniel 
Marcu},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{daume07astar,\n\ttitle        = {Fast search for {D}irichlet process mixture models},\n\tauthor       = {Hal {Daum{\\'e} III}},\n\tyear         = 2007,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{daume07easyadapt,\n\ttitle        = {Frustratingly Easy Domain Adaptation},\n\tauthor       = {Hal {Daum{\\'e} III}},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{daume09searn,\n\ttitle        = {Search-based Structured Prediction},\n\tauthor       = {Hal {Daum{\\'e} III} and John Langford and Daniel Marcu},\n\tyear         = 2009,\n\tjournal      = {Machine Learning},\n\tvolume       = 75,\n\tpages        = {297--325}\n}\n@inproceedings{daume10easyss,\n\ttitle        = {Frustratingly Easy Semi-Supervised Domain Adaptation},\n\tauthor       = {Hal {Daum{\\'e} III} and Abhishek Kumar and Avishek Saha},\n\tyear         = 2010,\n\tbooktitle    = {Workshop on Domain Adaptation for NLP}\n}\n@inproceedings{dauphin2014identifying,\n\ttitle        = {Identifying and attacking the saddle point problem in high-dimensional non-convex optimization},\n\tauthor       = {Dauphin, Yann N and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Yoshua},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2933--2941}\n}\n@article{dauphin2016language,\n\ttitle        = {Language modeling with gated convolutional networks},\n\tauthor       = {Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.08083}\n}\n@article{davenport20141,\n\ttitle        = {1-bit matrix completion},\n\tauthor       = {Davenport, Mark A and Plan, Yaniv and 
van den Berg, Ewout and Wootters, Mary},\n\tyear         = 2014,\n\tjournal      = {Information and Inference},\n\tpublisher    = {Oxford University Press},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {189--223}\n}\n@article{davidson2001local,\n\ttitle        = {Local operator theory, random matrices and Banach spaces},\n\tauthor       = {Davidson, Kenneth R and Szarek, Stanislaw J},\n\tyear         = 2001,\n\tjournal      = {Handbook of the geometry of Banach spaces},\n\tvolume       = 1,\n\tnumber       = {317-366},\n\tpages        = 131\n}\n@article{davidson2018,\n\ttitle        = {Hyperspherical Variational Auto-Encoders},\n\tauthor       = {Tim R. Davidson and Luca Falorsi and Nicola De Cao and Thomas Kipf and Jakub M. Tomczak},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.00891}\n}\n@inproceedings{davies1987logical,\n\ttitle        = {A logical approach to reasoning by analogy},\n\tauthor       = {Todd R. Davies and Stuart J. Russell},\n\tyear         = 1987,\n\tbooktitle    = {In IJCAI-87},\n\tpublisher    = {Morgan Kaufmann},\n\tpages        = {264--270}\n}\n@misc{davies2008COCA,\n\ttitle        = {The Corpus of Contemporary {A}merican {E}nglish ({COCA}): One billion words, 1990-2019},\n\tauthor       = {Mark Davies},\n\tyear         = 2008,\n\thowpublished = {\\url{https://www.english-corpora.org/coca/}}\n}\n@article{davis1997adaptive,\n\ttitle        = {Adaptive greedy approximations},\n\tauthor       = {Davis, Geoff and Mallat, Stephane and Avellaneda, Marco},\n\tyear         = 1997,\n\tjournal      = {Constructive approximation},\n\tpublisher    = {Springer},\n\tvolume       = 13,\n\tnumber       = 1,\n\tpages        = {57--98},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@misc{davis2003psycholinguistic,\n\ttitle        = {Psycholinguistic evidence on scrambled letters in reading},\n\tauthor       = {Matt Davis},\n\tyear         = 2003,\n\thowpublished = 
{\\url{https://www.mrc-cbu.cam.ac.uk/}}\n}\n@article{davis2017calibration,\n\ttitle        = {Calibration drift in regression and machine learning models for acute kidney injury},\n\tauthor       = {Sharon E Davis and Thomas A Lasko and Guanhua Chen and Edward D Siew and Michael E Matheny},\n\tyear         = 2017,\n\tjournal      = {Journal of the American Medical Informatics Association},\n\tvolume       = 24,\n\tnumber       = 6,\n\tpages        = {1052--1061}\n}\n@article{davis2018stochastic,\n\ttitle        = {Stochastic subgradient method converges on tame functions},\n\tauthor       = {Davis, Damek and Drusvyatskiy, Dmitriy and Kakade, Sham and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.07795},\n\tpublisher    = {Springer},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {119--154}\n}\n@inproceedings{davis2021catformer,\n\ttitle        = {Catformer: Designing Stable Transformers via Sensitivity Analysis},\n\tauthor       = {Jared Quincy Davis and Albert Gu and Krzysztof Choromanski and Tri Dao and Christopher Re and Chelsea Finn and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{DavisKahan,\n\ttitle        = {The rotation of eigenvectors by a perturbation. {III}},\n\tauthor       = {Davis, Chandler and Kahan, William Morton},\n\tyear         = 1970,\n\tjournal      = {SIAM Journal on Numerical Analysis},\n\tpublisher    = {SIAM},\n\tvolume       = 7,\n\tnumber       = 1,\n\tpages        = {1--46}\n}\n@article{dawid1979maximum,\n\ttitle        = {Maximum likelihood estimation of observer error-rates using the {EM} algorithm},\n\tauthor       = {Alexander Philip Dawid and Allan M. Skene},\n\tyear         = 1979,\n\tjournal      = {Applied Statistics},\n\tvolume       = 1,\n\tpages        = {20--28}\n}\n@article{dawid1982well,\n\ttitle        = {The Well-Calibrated {B}ayesian},\n\tauthor       = {Dawid, A. 
Philip},\n\tyear         = 1982,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 77,\n\tnumber       = 379,\n\tpages        = {605--610}\n}\n@article{dawid1984prequential,\n\ttitle        = {Present position and potential developments: Some personal views: Statistical theory: The prequential approach},\n\tauthor       = {A Philip Dawid},\n\tyear         = 1984,\n\tjournal      = {Journal of the Royal Statistical Society. Series A (General)},\n\tvolume       = 147,\n\tpages        = {278--292}\n}\n@article{dawid1984present,\n\ttitle        = {Present position and potential developments: Some personal views statistical theory the prequential approach},\n\tauthor       = {Dawid, A Philip},\n\tyear         = 1984,\n\tjournal      = {Journal of the Royal Statistical Society: Series A (General)},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 147,\n\tnumber       = 2,\n\tpages        = {278--290}\n}\n@article{dawid1985calibration,\n\ttitle        = {Calibration-based empirical probability},\n\tauthor       = {Dawid, A Philip},\n\tyear         = 1985,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {JSTOR},\n\tpages        = {1251--1274}\n}\n@article{dawid1999prequential,\n\ttitle        = {Prequential probability: Principles and properties},\n\tauthor       = {Dawid, A Philip and Vovk, Vladimir G and others},\n\tyear         = 1999,\n\tjournal      = {Bernoulli},\n\tpublisher    = {Bernoulli Society for Mathematical Statistics and Probability},\n\tvolume       = 5,\n\tnumber       = 1,\n\tpages        = {125--162}\n}\n@article{dawid2014theory,\n\ttitle        = {Theory and applications of proper scoring rules},\n\tauthor       = {Dawid, Alexander Philip and Musio, Monica},\n\tyear         = 2014,\n\tjournal      = {Metron},\n\tpublisher    = {Springer},\n\tvolume       = 72,\n\tnumber       = 2,\n\tpages        = {169--183}\n}\n@article{day1969estimating,\n\ttitle        = {Estimating the 
Components of a Mixture of Normal Distributions},\n\tauthor       = {N E Day},\n\tyear         = 1969,\n\tjournal      = {Biometrika},\n\tvolume       = 56,\n\tnumber       = 3,\n\tpages        = {463--474}\n}\n@inproceedings{DBLP:conf/acl/MaasDPHNP11,\n\ttitle        = {Learning Word Vectors for Sentiment Analysis},\n\tauthor       = {Andrew L. Maas and Raymond E. Daly and Peter T. Pham and Dan Huang and Andrew Y. Ng and Christopher Potts},\n\tyear         = 2011,\n\tbooktitle    = {The 49th Annual Meeting of the Association for Computational Linguistics}\n}\n@proceedings{DBLP:conf/aistats/2015,\n\ttitle        = {Proceedings of the Eighteenth International Conference on Artificial Intelligence and Statistics, {AISTATS} 2015, San Diego, California, USA, May 9-12, 2015},\n\tyear         = 2015,\n\tpublisher    = {JMLR.org},\n\tseries       = {{JMLR} Workshop and Conference Proceedings},\n\tvolume       = 38,\n\turl          = {http://jmlr.org/proceedings/papers/v38/},\n\teditor       = {Guy Lebanon and S. V. N. 
Vishwanathan},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:16 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/aistats/2015},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/colt/0002N15,\n\ttitle        = {Fast Exact Matrix Completion with Finite Samples},\n\tauthor       = {Prateek Jain and Praneeth Netrapalli},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of The 28th Conference on Learning Theory, {COLT} 2015, Paris, France, July 3-6, 2015},\n\tpages        = {1007--1034},\n\turl          = {http://jmlr.org/proceedings/papers/v40/Jain15.html},\n\tcrossref     = {DBLP:conf/colt/2015},\n\ttimestamp    = {Mon, 06 Jul 2015 08:31:46 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/colt/0002N15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/colt/LeeSJR16,\n\ttitle        = {Gradient Descent Only Converges to Minimizers},\n\tauthor       = {Jason D. Lee and Max Simchowitz and Michael I. 
Jordan and Benjamin Recht},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 29th Conference on Learning Theory, {COLT} 2016, New York, USA, June 23-26, 2016},\n\tpages        = {1246--1257},\n\turl          = {http://jmlr.org/proceedings/papers/v49/lee16.html},\n\tcrossref     = {DBLP:conf/colt/2016},\n\ttimestamp    = {Wed, 13 Jul 2016 17:28:13 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/colt/LeeSJR16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/eccv/HeZRS16,\n\ttitle        = {Identity Mappings in Deep Residual Networks},\n\tauthor       = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part {IV}},\n\tpages        = {630--645},\n\tdoi          = {10.1007/978-3-319-46493-0_38},\n\turl          = {http://dx.doi.org/10.1007/978-3-319-46493-0_38},\n\tcrossref     = {DBLP:conf/eccv/2016-4},\n\ttimestamp    = {Tue, 20 Sep 2016 08:40:38 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/eccv/HeZRS16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/emnlp/PenningtonSM14,\n\ttitle        = {Glove: Global Vectors for Word Representation},\n\tauthor       = {Jeffrey Pennington and Richard Socher and Christopher D. 
Manning},\n\tyear         = 2014,\n\tjournal      = {Proceedings of the Empiricial Methods in Natural Language Processing},\n\tbooktitle    = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing(EMNLP), 2014},\n\tpages        = {1532--1543},\n\turl          = {http://aclweb.org/anthology/D/D14/D14-1162.pdf},\n\tcrossref     = {DBLP:conf/emnlp/2014},\n\ttimestamp    = {Sat, 15 Nov 2014 14:45:18 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/emnlp/PenningtonSM14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@proceedings{DBLP:conf/focs/1999,\n\ttitle        = {40th Annual Symposium on Foundations of Computer Science, FOCS '99, 17-18 October, 1999, New York, NY, USA},\n\tyear         = 1999,\n\tbooktitle    = {FOCS},\n\tpublisher    = {IEEE Computer Society},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{DBLP:conf/focs/Dasgupta99,\n\ttitle        = {Learning Mixtures of Gaussians},\n\tauthor       = {Sanjoy Dasgupta},\n\tyear         = 1999,\n\tbooktitle    = {FOCS},\n\tpages        = {634--644},\n\tee           = {http://doi.ieeecomputersociety.org/10.1109/SFFCS.1999.814639},\n\tcrossref     = {DBLP:conf/focs/1999},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@proceedings{DBLP:conf/icml/2008,\n\ttitle        = {Machine Learning, Proceedings of the Twenty-Fifth International Conference (ICML 2008), Helsinki, Finland, June 5-9, 2008},\n\tyear         = 2008,\n\tbooktitle    = {ICML},\n\tpublisher    = {ACM},\n\tseries       = {ACM International Conference Proceeding Series},\n\tvolume       = 307,\n\teditor       = {William W. Cohen and Andrew McCallum and Sam T. 
Roweis},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@proceedings{DBLP:conf/icml/2013,\n\ttitle        = {Proceedings of the 30th International Conference on Machine Learning, {ICML} 2013, Atlanta, GA, USA, 16-21 June 2013},\n\tyear         = 2013,\n\tpublisher    = {JMLR.org},\n\tseries       = {{JMLR} Proceedings},\n\tvolume       = 28,\n\turl          = {http://jmlr.org/proceedings/papers/v28/},\n\ttimestamp    = {Thu, 11 Sep 2014 07:28:55 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/2013},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/icml/IoffeS15,\n\ttitle        = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},\n\tauthor       = {Sergey Ioffe and Christian Szegedy},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1502.03167},\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning, {ICML} 2015, Lille, France, 6-11 July 2015},\n\tpages        = {448--456},\n\turl          = {http://jmlr.org/proceedings/papers/v37/ioffe15.html},\n\tcrossref     = {DBLP:conf/icml/2015},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:15 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/IoffeS15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/icml/NairH10,\n\ttitle        = {Rectified Linear Units Improve Restricted Boltzmann Machines},\n\tauthor       = {Vinod Nair and Geoffrey E. 
Hinton},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 27th International Conference on Machine Learning (ICML-10), June 21-24, 2010, Haifa, Israel},\n\tpages        = {807--814},\n\turl          = {http://www.icml2010.org/papers/432.pdf},\n\tcrossref     = {DBLP:conf/icml/2010},\n\ttimestamp    = {Fri, 12 Jun 2015 19:15:11 +0200},\n\tbiburl       = {http://dblp2.uni-trier.de/rec/bib/conf/icml/NairH10},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/icml/SaRO15,\n\ttitle        = {Global Convergence of Stochastic Gradient Descent for Some Non-convex Matrix Problems},\n\tauthor       = {Christopher De Sa and Christopher R{\\'{e}} and Kunle Olukotun},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning, {ICML} 2015, Lille, France, 6-11 July 2015},\n\tpages        = {2332--2341},\n\turl          = {http://jmlr.org/proceedings/papers/v37/sa15.html},\n\tcrossref     = {DBLP:conf/icml/2015},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:16 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/SaRO15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/icml/VincentLBM08,\n\ttitle        = {Extracting and composing robust features with denoising autoencoders},\n\tauthor       = {Pascal Vincent and Hugo Larochelle and Yoshua Bengio and Pierre-Antoine Manzagol},\n\tyear         = 2008,\n\tbooktitle    = {ICML},\n\tpages        = {1096--1103},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://doi.acm.org/10.1145/1390156.1390294},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.25}\n}\n@inproceedings{DBLP:conf/nips/ArgyriouEP06,\n\ttitle        = {Multi-Task Feature Learning},\n\tauthor       = {Andreas Argyriou and Theodoros Evgeniou and Massimiliano Pontil},\n\tyear         = 2006,\n\tbooktitle    = {NIPS},\n\tpages        = {41--48},\n\tee           = 
{http://books.nips.cc/papers/files/nips19/NIPS2006_0251.pdf},\n\tcrossref     = {DBLP:conf/nips/2006},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{DBLP:conf/nips/LevyG14,\n\ttitle        = {Neural Word Embedding as Implicit Matrix Factorization},\n\tauthor       = {Omer Levy and Yoav Goldberg},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS), 2014},\n\turl          = {http://papers.nips.cc/paper/5477-neural-word-embedding-as-implicit-matrix-factorization},\n\tcrossref     = {DBLP:conf/nips/2014},\n\ttimestamp    = {Wed, 10 Dec 2014 21:34:12 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/LevyG14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/nips/MaronLB10,\n\ttitle        = {Sphere Embedding: An Application to Part-of-Speech Induction},\n\tauthor       = {Yariv Maron and Michael Lamar and Elie Bienenstock},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@inproceedings{DBLP:conf/nips/MaW15,\n\ttitle        = {Sum-of-Squares Lower Bounds for Sparse {PCA}},\n\tauthor       = {Tengyu Ma and Avi Wigderson},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada},\n\tpages        = {1612--1620},\n\turl          = {http://papers.nips.cc/paper/5724-sum-of-squares-lower-bounds-for-sparse-pca},\n\tcrossref     = {DBLP:conf/nips/2015},\n\ttimestamp    = {Fri, 08 Apr 2016 19:32:52 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/MaW15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/nips/SutskeverVL14,\n\ttitle        = {Sequence to Sequence Learning with Neural Networks},\n\tauthor       = {Ilya Sutskever and Oriol Vinyals and Quoc V. 
Le},\n\tyear         = 2014,\n\tbooktitle    = {Proc.~$27$th NIPS},\n\tpages        = {3104--3112},\n\turl          = {http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks},\n\ttimestamp    = {Wed, 10 Dec 2014 21:34:12 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/SutskeverVL14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@proceedings{DBLP:conf/soda/2008,\n\ttitle        = {Proceedings of the Nineteenth Annual ACM-SIAM Symposium on Discrete Algorithms, SODA 2008, San Francisco, California, USA, January 20-22, 2008},\n\tyear         = 2008,\n\tbooktitle    = {SODA},\n\tpublisher    = {SIAM},\n\teditor       = {Shang-Hua Teng},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://dl.acm.org/citation.cfm?id=1347082}\n}\n@inproceedings{DBLP:conf/soda/Indyk08,\n\ttitle        = {Explicit constructions for compressed sensing of sparse signals},\n\tauthor       = {Piotr Indyk},\n\tyear         = 2008,\n\tbooktitle    = {SODA},\n\tpages        = {30--33},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tcrossref     = {DBLP:conf/soda/2008},\n\tee           = {http://dl.acm.org/citation.cfm?id=1347082.1347086}\n}\n@inproceedings{DBLP:conf/stoc/BarakKS15,\n\ttitle        = {Dictionary Learning and Tensor Decomposition via the Sum-of-Squares Method},\n\tauthor       = {Boaz Barak and Jonathan A. 
Kelner and David Steurer},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Forty-Seventh Annual {ACM} on Symposium on Theory of Computing, {STOC} 2015, Portland, OR, USA, June 14-17, 2015},\n\tpages        = {143--151},\n\tdoi          = {10.1145/2746539.2746605},\n\turl          = {http://doi.acm.org/10.1145/2746539.2746605},\n\tcrossref     = {DBLP:conf/stoc/2015},\n\ttimestamp    = {Wed, 10 Jun 2015 17:20:57 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/stoc/BarakKS15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{DBLP:conf/stoc/HopkinsSSS16,\n\ttitle        = {Fast spectral algorithms from sum-of-squares proofs: tensor decomposition and planted sparse vectors},\n\tauthor       = {Samuel B. Hopkins and Tselil Schramm and Jonathan Shi and David Steurer},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 48th Annual {ACM} {SIGACT} Symposium on Theory of Computing, {STOC} 2016, Cambridge, MA, USA, June 18-21, 2016},\n\tpages        = {178--191},\n\tdoi          = {10.1145/2897518.2897529},\n\turl          = {http://doi.acm.org/10.1145/2897518.2897529},\n\tcrossref     = {DBLP:conf/stoc/2016},\n\ttimestamp    = {Fri, 10 Jun 2016 10:47:01 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/stoc/HopkinsSSS16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/corr/AgarwalA0NT13,\n\ttitle        = {Learning Sparsely Used Overcomplete Dictionaries via Alternating Minimization},\n\tauthor       = {Alekh Agarwal and Animashree Anandkumar and Prateek Jain and Praneeth Netrapalli and Rashish Tandon},\n\tyear         = 2013,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1310.7991},\n\tee           = {http://arxiv.org/abs/1310.7991},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{DBLP:journals/corr/AgarwalAN13,\n\ttitle        = {Exact Recovery of Sparsely Used Overcomplete Dictionaries},\n\tauthor       = 
{Alekh Agarwal and Animashree Anandkumar and Praneeth Netrapalli},\n\tyear         = 2013,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1309.1952},\n\tee           = {http://arxiv.org/abs/1309.1952},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{DBLP:journals/corr/AroraLM15,\n\ttitle        = {Why are deep nets reversible: {A} simple theory, with implications for training},\n\tauthor       = {Sanjeev Arora and Yingyu Liang and Tengyu Ma},\n\tyear         = 2015,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1511.05653},\n\turl          = {http://arxiv.org/abs/1511.05653},\n\ttimestamp    = {Tue, 01 Dec 2015 19:22:34 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/AroraLM15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/corr/CohenS16,\n\ttitle        = {Convolutional Rectifier Networks as Generalized Tensor Decompositions},\n\tauthor       = {Nadav Cohen and Amnon Shashua},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1603.00162},\n\turl          = {http://arxiv.org/abs/1603.00162},\n\tarchiveprefix = {arXiv},\n\teprint       = {1603.00162},\n\ttimestamp    = {Wed, 07 Jun 2017 14:41:05 +0200},\n\tbiburl       = {http://dblp.org/rec/bib/journals/corr/CohenS16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/corr/GeM15,\n\ttitle        = {Decomposing Overcomplete 3rd Order Tensors using Sum-of-Squares Algorithms},\n\tauthor       = {Rong Ge and Tengyu Ma},\n\tyear         = 2015,\n\tmonth        = apr,\n\tjournal      = {CoRR},\n\tbooktitle    = {Approximation, Randomization, and Combinatorial Optimization. 
Algorithms and Techniques (APPROX/RANDOM), 2015},\n\tvolume       = {abs/1504.05287},\n\tdoi          = {10.4230/LIPIcs.APPROX-RANDOM.2015.829},\n\turl          = {http://arxiv.org/abs/1504.05287},\n\ttimestamp    = {Sat, 02 May 2015 17:50:32 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/GeM15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tcrossref     = {DBLP:conf/approx/2015}\n}\n@article{DBLP:journals/corr/HardtM16,\n\ttitle        = {Identity Matters in Deep Learning},\n\tauthor       = {Moritz Hardt and Tengyu Ma},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tbooktitle    = {5th International Conference on Learning Representations (ICLR 2017)},\n\tvolume       = {abs/1611.04231},\n\turl          = {http://arxiv.org/abs/1611.04231},\n\ttimestamp    = {Thu, 01 Dec 2016 19:32:08 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/HardtM16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\talteditor    = {editor}\n}\n@article{DBLP:journals/corr/HardtMR16,\n\ttitle        = {Gradient Descent Learns Linear Dynamical Systems.},\n\tauthor       = {Moritz Hardt and Tengyu Ma and Benjamin Recht},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1609.05191},\n\tnumber       = 29,\n\tpages        = {1--44},\n\turl          = {http://arxiv.org/abs/1609.05191},\n\ttimestamp    = {Mon, 03 Oct 2016 17:51:10 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/HardtMR16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/corr/HardtP14,\n\ttitle        = {Sharp bounds for learning a mixture of two gaussians},\n\tauthor       = {Moritz Hardt and Eric Price},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1404.4997},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = 
{http://arxiv.org/abs/1404.4997}\n}\n@article{DBLP:journals/corr/HuangLW16a,\n\ttitle        = {Densely Connected Convolutional Networks},\n\tauthor       = {Gao Huang and Zhuang Liu and Kilian Q. Weinberger},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tbooktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},\n\tvolume       = {abs/1608.06993},\n\tpages        = {4700--4708},\n\turl          = {http://arxiv.org/abs/1608.06993},\n\ttimestamp    = {Fri, 02 Sep 2016 17:46:24 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/HuangLW16a},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/corr/ZhangPS17,\n\ttitle        = {Electron-Proton Dynamics in Deep Learning},\n\tauthor       = {Qiuyi Zhang and Rina Panigrahy and Sushant Sachdeva},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1702.00458},\n\turl          = {http://arxiv.org/abs/1702.00458},\n\ttimestamp    = {Wed, 07 Jun 2017 14:43:10 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/ZhangPS17},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/ijcv/RussakovskyDSKS15,\n\ttitle        = {ImageNet Large Scale Visual Recognition Challenge},\n\tauthor       = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael S. Bernstein and Alexander C. 
Berg and Fei{-}Fei Li},\n\tyear         = 2015,\n\tjournal      = {International Journal of Computer Vision},\n\tvolume       = 115,\n\tnumber       = 3,\n\tpages        = {211--252},\n\tdoi          = {10.1007/s11263-015-0816-y},\n\turl          = {http://dx.doi.org/10.1007/s11263-015-0816-y},\n\ttimestamp    = {Thu, 12 Nov 2015 16:51:37 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/ijcv/RussakovskyDSKS15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\teprint       = {arXiv:1409.0575},\n\tdate-modified = {2018-07-26 04:14:08 +0000},\n\tbdsk-url-1   = {http://dx.doi.org/10.1007/s11263-015-0816-y}\n}\n@article{DBLP:journals/jcss/Raghavan88,\n\ttitle        = {Probabilistic Construction of Deterministic Algorithms: Approximating Packing Integer Programs},\n\tauthor       = {Prabhakar Raghavan},\n\tyear         = 1988,\n\tjournal      = {J. Comput. Syst. Sci.},\n\tvolume       = 37,\n\tnumber       = 2,\n\tpages        = {130--143},\n\turl          = {http://www.cc.gatech.edu/~mihail/Rag88.pdf},\n\tee           = {http://dx.doi.org/10.1016/0022-0000(88)90003-7},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{DBLP:journals/jmlr/LohW15,\n\ttitle        = {Regularized M-estimators with nonconvexity: statistical and algorithmic theory for local optima},\n\tauthor       = {Po{-}Ling Loh and Martin J. Wainwright},\n\tyear         = 2015,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 16,\n\tpages        = {559--616},\n\turl          = {http://dl.acm.org/citation.cfm?id=2789291},\n\ttimestamp    = {Thu, 11 Feb 2016 17:46:04 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/jmlr/LohW15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{DBLP:journals/jmlr/SpielmanWW12,\n\ttitle        = {Exact Recovery of Sparsely-Used Dictionaries},\n\tauthor       = {Daniel A. 
Spielman and Huan Wang and John Wright},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research - Proceedings Track},\n\tvolume       = 23,\n\tpages        = {37.1--37.18},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://www.jmlr.org/proceedings/papers/v23/spielman12/spielman12.pdf}\n}\n@article{DBLP:journals/ml/HazanAK07,\n\ttitle        = {Logarithmic regret algorithms for online convex optimization},\n\tauthor       = {Elad Hazan and Amit Agarwal and Satyen Kale},\n\tyear         = 2007,\n\tmonth        = aug,\n\tjournal      = {Machine Learning},\n\tvolume       = 69,\n\tnumber       = {2-3},\n\tpages        = {169--192},\n\tdoi          = {10.1007/s10994-007-5016-8},\n\tissn         = {0885-6125},\n\turl          = {http://dx.doi.org/10.1007/s10994-007-5016-8},\n\ttimestamp    = {Thu, 13 Mar 2008 10:35:45 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/ml/HazanAK07},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Hazan, Agarwal, Kale - 2007 - Logarithmic regret algorithms for online convex optimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@article{DBLP:journals/tit/JafarpourXHC09,\n\ttitle        = {Efficient and robust compressed sensing using optimized expander graphs},\n\tauthor       = {Sina Jafarpour and Weiyu Xu and Babak Hassibi and A. Robert Calderbank},\n\tyear         = 2009,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 55,\n\tnumber       = 9,\n\tpages        = {4299--4308},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://dx.doi.org/10.1109/TIT.2009.2025528}\n}\n@article{DDV01,\n\ttitle        = {Independent Component Analysis and (Simultaneous) Third-Order Tensor Diagonalization},\n\tauthor       = {De Lathauwer, L. and De Moor, B. 
and Vandewalle, J.},\n\tyear         = 2001,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 49,\n\tnumber       = 10\n}\n@inproceedings{DE,\n\ttitle        = {Optimally sparse representation in general (non-orthogonal) dictionaries via $\\ell_1$-minimization},\n\tauthor       = {D. Donoho and M. Elad},\n\tyear         = 2003,\n\tbooktitle    = {PNAS},\n\tpages        = {2197--2202}\n}\n@article{de1931sul,\n\ttitle        = {On the subjective meaning of probability},\n\tauthor       = {De Finetti, Bruno},\n\tyear         = 1931,\n\tjournal      = {Fundamenta mathematicae},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {298--329}\n}\n@article{de1960problemes,\n\ttitle        = {Les problemes de decisions sequentielles},\n\tauthor       = {De Ghellinck, Guy},\n\tyear         = 1960,\n\tjournal      = {Cahiers du Centre d’Etudes de Recherche Op{\\'e}rationnelle},\n\tvolume       = 2,\n\tnumber       = 2,\n\tpages        = {161--179}\n}\n@article{de1995decoupling,\n\ttitle        = {Decoupling inequalities for the tail probabilities of multivariate U-statistics},\n\tauthor       = {de la Pe{\\~n}a, Victor H and Montgomery-Smith, Stephen J},\n\tyear         = 1995,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {JSTOR},\n\tpages        = {806--816}\n}\n@article{de2000reject,\n\ttitle        = {To reject or not to reject: that is the question-an answer in case of neural classifiers},\n\tauthor       = {Claudio De Stefano and Carlo Sansone and Mario Vento},\n\tyear         = 2000,\n\tjournal      = {IEEE Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews)},\n\tvolume       = 30,\n\tnumber       = 1,\n\tpages        = {84--94}\n}\n@article{de2003linear,\n\ttitle        = {The linear programming approach to approximate dynamic programming},\n\tauthor       = {de Farias, Daniela Pucci and Van Roy, Benjamin},\n\tyear         = 2003,\n\tjournal      = {Operations Research},\n\tpublisher 
   = {INFORMS},\n\tvolume       = 51,\n\tnumber       = 6,\n\tpages        = {850--865}\n}\n@article{de2005tutorial,\n\ttitle        = {A tutorial on the cross-entropy method},\n\tauthor       = {De Boer, Pieter-Tjerk and Kroese, Dirk P and Mannor, Shie and Rubinstein, Reuven Y},\n\tyear         = 2005,\n\tjournal      = {Annals of operations research},\n\tpublisher    = {Springer},\n\tvolume       = 134,\n\tnumber       = 1,\n\tpages        = {19--67}\n}\n@article{de2007fourth,\n\ttitle        = {Fourth-order cumulant-based blind identification of underdetermined mixtures},\n\tauthor       = {De Lathauwer, L. and Castaing, J. and Cardoso, J.-F.},\n\tyear         = 2007,\n\tjournal      = {Signal Processing, IEEE Transactions on},\n\tvolume       = 55,\n\tnumber       = 6,\n\tpages        = {2965--2973}\n}\n@article{de2016deepdive,\n\ttitle        = {Deepdive: declarative knowledge base construction},\n\tauthor       = {Christopher De Sa and Alex Ratner and Christopher R\\'{e} and Jaeho Shin and Feiran Wang and Sen Wu and Ce Zhang},\n\tyear         = 2016,\n\tjournal      = {ACM SIGMOD Record},\n\tvolume       = 45,\n\tnumber       = 1,\n\tpages        = {60--67}\n}\n@book{de2017theory,\n\ttitle        = {Theory of probability: A critical introductory treatment},\n\tauthor       = {De Finetti, Bruno},\n\tyear         = 2017,\n\tpublisher    = {John Wiley \\& Sons},\n\tvolume       = 6\n}\n@article{de2018clinically,\n\ttitle        = {Clinically applicable deep learning for diagnosis and referral in retinal disease},\n\tauthor       = {Jeffrey De Fauw and Joseph R Ledsam and Bernardino Romera-Paredes and Stanislav Nikolov and Nenad Tomasev and Sam Blackwell and Harry Askham and Xavier Glorot and Brendan O’Donoghue and Daniel Visentin and others},\n\tyear         = 2018,\n\tjournal      = {Nature Medicine},\n\tvolume       = 24,\n\tnumber       = 9,\n\tpages        = {1342--1350}\n}\n@article{de2020independent,\n\ttitle        = {Is Independent Learning All You Need 
in the StarCraft Multi-Agent Challenge?},\n\tauthor       = {de Witt, Christian Schroeder and Gupta, Tarun and Makoviichuk, Denys and Makoviychuk, Viktor and Torr, Philip HS and Sun, Mingfei and Whiteson, Shimon},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.09533}\n}\n@inproceedings{de2020regression,\n\ttitle        = {Regression under Human Assistance},\n\tauthor       = {Abir De and Paramita Koley and Niloy Ganguly and Manuel Gomez-Rodriguez},\n\tyear         = 2020,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {2611--2620}\n}\n@inproceedings{dean2008mapreduce,\n\ttitle        = {MapReduce: simplified data processing on large clusters},\n\tauthor       = {Dean, Jeffrey and Ghemawat, Sanjay},\n\tyear         = 2008,\n\tmonth        = jan,\n\tjournal      = {Commun. ACM},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 51,\n\tpages        = {107--113},\n\tdoi          = {http://doi.acm.org/10.1145/1327452.1327492},\n\tissn         = {0001-0782},\n\tacmid        = 1327492,\n\tissue        = 1,\n\tissue_date   = {January 2008},\n\tnumpages     = 7\n}\n@article{dean2019sample,\n\ttitle        = {On the sample complexity of the linear quadratic regulator},\n\tauthor       = {Dean, Sarah and Mania, Horia and Matni, Nikolai and Recht, Benjamin and Tu, Stephen},\n\tyear         = 2019,\n\tjournal      = {Foundations of Computational Mathematics},\n\tpublisher    = {Springer},\n\tpages        = {1--47}\n}\n@article{deb2002fast,\n\ttitle        = {A fast and elitist multiobjective genetic algorithm: {NSGA}-{II}},\n\tauthor       = {Kalyanmoy Deb and Amrit Pratap and Sameer Agarwal and TAMT Meyarivan},\n\tyear         = 2002,\n\tjournal      = {IEEE transactions on evolutionary computation},\n\tvolume       = 6,\n\tnumber       = 2,\n\tpages        = {182--197}\n}\n@article{debruyne2008model,\n\ttitle        = {Model selection in kernel based regression 
using the influence function},\n\tauthor       = {Michiel Debruyne and Mia Hubert and Johan AK Suykens},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 9,\n\tnumber       = {0},\n\tpages        = {2377--2400}\n}\n@article{debruyne2010detecting,\n\ttitle        = {Detecting influential observations in Kernel {PCA}},\n\tauthor       = {Michiel Debruyne and Mia Hubert and Johan Van Horebeek},\n\tyear         = 2010,\n\tjournal      = {Computational Statistics \\& Data Analysis},\n\tvolume       = 54,\n\tnumber       = 12,\n\tpages        = {3007--3019}\n}\n@article{decao2018question,\n\ttitle        = {Question answering by reasoning across documents with graph convolutional networks},\n\tauthor       = {Nicola De Cao and Wilker Aziz and Ivan Titov},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.09920}\n}\n@article{decelle2011asymptotic,\n\ttitle        = {Asymptotic analysis of the stochastic block model for modular networks and its algorithmic applications},\n\tauthor       = {Aurelien Decelle and Florent Krzakala and Cristopher Moore and Lenka Zdeborov{\\'a}},\n\tyear         = 2011,\n\tjournal      = {Physical Review E},\n\tvolume       = 84,\n\tnumber       = 6\n}\n@article{decelle2011inference,\n\ttitle        = {Inference and phase transitions in the detection of modules in sparse networks},\n\tauthor       = {Aurelien Decelle and Florent Krzakala and Cristopher Moore and Lenka Zdeborov{\\'a}},\n\tyear         = 2011,\n\tjournal      = {Physical Review Letters},\n\tvolume       = 107,\n\tnumber       = 6\n}\n@book{dechter03constraint,\n\ttitle        = {Constraint Processing},\n\tauthor       = {Rina Dechter},\n\tyear         = 2003,\n\tpublisher    = {Morgan Kaufmann}\n}\n@article{deemter2005real,\n\ttitle        = {Real versus template-based natural language generation: A false opposition?},\n\tauthor       = {Kees Van Deemter and Mari{\\\"e}t Theune and Emiel Krahmer},\n\tyear 
        = 2005,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 31,\n\tnumber       = 1,\n\tpages        = {15--24}\n}\n@article{deepapply,\n\ttitle        = {Context-Dependent Pre-Trained Deep Neural Networks for Large-Vocabulary Speech Recognition.},\n\tauthor       = {Dahl, George E. and Yu, Dong and Deng, Li and Acero, Alex},\n\tyear         = 2012,\n\tjournal      = {IEEE Transactions on Audio, Speech \\& Language Processing},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {30--42},\n\tkeywords     = {dblp}\n}\n@article{deepsurvey2,\n\ttitle        = {Deep Learning in Neural Networks: An Overview},\n\tauthor       = {J. Schmidhuber},\n\tyear         = 2015,\n\tjournal      = {Neural Networks},\n\tvolume       = 61,\n\tpages        = {85--117},\n\tdoi          = {10.1016/j.neunet.2014.09.003},\n\tnote         = {Published online 2014; based on TR arXiv:1404.7828 [cs.NE]}\n}\n@article{deerwester1990indexing,\n\ttitle        = {Indexing by latent semantic analysis},\n\tauthor       = {Deerwester, Scott C. and Dumais, Susan T and Landauer, Thomas K. and Furnas, George W. and Harshman, Richard A.},\n\tyear         = 1990,\n\tjournal      = {Journal of the American Society for Information Science}\n}\n@article{defarias04constraint,\n\ttitle        = {On Constraint Sampling in the Linear Programming Approach to Approximate Dynamic Programming},\n\tauthor       = {Daniela Pucci {de Farias} and Benjamin {Van Roy}},\n\tyear         = 2004,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 29,\n\tnumber       = 3,\n\tpages        = {462--478}\n}\n@inproceedings{Defazio2014-Finito,\n\ttitle        = {{Finito: A Faster, Permutable Incremental Gradient Method for Big Data Problems}},\n\tauthor       = {Defazio, Aaron J. and Caetano, Tib\\'{e}rio S. 
and Domke, Justin},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 31st International Conference on Machine Learning},\n\tseries       = {ICML 2014},\n\turl          = {http://jmlr.org/proceedings/papers/v32/defazio14.pdf},\n\tabstract     = {Recent advances in optimization theory have shown that smooth strongly convex finite sums can be minimized faster than by treating them as a black box ”batch” problem. In this work we introduce a new method in this class with a theoretical convergence rate four times faster than existing methods, for sums with sufficiently many terms. This method is also amendable to a sampling without replacement scheme that in practice gives further speed-ups. We give empirical results showing state of the art performance. 1},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1407.2710},\n\teprint       = {1407.2710},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Defazio, Caetano, Domke - 2014 - Finito A Faster, Permutable Incremental Gradient Method for Big Data Problems.pdf:pdf},\n\tmendeley-groups = {Optimization/[with Yuan Yang],Optimization/Variance Reduction}\n}\n@inproceedings{Defazio2014-SAGA,\n\ttitle        = {{SAGA: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives}},\n\tauthor       = {Defazio, Aaron and Bach, Francis and {Lacoste-Julien}, Simon},\n\tyear         = 2014,\n\tbooktitle    = {NIPS},\n\tpages        = {1646--1654},\n\turl          = {http://arxiv.org/abs/1407.0202},\n\tabstract     = {In this work we introduce a new optimisation method called SAGA in the spirit of SAG, SDCA, MISO and SVRG, a set of recently proposed incremental gradient algorithms with fast linear convergence rates. SAGA improves on the theory behind SAG and SVRG, with better theoretical convergence rates, and has support for composite objectives where a proximal operator is used on the regulariser. 
Unlike SDCA, SAGA supports non-strongly convex problems directly, and is adaptive to any inherent strong convexity of the problem. We give experimental results showing the effectiveness of our method.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:1407.0202v2},\n\teprint       = {arXiv:1407.0202v2},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Defazio, Bach, Lacoste-Julien - 2014 - SAGA A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives.pdf:pdf},\n\tmendeley-groups = {Optimization/[with Yuan Yang],Optimization/Variance Reduction}\n}\n@inproceedings{defazio2014saga,\n\ttitle        = {SAGA: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives},\n\tauthor       = {Aaron Defazio and Francis Bach and Simon Lacoste-Julien},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{defries1994ndvi,\n\ttitle        = {{NDVI}-derived land cover classifications at a global scale},\n\tauthor       = {R S DeFries and JRG Townshend},\n\tyear         = 1994,\n\tjournal      = {International Journal of Remote Sensing},\n\tvolume       = 15,\n\tnumber       = 17,\n\tpages        = {3567--3586}\n}\n@article{defries1995AVHRR,\n\ttitle        = {Global discrimination of land cover types from metrics derived from {AVHRR} pathfinder data},\n\tauthor       = {Ruth DeFries and Matthew Hansen and John Townshend},\n\tyear         = 1995,\n\tjournal      = {Remote Sensing of Environment},\n\tvolume       = 54,\n\tnumber       = 3,\n\tpages        = {209--222}\n}\n@article{degrave2020ai,\n\ttitle        = {{AI} for radiographic {COVID-19} detection selects shortcuts over signal},\n\tauthor       = {Alex J DeGrave and Joseph D Janizek and Su-In Lee},\n\tyear         = 2020,\n\tjournal      = {medRxiv}\n}\n@article{degroot1983forecasters,\n\ttitle        = {The Comparison and Evaluation of Forecasters},\n\tauthor    
   = {Morris H. DeGroot and Stephen E. Fienberg},\n\tyear         = 1983,\n\tjournal      = {Journal of the Royal Statistical Society. Series D (The Statistician)},\n\tvolume       = 32,\n\tpages        = {12--22}\n}\n@inproceedings{degwekar2019computational,\n\ttitle        = {Computational Limitations in Robust Classification and Win-Win Results},\n\tauthor       = {Akshay Degwekar and Preetum Nakkiran and Vinod Vaikuntanathan},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{DeH06a,\n\ttitle        = {The rate of convergence for the cyclic projections algorithm. {I}. {A}ngles between convex sets},\n\tauthor       = {Deutsch, Frank and Hundal, Hein},\n\tyear         = 2006,\n\tjournal      = {J. Approx. Theory},\n\tvolume       = 142,\n\tnumber       = 1,\n\tpages        = {36--55},\n\tdoi          = {10.1016/j.jat.2006.02.005},\n\tissn         = {0021-9045},\n\turl          = {http://dx.doi.org/10.1016/j.jat.2006.02.005},\n\tfjournal     = {Journal of Approximation Theory},\n\tmrclass      = {41A65 (46N10 47H09)},\n\tmrnumber     = 2257064\n}\n@article{DeH06b,\n\ttitle        = {The rate of convergence for the cyclic projections algorithm. {II}. {N}orms of nonlinear operators},\n\tauthor       = {Deutsch, Frank and Hundal, Hein},\n\tyear         = 2006,\n\tjournal      = {J. Approx. Theory},\n\tvolume       = 142,\n\tnumber       = 1,\n\tpages        = {56--82},\n\tdoi          = {10.1016/j.jat.2006.02.006},\n\tissn         = {0021-9045},\n\turl          = {http://dx.doi.org/10.1016/j.jat.2006.02.006},\n\tfjournal     = {Journal of Approximation Theory},\n\tmrclass      = {41A65 (46N10 47H09)},\n\tmrnumber     = 2257065,\n\tmrreviewer   = {Heinz H. Bauschke}\n}\n@article{DeH08,\n\ttitle        = {The rate of convergence for the Cyclic Projections Algorithm {III}: {R}egularity of Convex Sets},\n\tauthor       = {Deutsch, Frank and Hundal, Hein},\n\tyear         = 2008,\n\tjournal      = {J. Approx. 
Theory},\n\tpublisher    = {Academic Press, Inc.},\n\taddress      = {Orlando, FL, USA},\n\tvolume       = 155,\n\tnumber       = 2,\n\tpages        = {155--184},\n\tdoi          = {10.1016/j.jat.2008.04.001},\n\tissn         = {0021-9045},\n\turl          = {http://dx.doi.org/10.1016/j.jat.2008.04.001},\n\tacmid        = 1465355,\n\tissue_date   = {December, 2008},\n\tkeywords     = {Alternating projections, Angle between convex sets, Angle between subspaces, Convex feasibility problem, Cyclic projections, Norm of nonlinear operators, Orthogonal projections, POCS, Projections onto convex sets, Rate of convergence, Regularity properties of convex sets: regular, linearly regular, boundedly regular, boundedly linearly regular, normal, weakly normal, uniformly normal, The strong conical hull intersection property (strong CHIP)},\n\tnumpages     = 30\n}\n@article{dehaene2020self,\n\ttitle        = {Self-Supervision Closes the Gap Between Weak and Strong Supervision in Histology},\n\tauthor       = {Olivier Dehaene and Axel Camara and Olivier Moindrot and Axel de Lavergne and Pierre Courtiol},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.03583}\n}\n@inproceedings{deisenrothmodel,\n\ttitle        = {{PILCO:} A model-based and data-efficient approach to policy search},\n\tauthor       = {Deisenroth, Marc P and Rasmussen, Carl M},\n\tbooktitle    = {Proceedings of the 28th International Conference on Machine Learning},\n\tpages        = {465--472}\n}\n@inproceedings{deka2016erica,\n\ttitle        = {ERICA: Interaction Mining Mobile Apps},\n\tauthor       = {Biplab Deka and Zifeng Huang and Ranjitha Kumar},\n\tyear         = 2016,\n\tbooktitle    = {User Interface Software and Technology (UIST)},\n\tpages        = {767--776}\n}\n@article{Dekel2012,\n\ttitle        = {{Optimal distributed online prediction using mini-batches}},\n\tauthor       = {Dekel, Ofer and {Gilad-Bachrach}, Ran and Shamir, Ohad and Xiao, Lin},\n\tyear         = 2012,\n\tjournal  
    = {The Journal of Machine Learning Research},\n\tvolume       = 13,\n\tnumber       = 1,\n\tpages        = {165--202},\n\tisbn         = {978-1-4503-0619-5},\n\tissn         = {1532-4435},\n\tabstract     = {Online prediction methods are typically presented as serial algorithms running on a single processor. However, in the age of web-scale prediction problems, it is increasingly common to encounter situations where a single processor cannot keep up with the high rate at which inputs arrive. In this work, we present the $\\backslash$emph\\{distributed mini-batch\\} algorithm, a method of converting many serial gradient-based online prediction algorithms into distributed algorithms. We prove a regret bound for this method that is asymptotically optimal for smooth convex loss functions and stochastic inputs. Moreover, our analysis explicitly takes into account communication latencies between nodes in the distributed environment. We show how our method can be used to solve the closely-related distributed stochastic optimization problem, achieving an asymptotically linear speed-up over multiple processors. Finally, we demonstrate the merits of our approach on a web-scale online prediction problem.},\n\tannote       = {Contains some information about \"using mirror descent steps\" on smooth objectives, though analyzed in stochastic way.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1012.1367},\n\teprint       = {1012.1367},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Dekel et al. 
- 2012 - Optimal distributed online prediction using mini-batches.pdf:pdf},\n\tkeywords     = {convex,distributed computing,online learning,regret bounds,stochastic optimization},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@book{dekel2015epistemic,\n\ttitle        = {Epistemic game theory},\n\tauthor       = {Eddie Dekel and Marciano Siniscalchi},\n\tyear         = 2015,\n\tpublisher    = {Handbook of Game Theory with Economic Applications},\n\tvolume       = 4,\n\tpages        = {619--702}\n}\n@inproceedings{delage06dbn,\n\ttitle        = {A dynamic {B}ayesian network model for autonomous 3d reconstruction},\n\tauthor       = {Erick Delage and Honglak Lee and Andrew Y. Ng},\n\tyear         = 2006,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{delage2010distributionally,\n\ttitle        = {Distributionally robust optimization under moment uncertainty with application to data-driven problems},\n\tauthor       = {Erick Delage and Yinyu Ye},\n\tyear         = 2010,\n\tjournal      = {Operations research},\n\tvolume       = 58,\n\tnumber       = 3,\n\tpages        = {595--612}\n}\n@article{delathauwer2001independent,\n\ttitle        = {Independent component analysis and (simultaneous) third-order tensor diagonalization},\n\tauthor       = {Lieven De Lathauwer and Bart De Moor and Joos Vandewalle},\n\tyear         = 2001,\n\tjournal      = {Signal Processing, IEEE Transactions on},\n\tvolume       = 49,\n\tnumber       = 10,\n\tpages        = {2262--2271}\n}\n@article{delathauwer2006decomposition,\n\ttitle        = {A Link Between the Canonical Decomposition in Multilinear Algebra and Simultaneous Matrix Diagonalization},\n\tauthor       = {Lieven De Lathauwer},\n\tyear         = 2006,\n\tjournal      = {SIAM Journal of Matrix Analysis and Applications},\n\tvolume       = 28,\n\tnumber       = 3,\n\tpages        = {642--666}\n}\n@article{deletang2021causal,\n\ttitle        = {Causal Analysis of Agent Behavior 
for {AI} Safety},\n\tauthor       = {Grégoire Déletang and J. Grau-Moya and Miljan Martic and Tim Genewein and Tom McGrath and Vladimir Mikulik and M. Kunesch and S. Legg and Pedro A. Ortega},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.03938}\n}\n@article{delfosse1995adaptive,\n\ttitle        = {Adaptive blind separation of independent sources: a deflation approach},\n\tauthor       = {Delfosse, N. and Loubaton, P.},\n\tyear         = 1995,\n\tjournal      = {Signal processing},\n\tpublisher    = {Elsevier},\n\tvolume       = 45,\n\tnumber       = 1,\n\tpages        = {59--83}\n}\n@article{della1997inducing,\n\ttitle        = {Inducing features of random fields},\n\tauthor       = {Stephen Della Pietra and Vincent Della Pietra and John Lafferty},\n\tyear         = 1997,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 19,\n\tnumber       = 4,\n\tpages        = {380--393}\n}\n@article{dellarocas2006strategic,\n\ttitle        = {Strategic manipulation of internet opinion forums: Implications for consumers and firms},\n\tauthor       = {Chrysanthos Dellarocas},\n\tyear         = 2006,\n\tjournal      = {Management science},\n\tvolume       = 52,\n\tnumber       = 10,\n\tpages        = {1577--1593}\n}\n@inproceedings{demeester2016lifted,\n\ttitle        = {Lifted Rule Injection for Relation Embeddings},\n\tauthor       = {Thomas Demeester and Tim Rockt{\\\"{a}}schel and Sebastian Riedel},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{deming1944theory,\n\ttitle        = {Theory of Games and Economic Behavior},\n\tauthor       = {W. E. Deming and J. Neumann and O. 
Morgenstern},\n\tyear         = 1944,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 40\n}\n@article{demszky2018transforming,\n\ttitle        = {Transforming Question Answering Datasets Into Natural Language Inference Datasets},\n\tauthor       = {Dorottya Demszky and Kelvin Guu and Percy Liang},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.02922}\n}\n@inproceedings{dendamrongvit2009undersampling,\n\ttitle        = {Undersampling Approach for Imbalanced Training Sets and Induction from Multi-label Text-Categorization Domains},\n\tauthor       = {Sareewan Dendamrongvit and Miroslav Kubat},\n\tyear         = 2009,\n\tbooktitle    = {PAKDD Workshop on New Frontiers in Applied Data Mining}\n}\n@inproceedings{denero08phrase,\n\ttitle        = {Sampling Alignment Structure under a {B}ayesian Translation Model},\n\tauthor       = {John DeNero and Alexandre Bouchard-C\\^ot\\'e and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {314--323}\n}\n@inproceedings{denero09efficient,\n\ttitle        = {Efficient Parsing for Transducer Grammars},\n\tauthor       = {John DeNero and Mohit Bansal and Adam Pauls and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {227--235}\n}\n@inproceedings{deng2009imagenet,\n\ttitle        = {{I}mage{N}et: A large-scale hierarchical image database},\n\tauthor       = {Jia Deng and Wei Dong and Richard Socher and Li-Jia Li and Kai Li and Li Fei-Fei},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {248--255}\n}\n@article{deng2016global,\n\ttitle        = {On the global and linear convergence of the generalized alternating direction method of multipliers},\n\tauthor       = {Deng, Wei and Yin, Wotao},\n\tyear         = 2016,\n\tjournal      = 
{Journal of Scientific Computing},\n\tpublisher    = {Springer},\n\tvolume       = 66,\n\tnumber       = 3,\n\tpages        = {889--916}\n}\n@inproceedings{deng2018adversarial,\n\ttitle        = {Adversarial Active Learning for Sequences Labeling and Generation},\n\tauthor       = {Yue Deng and KaWai Chen and Yilin Shen and Hongxia Jin},\n\tyear         = 2018,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {4012--4018}\n}\n@inproceedings{denkowski2014meteor,\n\ttitle        = {Meteor Universal: Language Specific Translation Evaluation for Any Target Language},\n\tauthor       = {Michael Denkowski and Alon Lavie},\n\tyear         = 2014,\n\tbooktitle    = {Workshop on Statistical Machine Translation}\n}\n@article{derksen2013matrix,\n\ttitle        = {Matrix Completion and Tensor Rank},\n\tauthor       = {Derksen, Harm},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1302.2639}\n}\n@inproceedings{desai09smoothed,\n\ttitle        = {A Smoothed Approximate Linear Program},\n\tauthor       = {Vijay Desai and Vivek Farias and Ciamac C. 
Moallemi},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems 22 (NIPS)},\n\tpages        = {459--467}\n}\n@inproceedings{desai2016program,\n\ttitle        = {Program synthesis using natural language},\n\tauthor       = {Aditya Desai and Sumit Gulwani and Vineet Hingorani and Nidhi Jain and Amey Karkare and Mark Marron and Sailesh R and Subhajit Roy},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)},\n\tpages        = {345--356}\n}\n@article{deselaers12latent,\n\ttitle        = {Latent log-linear models for handwritten digit classification},\n\tauthor       = {Thomas Deselaers and Tobias Gass and Georg Heigold and Hermann Ney},\n\tyear         = 2012,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 34,\n\tpages        = {1105--1117}\n}\n@inproceedings{deshpande2004model,\n\ttitle        = {Model-driven data acquisition in sensor networks},\n\tauthor       = {\n\t\tDeshpande, Amol and Guestrin, Carlos and Madden, Samuel R. and Hellerstein,\n\n\t\tJoseph M. 
and Hong, Wei\n\t},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tProceedings of the Thirtieth international conference on Very large\n\n\t\tdata bases - Volume 30\n\t},\n\tlocation     = {Toronto, Canada},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '04},\n\tpages        = {588--599},\n\tisbn         = {0-12-088469-0},\n\tacmid        = 1316741,\n\tnumpages     = 12\n}\n@article{desilva2008tensor,\n\ttitle        = {Tensor Rank and the {Ill-Posedness} of the Best {Low-Rank} Approximation Problem},\n\tauthor       = {de Silva, V and Lim, L},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 30,\n\tpages        = {1084--1127}\n}\n@article{desmarais2012review,\n\ttitle        = {A review of recent advances in learner and skill modeling in intelligent learning environments},\n\tauthor       = {Michel C Desmarais and Ryan Baker},\n\tyear         = 2012,\n\tjournal      = {User Modeling and User-Adapted Interaction},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {9--38}\n}\n@inproceedings{devault2015toward,\n\ttitle        = {Toward Natural Turn-taking in a Virtual Human Negotiation Agent},\n\tauthor       = {David DeVault and Johnathan Mell and Jonathan Gratch},\n\tyear         = 2015,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{devlin2012dynamic,\n\ttitle        = {Dynamic potential-based reward shaping},\n\tauthor       = {Sam Devlin and Daniel Kudenko},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Autonomous Agents and Multiagent Systems (AAMAS)}\n}\n@inproceedings{devlin2016captioning,\n\ttitle        = {Language Models for Image Captioning: The Quirks and What Works},\n\tauthor       = {Jacob Devlin and Hao Cheng and Hao Fang and Saurabh Gupta and Li Deng and Xiaodong He and Geoffrey Zweig and Margaret Mitchell},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational 
Linguistics (ACL)}\n}\n@inproceedings{devlin2017robustfill,\n\ttitle        = {RobustFill: Neural Program Learning Under Noisy I/O},\n\tauthor       = {Jacob Devlin and Jonathan Uesato and Surya Bhupatiraju and Rishabh Singh and Abdel-rahman Mohamed and Pushmeet Kohli},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{devlin2017varreplace,\n\ttitle        = {Semantic Code Repair Using Neuro-Symbolic Transformation Networks},\n\tauthor       = {Jacob Devlin and Jonathan Uesato and Rishabh Singh and Pushmeet Kohli},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.11054}\n}\n@article{devlin2018bert,\n\ttitle        = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},\n\tauthor       = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.04805}\n}\n@inproceedings{devlin2019bert,\n\ttitle        = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},\n\tauthor       = {Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {4171--4186}\n}\n@article{devolder2014first,\n\ttitle        = {First-order methods of smooth convex optimization with inexact oracle},\n\tauthor       = {Olivier Devolder and Fran{\\c{c}}ois Glineur and Yurii Nesterov},\n\tyear         = 2014,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 146,\n\tpages        = {37--75}\n}\n@article{devries2017improved,\n\ttitle        = {Improved regularization of convolutional neural networks with cutout},\n\tauthor       = {DeVries, Terrance and Taylor, Graham W},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.04552}\n}\n@article{devries2018talk,\n\ttitle        = {Talk the Walk: Navigating New York City through 
Grounded Dialogue},\n\tauthor       = {Harm de Vries and Kurt Shuster and Dhruv Batra and Devi Parikh and Jason Weston and Douwe Kiela},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.03367}\n}\n@book{devroye2012combinatorial,\n\ttitle        = {Combinatorial methods in density estimation},\n\tauthor       = {Devroye, Luc and Lugosi, G{\\'a}bor},\n\tyear         = 2012,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{deyes1984towards,\n\ttitle        = {Towards an authentic `discourse cloze'},\n\tauthor       = {Tony Deyes},\n\tyear         = 1984,\n\tjournal      = {Applied Linguistics},\n\tvolume       = 5,\n\tnumber       = 2,\n\tpages        = {128--137}\n}\n@misc{dfc2008dfc,\n\ttitle        = {\n\t\tDFC Intelligence Forecasts Video Game Market to Reach \\$ 57 Billion\n\n\t\tin 2009\n\t},\n\tauthor       = {DFC},\n\tyear         = 2008,\n\tmonth        = jun,\n\turl          = {http://www.dfcint.com/wp/?p=222},\n\towner        = {leili},\n\ttimestamp    = {2009.11.20}\n}\n@article{DG03,\n\ttitle        = {An elementary proof of a theorem of {J}ohnson and {L}indenstrauss},\n\tauthor       = {S. Dasgupta and A. Gupta},\n\tyear         = 2003,\n\tjournal      = {Random Structures and Algorithms},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {60--65}\n}\n@article{dg18,\n\ttitle        = {Improved Learning of One-hidden-layer Convolutional Neural Networks with Overlaps},\n\tauthor       = {Du, Simon S and Goel, Surbhi},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.07798}\n}\n@inproceedings{DH,\n\ttitle        = {Uncertainty principles and ideal atomic decomposition},\n\tauthor       = {D. Donoho and X. Huo},\n\tyear         = 1999,\n\tbooktitle    = {IEEE Trans. 
on Information Theory},\n\tpages        = {2845--2862}\n}\n@article{dhamdhere2017abductive,\n\ttitle        = {Abductive Matching in Question Answering},\n\tauthor       = {Kedar Dhamdhere and Kevin S McCurley and Mukund Sundararajan and Ankur Taly},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.03036}\n}\n@inproceedings{dhamdhere2017analyza,\n\ttitle        = {Analyza: Exploring data with conversation},\n\tauthor       = {Kedar Dhamdhere and Kevin S McCurley and Ralfi Nahmias and Mukund Sundararajan and Qiqi Yan},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 22nd International Conference on Intelligent User Interfaces}\n}\n@inproceedings{dhillon2011multi,\n\ttitle        = {Multi-View Learning of Word Embeddings via CCA},\n\tauthor       = {Dhillon, Paramveer and Foster, Dean P and Ungar, Lyle H},\n\tyear         = 2011,\n\tbooktitle    = {NIPS},\n\tpages        = {199--207}\n}\n@inproceedings{dhingra2017information,\n\ttitle        = {End-to-End Reinforcement Learning of Dialogue Agents for Information Access},\n\tauthor       = {Bhuwan Dhingra and Lihong Li and Xiujun Li and Jianfeng Gao and Yun-Nung Chen and Faisal Ahmed and Li Deng},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{di2009multilevel,\n\ttitle        = {Multilevel functional principal component analysis},\n\tauthor       = {Chong-Zhi Di and Ciprian M Crainiceanu and Brian S Caffo and Naresh M Punjabi},\n\tyear         = 2009,\n\tjournal      = {The Annals of Applied Statistics},\n\tvolume       = 3,\n\tnumber       = 1\n}\n@article{diakonikolas2016bayes,\n\ttitle        = {Robust learning of fixed-structure {B}ayesian networks},\n\tauthor       = {Ilias Diakonikolas and Daniel Kane and Alistair Stewart},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{diakonikolas2016robust,\n\ttitle        = {Robust Estimators in High Dimensions without the Computational 
Intractability},\n\tauthor       = {Ilias Diakonikolas and Gautam Kamath and Daniel Kane and Jerry Li and Ankur Moitra and Alistair Stewart},\n\tyear         = 2016,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{diakonikolas2016statistical,\n\ttitle        = {Statistical query lower bounds for robust estimation of high-dimensional {G}aussians and {G}aussian mixtures},\n\tauthor       = {Ilias Diakonikolas and Daniel M. Kane and Alistair Stewart},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{diakonikolas2017learning,\n\ttitle        = {Learning Geometric Concepts with Nasty Noise},\n\tauthor       = {Ilias Diakonikolas and Daniel M. Kane and Alistair Stewart},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{diakonikolas2017practical,\n\ttitle        = {Being Robust (in High Dimensions) Can Be Practical},\n\tauthor       = {Ilias Diakonikolas and Gautam Kamath and Daniel Kane and Jerry Li and Ankur Moitra and Alistair Stewart},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{diakonikolas2017robustly,\n\ttitle        = {Robustly Learning a {G}aussian: Getting Optimal Error, Efficiently},\n\tauthor       = {Ilias Diakonikolas and Gautam Kamath and Daniel M. Kane and Jerry Li and Ankur Moitra and Alistair Stewart},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{diakonikolas2018list,\n\ttitle        = {List-Decodable Robust Mean Estimation and Learning Mixtures of Spherical {G}aussians},\n\tauthor       = {Ilias Diakonikolas and Daniel M. Kane and Alistair Stewart},\n\tyear         = 2018,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@article{diakonikolas2018sever,\n\ttitle        = {Sever: A Robust Meta-Algorithm for Stochastic Optimization},\n\tauthor       = {Ilias Diakonikolas and Gautam Kamath and Daniel M. 
Kane and Jerry Li and Jacob Steinhardt and Alistair Stewart},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.02815}\n}\n@article{diamond2016cvxpy,\n\ttitle        = {{CVXPY}: A {P}ython-Embedded Modeling Language for Convex Optimization},\n\tauthor       = {Steven Diamond and Stephen Boyd},\n\tyear         = 2016,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 17,\n\tnumber       = 83,\n\tpages        = {1--5}\n}\n@article{dickinson2014computational,\n\ttitle        = {On the computational complexity of membership problems for the completely positive cone and its dual},\n\tauthor       = {Dickinson, Peter JC and Gijben, Luuk},\n\tyear         = 2014,\n\tjournal      = {Computational optimization and applications},\n\tpublisher    = {Springer},\n\tvolume       = 57,\n\tnumber       = 2,\n\tpages        = {403--415}\n}\n@inproceedings{dietterich1998maxq,\n\ttitle        = {The {MAXQ} Method for Hierarchical Reinforcement Learning},\n\tauthor       = {T. G. Dietterich},\n\tyear         = 1998,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{dietterich2000hierarchical,\n\ttitle        = {Hierarchical reinforcement learning with the {MAXQ} value function decomposition},\n\tauthor       = {T. G. Dietterich},\n\tyear         = 2000,\n\tjournal      = {Journal of Artificial Intelligence Research},\n\tpages        = {227--303}\n}\n@inproceedings{dietterich2000state,\n\ttitle        = {State abstraction in {MAXQ} hierarchical reinforcement learning},\n\tauthor       = {Thomas G Dietterich},\n\tyear         = 2000,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {994--1000}\n}\n@inproceedings{dietterich2013pac,\n\ttitle        = {{PAC} Optimal Planning for Invasive Species Management: Improved Exploration for Reinforcement Learning from Simulator-Defined {MDP}s},\n\tauthor       = {Dietterich, Thomas G. 
and Taleghan, Majid Alkaee and Crowley, Mark},\n\tyear         = 2013,\n\tbooktitle    = {AAAI}\n}\n@article{dieuleveut2020bridging,\n\ttitle        = {Bridging the gap between constant step size stochastic gradient descent and markov chains},\n\tauthor       = {Dieuleveut, Aymeric and Durmus, Alain and Bach, Francis},\n\tyear         = 2020,\n\tjournal      = {Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 48,\n\tnumber       = 3,\n\tpages        = {1348--1382}\n}\n@article{digitalglobe2016spacenet,\n\ttitle        = {SpaceNet},\n\tauthor       = {N. DigitalGlobe and CosmiQ Works},\n\tyear         = 2016,\n\tjournal      = {https://aws.amazon.com/publicdatasets/spacenet/}\n}\n@article{dijkstra1978ewd667,\n\ttitle        = {On the foolishness of ``natural language programming''},\n\tauthor       = {Edsger W. Dijkstra},\n\tyear         = 1978,\n\tjournal      = {EWD667}\n}\n@inproceedings{dimarco1993nearsynonym,\n\ttitle        = {The semantic and stylistic differentiation of synonyms and near-synonyms},\n\tauthor       = {Chrysanne DiMarco and Graeme Hirst and Manfred Stede},\n\tyear         = 1993,\n\tbooktitle    = {AAAI Spring Symposium on Building Lexicons for Machine Translation}\n}\n@article{dinan2018wizard,\n\ttitle        = {Wizard of {Wikipedia}: Knowledge-Powered Conversational agents},\n\tauthor       = {Emily Dinan and Stephen Roller and Kurt Shuster and Angela Fan and Michael Auli and Jason Weston},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.01241}\n}\n@inproceedings{dinesh11regulatory,\n\ttitle        = {Computing Logical Form on Regulatory Texts},\n\tauthor       = {Nikhil Dinesh and Aravind Joshi and Insup Lee},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{ding2004k,\n\ttitle        = {K-means clustering via principal component analysis},\n\tauthor       = {Ding, Chris and He, Xiaofeng},\n\tyear     
    = 2004,\n\tbooktitle    = {\n\t\tProceedings of the twenty-first international conference on Machine\n\n\t\tlearning\n\t},\n\tlocation     = {Banff, Alberta, Canada},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {ICML '04},\n\tpages        = {29--},\n\tdoi          = {http://doi.acm.org/10.1145/1015330.1015408},\n\tisbn         = {1-58113-838-5},\n\tacmid        = 1015408\n}\n@article{ding2010convex,\n\ttitle        = {Convex and semi-nonnegative matrix factorizations},\n\tauthor       = {Ding, Chris and Li, Tao and Jordan, Michael I},\n\tyear         = 2010,\n\tjournal      = {Pattern Analysis and Machine Intelligence, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 32,\n\tnumber       = 1,\n\tpages        = {45--55}\n}\n@article{ding2014efficient,\n\ttitle        = {Efficient Distributed Topic Modeling with Provable Guarantees},\n\tauthor       = {Weicong Ding and Mohammad H. Rohban and Prakash Ishwar and Venkatesh Saligrama},\n\tyear         = 2014,\n\tjournal      = {JMLR},\n\tpages        = {167--175}\n}\n@inproceedings{ding2019goal,\n\ttitle        = {Goal-conditioned Imitation Learning},\n\tauthor       = {Yiming Ding and Carlos Florensa and Mariano Phielipp and P. Abbeel},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{ding2020patching,\n\ttitle        = {Patching as Translation: the Data and the Metaphor},\n\tauthor       = {Yangruibo Ding and Baishakhi Ray and Premkumar Devanbu and Vincent J. 
Hellendoorn},\n\tyear         = 2020,\n\tbooktitle    = {Automated Software Engineering (ASE)}\n}\n@inproceedings{dinh2017sharp,\n\ttitle        = {Sharp minima can generalize for deep nets},\n\tauthor       = {Dinh, Laurent and Pascanu, Razvan and Bengio, Samy and Bengio, Yoshua},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 34th International Conference on Machine Learning-Volume 70},\n\tpages        = {1019--1028},\n\torganization = {JMLR. org}\n}\n@article{Dinic1970,\n\ttitle        = {Algorithm for solution of a problem of maximum flow in networks with power estimation},\n\tauthor       = {Dinic, E. A.},\n\tyear         = 1970,\n\tjournal      = {Soviet Math Doklady},\n\tvolume       = 11,\n\tpages        = {1277--1280}\n}\n@inproceedings{dixon2018measuring,\n\ttitle        = {Measuring and mitigating unintended bias in text classification},\n\tauthor       = {Lucas Dixon and John Li and Jeffrey Sorensen and Nithum Thain and Lucy Vasserman},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {67--73}\n}\n@article{djolonga2020robustness,\n\ttitle        = {On robustness and transferability of convolutional neural networks},\n\tauthor       = {Josip Djolonga and Jessica Yung and Michael Tschannen and Rob Romijnders and Lucas Beyer and Alexander Kolesnikov and Joan Puigcerver and Matthias Minderer and Alexander D'Amour and Dan Moldovan and others},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.08558}\n}\n@article{DLR,\n\ttitle        = {Maximum likelihood from incomplete data via the EM Algorithm},\n\tauthor       = {A.~P. Dempster and N.~M. Laird and D.~B. Rubin},\n\tyear         = 1977,\n\tjournal      = {J. Roy. Statist. Soc. Ser. 
B},\n\tvolume       = 39,\n\tnumber       = 1,\n\tpages        = {1--38}\n}\n@inproceedings{dlt18,\n\ttitle        = {When is a Convolutional Filter Easy to Learn?},\n\tauthor       = {Du, Simon S and Lee, Jason D and Tian, Yuandong},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1709.06129},\n\tbooktitle    = {ICLR}\n}\n@inproceedings{dltps18,\n\ttitle        = {Gradient Descent Learns One-hidden-layer {CNN:} Don't be Afraid of Spurious Local Minima},\n\tauthor       = {Simon S. Du and Jason D. Lee and Yuandong Tian and Barnab{\\'{a}}s P{\\'{o}}czos and Aarti Singh},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {http://arxiv.org/abs/1712.00779}\n}\n@inproceedings{DMA,\n\ttitle        = {Greedy adaptive approximations},\n\tauthor       = {G. Davis and S. Mallat and M. Avellaneda},\n\tyear         = 1997,\n\tbooktitle    = {J. of Constructive Approximation},\n\tpages        = {57--98}\n}\n@inproceedings{do2005transfer,\n\ttitle        = {Transfer learning for text classification},\n\tauthor       = {Do, Chuong and Ng, Andrew Y},\n\tyear         = 2005,\n\tbooktitle    = {NIPS},\n\tpages        = {299--306}\n}\n@inproceedings{doan2003learning,\n\ttitle        = {Learning to match ontologies on the semantic web},\n\tauthor       = {AnHai Doan and Jayant Madhavan and Robin Dhamankar and Pedro Domingos and Alon Halevy},\n\tyear         = 2003,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tpages        = {303--320}\n}\n@article{dobriban2015high,\n\ttitle        = {High-Dimensional Asymptotics of Prediction: Ridge Regression and Classification},\n\tauthor       = {Edgar Dobriban and Stefan Wager},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{dodge2020finetuning,\n\ttitle        = {Finetuning pretrained language models: Weight initializations, data orders, and early stopping},\n\tauthor       = {Jesse Dodge and Gabriel Ilharco and Roy Schwartz and Ali Farhadi 
and Hannaneh Hajishirzi and Noah Smith},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{doeblin1940elements,\n\ttitle        = {Elements d'une theorie generale des chaines simples constantes de Markoff},\n\tauthor       = {W Doeblin},\n\tyear         = 1940,\n\tbooktitle    = {Annales scientifiques de l'École Normale Supérieure},\n\tvolume       = 57,\n\tpages        = {61--111}\n}\n@article{doersch2016tutorial,\n\ttitle        = {Tutorial on variational autoencoders},\n\tauthor       = {Carl Doersch},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.05908}\n}\n@article{doi:10.1093/qmath/11.1.50,\n\ttitle        = {Symmetric gauge functions and unitarily invariant norms},\n\tauthor       = {Mirsky, L.},\n\tyear         = 1960,\n\tjournal      = {The Quarterly Journal of Mathematics},\n\tvolume       = 11,\n\tnumber       = 1,\n\tpages        = {50--59},\n\tdoi          = {10.1093/qmath/11.1.50},\n\turl          = {http://dx.doi.org/10.1093/qmath/11.1.50},\n\teprint       = {/oup/backfile/content_public/journal/qjmath/11/1/10.1093_qmath_11.1.50/3/11-1-50.pdf}\n}\n@inproceedings{dolan2004unsupervised,\n\ttitle        = {Unsupervised construction of large paraphrase corpora: Exploiting massively parallel news sources},\n\tauthor       = {Bill Dolan and Chris Quirk and Chris Brockett},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{dolan2005mrpc,\n\ttitle        = {Automatically constructing a corpus of sentential paraphrases},\n\tauthor       = {William B Dolan and Chris Brockett},\n\tyear         = 2005,\n\tbooktitle    = {International Workshop on Paraphrasing (IWP)}\n}\n@inproceedings{domingos2000unified,\n\ttitle        = {A unified bias-variance decomposition},\n\tauthor       = {Pedro Domingos},\n\tyear         = 2000,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = 
{231--238}\n}\n@inproceedings{domke2011parameter,\n\ttitle        = {Parameter learning with truncated message-passing},\n\tauthor       = {Justin Domke},\n\tyear         = 2011,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2937--2943}\n}\n@inproceedings{donahue2014decaf,\n\ttitle        = {DeCAF: A Deep Convolutional Activation Feature for Generic Visual Recognition},\n\tauthor       = {Jeff Donahue and Yangqing Jia and Oriol Vinyals and Judy Hoffman and Ning Zhang and Eric Tzeng and Trevor Darrell},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tvolume       = 32,\n\tpages        = {647--655}\n}\n@inproceedings{donahue2020infilling,\n\ttitle        = {Enabling Language Models to Fill in the Blanks},\n\tauthor       = {Chris Donahue and Mina Lee and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{dong2014knowledge,\n\ttitle        = {Knowledge {v}ault: A web-scale approach to probabilistic knowledge fusion},\n\tauthor       = {Xin Dong and Evgeniy Gabrilovich and Geremy Heitz and Wilko Horn and Ni Lao and Kevin Murphy and Thomas Strohmann and Shaohua Sun and Wei Zhang},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {601--610}\n}\n@inproceedings{dong2016logical,\n\ttitle        = {Language to Logical Form with Neural Attention},\n\tauthor       = {Li Dong and Mirella Lapata},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{dong2018coarse,\n\ttitle        = {Coarse-to-Fine Decoding for Neural Semantic Parsing},\n\tauthor       = {Li Dong and Mirella Lapata},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{dong2018confidence,\n\ttitle        = {Confidence modeling for neural semantic 
parsing},\n\tauthor       = {Li Dong and Chris Quirk and Mirella Lapata},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{dong2018information,\n\ttitle        = {An information-theoretic analysis for thompson sampling with many actions},\n\tauthor       = {Dong, Shi and Van Roy, Benjamin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.11845}\n}\n@inproceedings{dong2019performance,\n\ttitle        = {On the Performance of Thompson Sampling on Logistic Bandits},\n\tauthor       = {Dong, Shi and Ma, Tengyu and Van Roy, Benjamin},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1158--1160}\n}\n@article{dong2019q,\n\ttitle        = {Q-learning with ucb exploration is sample efficient for infinite-horizon mdp},\n\tauthor       = {Dong, Kefan and Wang, Yuanhao and Chen, Xiaoyu and Wang, Liwei},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.09311}\n}\n@inproceedings{dong2019sqrt,\n\ttitle        = {$\\sqrt{n}$-Regret for Learning in {M}arkov Decision Processes with Function Approximation and Low {B}ellman Rank},\n\tauthor       = {Dong, Kefan and Peng, Jian and Wang, Yining and Zhou, Yuan},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Learning Theory}\n}\n@inproceedings{dong2020expressivity,\n\ttitle        = {On the expressivity of neural networks for deep reinforcement learning},\n\tauthor       = {Dong, Kefan and Luo, Yuping and Yu, Tianhe and Finn, Chelsea and Ma, Tengyu},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2627--2637},\n\torganization = {PMLR}\n}\n@misc{dong2020provably,\n\ttitle        = {Provably Efficient Reinforcement Learning with Aggregated States},\n\tauthor       = {Shi Dong and Benjamin Van Roy and Zhengyuan Zhou},\n\tyear         = 2020,\n\teprint       = {1912.06366},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = 
{stat.ML}\n}\n@inproceedings{dong2020root,\n\ttitle        = {Root-n-Regret for Learning in {Markov} Decision Processes with Function Approximation and Low {Bellman} Rank},\n\tauthor       = {Dong, Kefan and Peng, Jian and Wang, Yining and Zhou, Yuan},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1554--1557},\n\torganization = {PMLR}\n}\n@article{dong2021provable,\n\ttitle        = {Provable Model-based Nonlinear Bandit and Reinforcement Learning: Shelve Optimism, Embrace Virtual Curvature},\n\tauthor       = {Dong, Kefan and Yang, Jiaqi and Ma, Tengyu},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.04168}\n}\n@inproceedings{donmez2008proactive,\n\ttitle        = {Proactive learning: cost-sensitive active learning with multiple imperfect oracles},\n\tauthor       = {Pinar Donmez and Jaime G Carbonell},\n\tyear         = 2008,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)},\n\tpages        = {619--628}\n}\n@article{donmez2010unsupervised,\n\ttitle        = {Unsupervised supervised learning {I}: Estimating classification and regression errors without labels},\n\tauthor       = {Pinar Donmez and Guy Lebanon and Krishnakumar Balasubramanian},\n\tyear         = 2010,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 11,\n\tpages        = {1323--1351}\n}\n@article{donoho06compressed,\n\ttitle        = {Compressed sensing},\n\tauthor       = {David Donoho},\n\tyear         = 2006,\n\tjournal      = {IEEE Trans. on Information Theory},\n\tvolume       = 52,\n\tnumber       = 4,\n\tpages        = {1289--1306}\n}\n@misc{donoho1982breakdown,\n\ttitle        = {Breakdown properties of multivariate location estimators},\n\tauthor       = {David L. 
Donoho},\n\tyear         = 1982,\n\thowpublished = {Ph.D.~qualifying paper},\n\tschool       = {Department of Statistics, Harvard University}\n}\n@article{donoho1992breakdown,\n\ttitle        = {Breakdown properties of location estimates based on halfspace depth and projected outlyingness},\n\tauthor       = {David L. Donoho and Miriam Gasko},\n\tyear         = 1992,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 20,\n\tnumber       = 4,\n\tpages        = {1803--1827}\n}\n@article{donoho2001uncertainty,\n\ttitle        = {Uncertainty principles and ideal atomic decomposition},\n\tauthor       = {Donoho, David L and Huo, Xiaoming},\n\tyear         = 2001,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 47,\n\tnumber       = 7,\n\tpages        = {2845--2862}\n}\n@inproceedings{donoho2004does,\n\ttitle        = {When does non-negative matrix factorization give a correct decomposition into parts?},\n\tauthor       = {Donoho, David and Stodden, Victoria},\n\tyear         = 2004,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1141--1148}\n}\n@article{donoho2006compressed,\n\ttitle        = {Compressed sensing},\n\tauthor       = {D. Donoho},\n\tyear         = 2006,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 52,\n\tnumber       = 4,\n\tpages        = {1289--1306}\n}\n@article{donoho95soft,\n\ttitle        = {De-noising by soft-thresholding},\n\tauthor       = {D. L. 
Donoho},\n\tyear         = 1995,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 41,\n\tpages        = {613--627}\n}\n@article{donti2020enforcing,\n\ttitle        = {Enforcing robust control guarantees within neural network policies},\n\tauthor       = {Donti, Priya L and Roderick, Melrose and Fazlyab, Mahyar and Kolter, J Zico},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.08105}\n}\n@article{dorazio2010review,\n\ttitle        = {A review of vision-based systems for soccer video analysis},\n\tauthor       = {Tiziana D'Orazio and Marco Leo},\n\tyear         = 2010,\n\tjournal      = {Pattern recognition},\n\tvolume       = 43,\n\tnumber       = 8,\n\tpages        = {2911--2926}\n}\n@article{dorfman2020offline,\n\ttitle        = {Offline Meta Reinforcement Learning},\n\tauthor       = {Ron Dorfman and Aviv Tamar},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.02598}\n}\n@techreport{dorfmullerulhaas2003robust,\n\ttitle        = {Robust Optical User Motion Tracking Using a Kalman Filter},\n\tauthor       = {Klaus Dorfm{\\\"u}ller-Ulhaas},\n\tyear         = 2003,\n\tmonth        = may,\n\taddress      = {Institut fuer Informatik, Universit{\\\"a}tsstr. 
2, 86159 Augsburg},\n\tnumber       = {2003-6}\n}\n@inproceedings{dosovitskiy2016generating,\n\ttitle        = {Generating Images with Perceptual Similarity Metrics based on Deep Networks},\n\tauthor       = {Alexey Dosovitskiy and Thomas Brox},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {658--666}\n}\n@inproceedings{dosovitskiy2016inverting,\n\ttitle        = {Inverting visual representations with convolutional networks},\n\tauthor       = {Alexey Dosovitskiy and Thomas Brox},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{dosovitskiy2017carla,\n\ttitle        = {CARLA: An Open Urban Driving Simulator},\n\tauthor       = {Alexey Dosovitskiy and German Ros and Felipe Codevilla and Antonio Lopez and Vladlen Koltun},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Robot Learning},\n\tpages        = {1--16}\n}\n@inproceedings{dostert1969rel,\n\ttitle        = {{REL}: A Rapidly Extensible Language System {I}},\n\tauthor       = {Bozena Dostert and Frederick B. Thompson},\n\tyear         = 1969,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{dostert1969rel2,\n\ttitle        = {{REL}: A Rapidly Extensible Language System {II}. {REL} {E}nglish},\n\tauthor       = {Bozena Dostert and Frederick B. 
Thompson},\n\tyear         = 1969,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{doucet2000sequential,\n\ttitle        = {On sequential {M}onte {C}arlo sampling methods for {B}ayesian filtering},\n\tauthor       = {Arnaud Doucet and Simon Godsill and Christophe Andrieu},\n\tyear         = 2000,\n\tjournal      = {Statistics and computing},\n\tvolume       = 10,\n\tnumber       = 3,\n\tpages        = {197--208}\n}\n@article{doucet2011tutorial,\n\ttitle        = {A tutorial on particle filtering and smoothing: fifteen years later},\n\tauthor       = {Doucet, Arnaud and Johansen, Adam M.},\n\tyear         = 2011,\n\tmonth        = dec,\n\tjournal      = {The Oxford Handbook of Nonlinear Filtering},\n\tbooktitle    = {OXFORD HANDBOOK OF NONLINEAR FILTERING},\n\tpages        = {4--6},\n\tabstract     = {Optimal estimation problems for non-linear {non-Gaussian} state-space models do not typically admit analytic solutions. Since their introduction in 1993, particle filtering methods have become a very popular class of algorithms to solve these estimation problems numerically in an online manner, i.e. recursively as observations become available, and are now routinely used in fields as diverse as computer vision, econometrics, robotics and navigation. The objective of this tutorial is to provide a complete, up-to-date survey of this field as of 2008. 
Basic and advanced particle methods for filtering as well as smoothing are presented.},\n\tciteulike-article-id = 9086845,\n\tciteulike-linkout-0 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.157.772},\n\tkeywords     = {algorithms},\n\tposted-at    = {2012-01-28 15:28:07},\n\tpriority     = 2\n}\n@inproceedings{douceur2002sybil,\n\ttitle        = {The Sybil Attack},\n\tauthor       = {Douceur, John R.},\n\tyear         = 2002,\n\tbooktitle    = {Revised Papers from the First International Workshop on Peer-to-Peer Systems},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {London, UK, UK},\n\tseries       = {IPTPS '01},\n\tpages        = {251--260},\n\tisbn         = {3-540-44179-4},\n\turl          = {http://dl.acm.org/citation.cfm?id=646334.687813},\n\tacmid        = 687813,\n\tnumpages     = 10\n}\n@article{dougherty1989nonnegativity,\n\ttitle        = {Nonnegativity-, monotonicity-, or convexity-preserving cubic and quintic {H}ermite interpolation},\n\tauthor       = {Dougherty, Randall L and Edelman, Alan S and Hyman, James M},\n\tyear         = 1989,\n\tjournal      = {Mathematics of Computation},\n\tvolume       = 52,\n\tnumber       = 186,\n\tpages        = {471--494}\n}\n@inproceedings{dozat2017stanford,\n\ttitle        = {Stanford's Graph-based Neural Dependency Parser at the CoNLL 2017 Shared Task},\n\tauthor       = {Timothy Dozat and Peng Qi and Christopher D Manning},\n\tyear         = 2017,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {20--30}\n}\n@article{dragan2013policy,\n\ttitle        = {A policy-blending formalism for shared control},\n\tauthor       = {Anca D Dragan and Siddhartha S Srinivasa},\n\tyear         = 2013,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 32,\n\tpages        = {790--805}\n}\n@inproceedings{DRCFU12,\n\ttitle        = {Spectral Dependency Parsing with Latent Variables},\n\tauthor       = {P. S. Dhillon and J. 
Rodu and M. Collins and D. P. Foster and L. H. Ungar},\n\tyear         = 2012,\n\tbooktitle    = {EMNLP-CoNLL}\n}\n@inproceedings{dreesen2012roots,\n\ttitle        = {Back to the roots: Polynomial system solving, linear algebra, systems theory},\n\tauthor       = {Philippe Dreesen and Kim Batselier and Bart De Moor},\n\tyear         = 2012,\n\tbooktitle    = {IFAC Symposium on System Identification (SYSID)},\n\tpages        = {1203--1208}\n}\n@inproceedings{dreossi2017compositional,\n\ttitle        = {Compositional Falsification of Cyber-Physical Systems with Machine Learning Components},\n\tauthor       = {T. Dreossi and Alexandre Donzé and S. Seshia},\n\tyear         = 2017,\n\tbooktitle    = {NFM}\n}\n@inproceedings{drineas2003pass,\n\ttitle        = {Pass efficient algorithms for approximating large matrices.},\n\tauthor       = {Drineas, Petros and Kannan, Ravi},\n\tyear         = 2003,\n\tbooktitle    = {SODA},\n\tvolume       = 3,\n\tpages        = {223--232}\n}\n@article{drineas2005nystrom,\n\ttitle        = {On the {Nystr{\\\"o}m} method for approximating a Gram matrix for improved kernel-based learning},\n\tauthor       = {Drineas, Petros and Mahoney, Michael W},\n\tyear         = 2005,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 6,\n\tpages        = {2153--2175}\n}\n@article{drineas2006fast,\n\ttitle        = {Fast Monte Carlo algorithms for matrices III: Computing a compressed approximate matrix decomposition},\n\tauthor       = {Drineas, Petros and Kannan, Ravi and Mahoney, Michael W},\n\tyear         = 2006,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 36,\n\tnumber       = 1,\n\tpages        = {184--206}\n}\n@inproceedings{drineas2006sampling,\n\ttitle        = {Sampling algorithms for l 2 regression and applications},\n\tauthor       = {Drineas, Petros and Mahoney, Michael W and Muthukrishnan, S},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the seventeenth annual ACM-SIAM symposium on Discrete algorithm},\n\tpages        = {1127--1136},\n\torganization = {Society for Industrial and Applied Mathematics}\n}\n@article{drineas2008relative,\n\ttitle        = {Relative-error {CUR} matrix decompositions},\n\tauthor       = {Drineas, Petros and Mahoney, Michael W and Muthukrishnan, S},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 30,\n\tnumber       = 2,\n\tpages        = {844--881}\n}\n@article{drineas2012fast,\n\ttitle        = {Fast approximation of matrix coherence and statistical leverage},\n\tauthor       = {Drineas, Petros and Magdon-Ismail, Malik and Mahoney, Michael W and Woodruff, David P},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 13,\n\tnumber       = 1,\n\tpages        = {3475--3506}\n}\n@inproceedings{drozdov2019unsupervised,\n\ttitle        = {Unsupervised Latent Tree Induction with Deep Inside-Outside Recursive Autoencoders},\n\tauthor       = {Andrew Drozdov and Pat Verga and Mohit Yadav and Mohit Iyyer and Andrew McCallum},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{drton2007algebraic,\n\ttitle        = {Algebraic factor analysis: tetrads, pentads and beyond},\n\tauthor       = {Drton, M. and Sturmfels, B. and Sullivant, S.},\n\tyear         = 2007,\n\tjournal      = {Probability Theory and Related Fields},\n\tpublisher    = {Springer},\n\tvolume       = 138,\n\tnumber       = 3,\n\tpages        = {463--493}\n}\n@book{drton2009lectures,\n\ttitle        = {Lectures on algebraic statistics},\n\tauthor       = {Mathias Drton and Bernd Sturmfels and Seth Sullivant},\n\tyear         = 2009,\n\tpublisher    = {Springer}\n}\n@inproceedings{druck08ge,\n\ttitle        = {Learning from Labeled Features using Generalized Expectation Criteria},\n\tauthor       = {Gregory Druck and Gideon Mann and Andrew McCallum},\n\tyear         = 2008,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {595--602}\n}\n@inproceedings{druck2009active,\n\ttitle        = {Active learning by labeling features},\n\tauthor       = {Gregory Druck and Burr Settles and Andrew McCallum},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {81--90}\n}\n@inproceedings{DS,\n\ttitle        = {Uncertainty principles and signal recovery},\n\tauthor       = {D. Donoho and P. Stark},\n\tyear         = 1999,\n\tbooktitle    = {SIAM J. on Appl. 
Math},\n\tpages        = {906--931}\n}\n@article{DS07,\n\ttitle        = {A Probabilistic Analysis of {EM} for Mixtures of Separated, Spherical {G}aussians},\n\tauthor       = {S. Dasgupta and L. Schulman},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 8,\n\tnumber       = {Feb},\n\tpages        = {203--226}\n}\n@inproceedings{ds16,\n\ttitle        = {Complexity theoretic limitations on learning DNF’s},\n\tauthor       = {Daniely, Amit and Shalev-Shwartz, Shai},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {815--830}\n}\n@article{dtmr18,\n\ttitle        = {Safely Learning to Control the Constrained Linear Quadratic Regulator},\n\tauthor       = {Dean, Sarah and Tu, Stephen and Matni, Nikolai and Recht, Benjamin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.10121}\n}\n@inproceedings{du17stochastic,\n\ttitle        = {Stochastic Variance Reduction Methods for Policy Evaluation},\n\tauthor       = {Simon S. Du and Jianshu Chen and Lihong Li and Lin Xiao and Dengyong Zhou},\n\tyear         = 2017,\n\tmonth        = {06--11 Aug},\n\tbooktitle    = {Proceedings of the 34th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 70,\n\tpages        = {1049--1058},\n\turl          = {http://proceedings.mlr.press/v70/du17a.html},\n\teditor       = {Precup, Doina and Teh, Yee Whye},\n\tpdf          = {http://proceedings.mlr.press/v70/du17a/du17a.pdf},\n\tabstract     = {Policy evaluation is concerned with estimating the value function that predicts long-term values of states under a given policy. It is a crucial step in many reinforcement-learning algorithms. In this paper, we focus on policy evaluation with linear function approximation over a fixed dataset. 
We first transform the empirical policy evaluation problem into a (quadratic) convex-concave saddle-point problem, and then present a primal-dual batch gradient method, as well as two stochastic variance reduction methods for solving the problem. These algorithms scale linearly in both sample size and feature dimension. Moreover, they achieve linear convergence even when the saddle-point problem has only strong concavity in the dual variables but no strong convexity in the primal variables. Numerical experiments on benchmark problems demonstrate the effectiveness of our methods.}\n}\n@article{du2017gradient,\n\ttitle        = {Gradient Descent Can Take Exponential Time to Escape Saddle Points},\n\tauthor       = {Du, Simon S and Jin, Chi and Lee, Jason D and Jordan, Michael I and Singh, Aarti and Poczos, Barnabas},\n\tyear         = 2017,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1067--1077}\n}\n@inproceedings{du2017hypothesis,\n\ttitle        = {Hypothesis Transfer Learning via Transformation Functions},\n\tauthor       = {Du, Simon S and Koushik, Jayanth and Singh, Aarti and Poczos, Barnabas},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 30,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2017/file/352fe25daf686bdb4edca223c921acea-Paper.pdf},\n\teditor       = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. 
Garnett}\n}\n@inproceedings{du2017power,\n\ttitle        = {On the Power of Truncated {SVD} for General High-rank Matrix Estimation Problems},\n\tauthor       = {Du, Simon S and Wang, Yining and Singh, Aarti},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 30,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2017/file/89f0fd5c927d466d6ec9a21b9ac34ffa-Paper.pdf},\n\teditor       = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett}\n}\n@article{du2017sparse,\n\ttitle        = {Computationally efficient robust estimation of sparse functionals},\n\tauthor       = {Simon S. Du and Sivaraman Balakrishnan and Aarti Singh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.07709}\n}\n@article{du2017spurious,\n\ttitle        = {Gradient Descent Learns One-hidden-layer {CNN}: Don't be Afraid of Spurious Local Minima},\n\tauthor       = {Du, Simon S and Lee, Jason D and Tian, Yuandong and Poczos, Barnabas and Singh, Aarti},\n\tyear         = 2017,\n\tjournal      = {Proceedings of the 35th International Conference on Machine Learning},\n\tpages        = {1339--1348}\n}\n@article{du2017stochastic,\n\ttitle        = {Stochastic variance reduction methods for policy evaluation},\n\tauthor       = {Du, Simon S and Chen, Jianshu and Li, Lihong and Xiao, Lin and Zhou, Dengyong},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.07944}\n}\n@inproceedings{du2018algorithmic,\n\ttitle        = {Algorithmic regularization in learning deep homogeneous models: Layers are automatically balanced},\n\tauthor       = {Du, Simon S and Hu, Wei and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = 
{384--395}\n}\n@article{du2018gradient,\n\ttitle        = {Gradient descent finds global minima of deep neural networks},\n\tauthor       = {Du, Simon S and Lee, Jason D and Li, Haochuan and Wang, Liwei and Zhai, Xiyu},\n\tyear         = 2018,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1811.03804},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1675--1685},\n\torganization = {PMLR}\n}\n@inproceedings{du2018how,\n\ttitle        = {How Many Samples are Needed to Estimate a Convolutional Neural Network?},\n\tauthor       = {Du, Simon S and Wang, Yining and Zhai, Xiyu and Balakrishnan, Sivaraman and Salakhutdinov, Russ R and Singh, Aarti},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 31,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2018/file/03c6b06952c750899bb03d998e631860-Paper.pdf},\n\teditor       = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett}\n}\n@article{du2018improved,\n\ttitle        = {Improved Learning of One-hidden-layer Convolutional Neural Networks with Overlaps},\n\tauthor       = {S. Du and Surbhi Goel},\n\tyear         = 2018,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1805.07798}\n}\n@inproceedings{du2018linear,\n\ttitle        = {Linear Convergence of the Primal-Dual Gradient Method for Convex-Concave Saddle Point Problems without Strong Convexity},\n\tauthor       = {Du, Simon S. 
and Hu, Wei},\n\tyear         = 2019,\n\tmonth        = {16--18 Apr},\n\tbooktitle    = {Proceedings of the Twenty-Second International Conference on Artificial Intelligence and Statistics},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 89,\n\tpages        = {196--205},\n\turl          = {http://proceedings.mlr.press/v89/du19b.html},\n\teditor       = {Chaudhuri, Kamalika and Sugiyama, Masashi},\n\tpdf          = {http://proceedings.mlr.press/v89/du19b/du19b.pdf},\n\tabstract     = {We consider the convex-concave saddle point problem $\\min_{x}\\max_{y} f(x)+y^\\top A x-g(y)$ where $f$ is smooth and convex and $g$ is smooth and strongly convex. We prove that if the coupling matrix $A$ has full column rank, the vanilla primal-dual gradient method can achieve linear convergence even if $f$ is not strongly convex. Our result generalizes previous work which either requires $f$ and $g$ to be quadratic functions or requires proximal mappings for both $f$ and $g$. We adopt a novel analysis technique that in each iteration uses a \"ghost\" update as a reference, and show that the iterates in the primal-dual gradient method converge to this \"ghost\" sequence. Using the same technique we further give an analysis for the primal-dual stochastic variance reduced gradient method for convex-concave saddle point problems with a finite-sum structure.}\n}\n@article{du2018power,\n\ttitle        = {On the Power of Over-parametrization in Neural Networks with Quadratic Activation},\n\tauthor       = {Du, Simon S and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {International Conference on Machine Learning (ICML)}\n}\n@article{du2018robust,\n\ttitle        = {Robust Nonparametric Regression under Huber's epsilon-contamination Model},\n\tauthor       = {S. Du and Y. Wang and Sivaraman Balakrishnan and Pradeep Ravikumar and A. 
Singh},\n\tyear         = 2018,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1805.10406}\n}\n@inproceedings{du2018when,\n\ttitle        = {When is a Convolutional Filter Easy to Learn?},\n\tauthor       = {Simon S. Du and Jason D. Lee and Yuandong Tian},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=SkA-IE06W}\n}\n@article{du2019continuous,\n\ttitle        = {Continuous Control with Contexts, Provably},\n\tauthor       = {Du, Simon S and Wang, Ruosong and Wang, Mengdi and Yang, Lin F},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.13614}\n}\n@inproceedings{du2019decoding,\n\ttitle        = {Provably efficient RL with rich observations via latent state decoding},\n\tauthor       = {Du, Simon and Krishnamurthy, Akshay and Jiang, Nan and Agarwal, Alekh and Dudik, Miroslav and Langford, John},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1665--1674},\n\torganization = {PMLR}\n}\n@inproceedings{du2019dsec,\n\ttitle        = {Provably efficient {Q}-learning with function approximation via distribution shift error checking oracle},\n\tauthor       = {Du, Simon S and Luo, Yuping and Wang, Ruosong and Zhang, Hanrui},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {8058--8068}\n}\n@inproceedings{du2019good,\n\ttitle        = {Is a Good Representation Sufficient for Sample Efficient Reinforcement Learning?},\n\tauthor       = {Du, Simon S and Kakade, Sham M and Wang, Ruosong and Yang, Lin F},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@inproceedings{du2019graph,\n\ttitle        = {Graph Neural Tangent Kernel: Fusing Graph Neural Networks with Graph Kernels},\n\tauthor       = {Du, Simon S and Hou, Kangcheng and Salakhutdinov, Russ R and Poczos, Barnabas and Wang, 
Ruosong and Xu, Keyulu},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 32,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2019/file/663fd3c5144fd10bd5ca6611a9a5b92d-Paper.pdf},\n\teditor       = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett}\n}\n@inproceedings{du2019width,\n\ttitle        = {Width provably matters in optimization for deep linear neural networks},\n\tauthor       = {Du, Simon and Hu, Wei},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1655--1664},\n\torganization = {PMLR}\n}\n@article{du2020agnostic,\n\ttitle        = {Agnostic Q-learning with function approximation in deterministic systems: Tight bounds on approximation error and sample complexity},\n\tauthor       = {Du, Simon S and Lee, Jason D and Mahajan, Gaurav and Wang, Ruosong},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@article{du2020few,\n\ttitle        = {Few-shot learning via learning the representation, provably},\n\tauthor       = {Du, Simon S and Hu, Wei and Kakade, Sham M and Lee, Jason D and Lei, Qi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.09434}\n}\n@article{du2020fewshot,\n\ttitle        = {Few-Shot Learning via Learning the Representation, Provably},\n\tauthor       = {Simon S. Du and Wei Hu and Sham M. Kakade and Jason D. 
Lee and Qi Lei},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@article{du2020particle,\n\ttitle        = {When is Particle Filtering Efficient for POMDP Sequential Planning?},\n\tauthor       = {Du, Simon S and Hu, Wei and Li, Zhiyuan and Shen, Ruoqi and Song, Zhao and Wu, Jiajun},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.05975}\n}\n@article{du2021bilinear,\n\ttitle        = {Bilinear Classes: A Structural Framework for Provable Generalization in RL},\n\tauthor       = {Du, Simon S and Kakade, Sham M and Lee, Jason D and Lovett, Shachar and Mahajan, Gaurav and Sun, Wen and Wang, Ruosong},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.10897}\n}\n@inproceedings{du2021fewshot,\n\ttitle        = {Few-Shot Learning via Learning the Representation, Provably},\n\tauthor       = {Simon Shaolei Du and Wei Hu and Sham M. Kakade and Jason D. Lee and Qi Lei},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=pW2Q2xLwIMD}\n}\n@inproceedings{dua2019drop,\n\ttitle        = {{DROP}: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},\n\tauthor       = {Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@misc{dua2019uci,\n\ttitle        = {{UCI} Machine Learning Repository},\n\tauthor       = {Dua, Dheeru and Graff, Casey},\n\tyear         = 2017,\n\turl          = {http://archive.ics.uci.edu/ml},\n\tinstitution  = {University of California, Irvine, School of Information and Computer Sciences}\n}\n@inproceedings{duan2012discovering,\n\ttitle        = {Discovering localized attributes for fine-grained recognition},\n\tauthor       = {Kun Duan and Devi Parikh and David Crandall and Kristen Grauman},\n\tyear         = 2012,\n\tbooktitle 
   = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3474--3481}\n}\n@inproceedings{duan2016benchmarking,\n\ttitle        = {Benchmarking deep reinforcement learning for continuous control},\n\tauthor       = {Duan, Yan and Chen, Xi and Houthooft, Rein and Schulman, John and Abbeel, Pieter},\n\tyear         = 2016,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {1329--1338},\n\torganization = {PMLR}\n}\n@article{duan2016generating,\n\ttitle        = {Generating disambiguating paraphrases for structurally ambiguous sentences},\n\tauthor       = {Manjuan Duan and Ethan Hill and Michael White},\n\tyear         = 2016,\n\tjournal      = {Proceedings of 10th Linguistic Annotation Workshop}\n}\n@article{duan2016rl,\n\ttitle        = {{RL}$^2$: Fast reinforcement learning via slow reinforcement learning},\n\tauthor       = {Yan Duan and John Schulman and Xi Chen and Peter L Bartlett and Ilya Sutskever and Pieter Abbeel},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.02779}\n}\n@article{duan2017one,\n\ttitle        = {One-Shot Imitation Learning},\n\tauthor       = {Y. Duan and M. Andrychowicz and B. C. Stadie and J. Ho and J. Schneider and I. Sutskever and P. Abbeel and W. 
Zaremba},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.07326}\n}\n@article{duan2018adaptive,\n\ttitle        = {Adaptive Low-Nonnegative-Rank Approximation for State Aggregation of Markov Chains},\n\tauthor       = {Duan, Yaqi and Wang, Mengdi and Wen, Zaiwen and Yuan, Yaxiang},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.06032},\n\tpublisher    = {SIAM}\n}\n@article{duan2018state,\n\ttitle        = {State Aggregation Learning from Markov Transition Data},\n\tauthor       = {Duan, Yaqi and Ke, Zheng Tracy and Wang, Mengdi},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.02619}\n}\n@article{DuanPettie2014,\n\ttitle        = {{Linear-Time Approximation for Maximum Weight Matching}},\n\tauthor       = {Duan, Ran and Pettie, Seth},\n\tyear         = 2014,\n\tmonth        = jan,\n\tjournal      = {Journal of the ACM},\n\tvolume       = 61,\n\tnumber       = 1,\n\tpages        = {1--23},\n\tdoi          = {10.1145/2529989},\n\tissn         = {00045411},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/3492dea6a70b4a1339999fc8ae8e26be784d1cb1.pdf:pdf},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@misc{duchi_notes,\n\ttitle        = {Lecture Notes for Statistics 311/Electrical Engineering 377},\n\tauthor       = {Duchi, John},\n\tyear         = 2019,\n\tmonth        = {March},\n\tpublisher    = {Stanford University}\n}\n@inproceedings{duchi10adagrad,\n\ttitle        = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},\n\tauthor       = {John Duchi and Elad Hazan and Yoram Singer},\n\tyear         = 2010,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{duchi2008projections,\n\ttitle        = {Efficient Projections onto the l1-Ball for Learning in High Dimensions},\n\tauthor       = {J. Duchi and S. Shalev-Shwartz and Y. Singer and T. 
Chandra},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{Duchi2010,\n\ttitle        = {{Composite Objective Mirror Descent}},\n\tauthor       = {Duchi, John and {Shalev-Shwartz}, Shai and Singer, Yoram and Tewari, Ambuj},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 23rd Annual Conference on Learning Theory - COLT '10},\n\tnumber       = 1,\n\tabstract     = {We present a new method for regularized convex optimization and analyze it under both online and stochastic optimization settings. In addition to unifying previously known ﬁrstorder algorithms, such as the projected gradient method, mirror descent, and forwardbackward splitting, our method yields new analysis and algorithms. We also derive speciﬁc instantiations of our method for commonly used regularization functions, such as ℓ1, mixed norm, and trace-norm.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Duchi et al. - 2010 - Composite Objective Mirror Descent.pdf:pdf},\n\tkeywords     = {Learning/Statistics \\& Optimisation,Theory \\& Algorithms},\n\tmendeley-groups = {Optimization/Gradient Descent Theory/Composite}\n}\n@article{duchi2011adaptive,\n\ttitle        = {Adaptive subgradient methods for online learning and stochastic optimization},\n\tauthor       = {Duchi, John and Hazan, Elad and Singer, Yoram},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 12,\n\tnumber       = {Jul},\n\tpages        = {2121--2159}\n}\n@inproceedings{duchi2013local,\n\ttitle        = {Local Privacy and Statistical Minimax Rates},\n\tauthor       = {John C. Duchi and Michael I. Jordan and Martin J. 
Wainwright},\n\tyear         = 2013,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{duchi2013optimal,\n\ttitle        = {Optimal rates for zero-order optimization: the power of two function evaluations},\n\tauthor       = {Duchi, John C and Jordan, Michael I and Wainwright, Martin J and Wibisono, Andre},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.2139}\n}\n@article{duchi2014optimality,\n\ttitle        = {Optimality guarantees for distributed statistical estimation},\n\tauthor       = {Duchi, John C and Jordan, Michael I and Wainwright, Martin J and Zhang, Yuchen},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1405.0782}\n}\n@article{duchi2015optimal,\n\ttitle        = {Optimal rates for zero-order convex optimization: The power of two function evaluations},\n\tauthor       = {Duchi, John C and Jordan, Michael I and Wainwright, Martin J and Wibisono, Andre},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {IEEE},\n\tvolume       = 61,\n\tnumber       = 5,\n\tpages        = {2788--2806}\n}\n@article{duchi2016,\n\ttitle        = {Statistics of Robust Optimization: A Generalized Empirical Likelihood Approach},\n\tauthor       = {John Duchi and Peter Glynn and Hongseok Namkoong},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@misc{duchi2019distributionally,\n\ttitle        = {Distributionally Robust Losses Against Mixture Covariate Shifts},\n\tauthor       = {John Duchi and Tatsunori Hashimoto and Hongseok Namkoong},\n\tyear         = 2019,\n\thowpublished = {\\url{https://cs.stanford.edu/~thashim/assets/publications/condrisk.pdf}}\n}\n@article{duchi2021learning,\n\ttitle        = {Learning Models with Uniform Performance via Distributionally Robust Optimization},\n\tauthor       = {John Duchi and Hongseok Namkoong},\n\tyear         = 2021,\n\tjournal      = {Annals of Statistics}\n}\n@inproceedings{DuchiSSC08projection,\n\ttitle        = 
{Efficient projections onto the \\emph{l}\\({}_{\\mbox{1}}\\)-ball for learning in high dimensions},\n\tauthor       = {John C. Duchi and Shai Shalev{-}Shwartz and Yoram Singer and Tushar Chandra},\n\tyear         = 2008,\n\tbooktitle    = {Machine Learning, Proceedings of the Twenty-Fifth International Conference {(ICML} 2008), Helsinki, Finland, June 5-9, 2008},\n\tpages        = {272--279},\n\tdoi          = {10.1145/1390156.1390191},\n\turl          = {http://doi.acm.org/10.1145/1390156.1390191},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/DuchiSSC08},\n\tcrossref     = {DBLP:conf/icml/2008},\n\ttimestamp    = {Sat, 21 Jan 2012 17:47:23 +0100},\n\tbdsk-url-1   = {http://doi.acm.org/10.1145/1390156.1390191},\n\tbdsk-url-2   = {http://dx.doi.org/10.1145/1390156.1390191}\n}\n@inproceedings{duclaye2003learning,\n\ttitle        = {Learning paraphrases to improve a question-answering system},\n\tauthor       = {Florence Duclaye and François Yvon and Olivier Collin},\n\tyear         = 2003,\n\tbooktitle    = {Workshop on Natural Language Processing for Question Answering},\n\tpages        = {35--41}\n}\n@article{dudik07maxent,\n\ttitle        = {Maximum Entropy Density Estimation},\n\tauthor       = {Miroslav Dudík and Steven J. Phillips and Robert E. 
Schapire},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 8,\n\tpages        = {1217--1260}\n}\n@article{dudik2011efficient,\n\ttitle        = {Efficient optimal learning for contextual bandits},\n\tauthor       = {Dudik, Miroslav and Hsu, Daniel and Kale, Satyen and Karampatziakis, Nikos and Langford, John and Reyzin, Lev and Zhang, Tong},\n\tyear         = 2011,\n\tjournal      = {arXiv preprint arXiv:1106.2369}\n}\n@article{dudley1967sizes,\n\ttitle        = {The sizes of compact subsets of {H}ilbert space and continuity of {G}aussian processes},\n\tauthor       = {Richard M. Dudley},\n\tyear         = 1967,\n\tjournal      = {Journal of Functional Analysis},\n\tvolume       = 1,\n\tnumber       = 3,\n\tpages        = {290--330}\n}\n@phdthesis{duff2002optimal,\n\ttitle        = {Optimal Learning: Computational procedures for Bayes-adaptive Markov decision processes},\n\tauthor       = {Michael O'Gordon Duff},\n\tyear         = 2002,\n\tschool       = {University of Massachusetts Amherst}\n}\n@inproceedings{dumais1994latent,\n\ttitle        = {Latent Semantic Indexing ({LSI}) and {TREC}-2},\n\tauthor       = {Susan T. Dumais},\n\tyear         = 1994,\n\tmonth        = mar,\n\tbooktitle    = {The Second Text Retrieval Conference (TREC-2)},\n\tpublisher    = {NIST},\n\taddress      = {Gaithersburg, MD},\n\tpages        = {105--115},\n\tnote         = {Special publication 500-215},\n\teditor       = {D. K. 
Harman},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{dunn2017searchqa,\n\ttitle        = {{SearchQA}: A New {Q}\\&{A} Dataset Augmented with Context from a Search Engine},\n\tauthor       = {Matthew Dunn and Levent Sagun and Mike Higgins and Ugur Guney and Volkan Cirik and Kyunghyun Cho},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@book{durbin,\n\ttitle        = {Biological Sequence Analysis: Probabilistic Models of Proteins and Nucleic Acids},\n\tauthor       = {R. Durbin and S. R. Eddy and A. Krogh and G. Mitchison},\n\tyear         = 1999,\n\tpublisher    = {Cambridge University Press}\n}\n@article{durrant2006simultaneous,\n\ttitle        = {Simultaneous localization and mapping: part I},\n\tauthor       = {Durrant-Whyte, Hugh and Bailey, Tim},\n\tyear         = 2006,\n\tjournal      = {Robotics \\& Automation Magazine, IEEE},\n\tpublisher    = {IEEE},\n\tvolume       = 13,\n\tnumber       = 2,\n\tpages        = {99--110},\n\tdate-added   = {2016-04-04 17:35:36 +0000},\n\tdate-modified = {2016-04-04 17:35:36 +0000}\n}\n@inproceedings{durrett2015neural,\n\ttitle        = {Neural {CRF} parsing},\n\tauthor       = {Greg Durrett and Dan Klein},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{dusek2017referenceless,\n\ttitle        = {Referenceless Quality Estimation for Natural Language Generation},\n\tauthor       = {Ondrej Dusek and Jekaterina Novikova and Verena Rieser},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{dutta2020ista,\n\ttitle        = {Is There a Trade-Off Between Fairness and Accuracy? A Perspective Using Mismatched Hypothesis Testing},\n\tauthor       = {Sanghamitra Dutta and Dennis Wei and Hazar Yueksel and Pin-Yu Chen and Sijia Liu and Kush R. 
Varshney},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{duvallet2014inferring,\n\ttitle        = {Inferring Maps and Behaviors from Natural Language Instructions},\n\tauthor       = {F. Duvallet and M. R. Walter and T. Howard and S. Hemachandra and J. Oh and S. Teller and N. Roy and A. Stentz},\n\tyear         = 2014,\n\tbooktitle    = {International Symposium on Experimental Robotics (ISER)}\n}\n@article{dvijotham2018dual,\n\ttitle        = {A Dual Approach to Scalable Verification of Deep Networks},\n\tauthor       = {Krishnamurthy Dvijotham and Robert Stanforth and Sven Gowal and Timothy Mann and Pushmeet Kohli},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.06567}\n}\n@article{dvijotham2018training,\n\ttitle        = {Training verified learners with learned verifiers},\n\tauthor       = {Krishnamurthy Dvijotham and Sven Gowal and Robert Stanforth and Relja Arandjelovic and Brendan O'Donoghue and Jonathan Uesato and Pushmeet Kohli},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.10265}\n}\n@article{dvzeroski2004combining,\n\ttitle        = {Is combining classifiers with stacking better than selecting the best one?},\n\tauthor       = {Saso D{\\v{z}}eroski and Bernard {\\v{Z}}enko},\n\tyear         = 2004,\n\tjournal      = {Machine learning},\n\tvolume       = 54,\n\tnumber       = 3,\n\tpages        = {255--273}\n}\n@inproceedings{dwork2006calibrating,\n\ttitle        = {Calibrating noise to sensitivity in private data analysis},\n\tauthor       = {Cynthia Dwork and Frank McSherry and Kobbi Nissim and Adam Smith},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 3rd Theory of Cryptography Conference},\n\tpages        = {265--284}\n}\n@inproceedings{dwork2006differential,\n\ttitle        = {Differential privacy},\n\tauthor       = {Cynthia Dwork},\n\tyear         = 2006,\n\tbooktitle    = {Automata, languages and programming},\n\tpages  
      = {1--12}\n}\n@inproceedings{dwork2012,\n\ttitle        = {Fairness through awareness},\n\tauthor       = {Cynthia Dwork and Moritz Hardt and Toniann Pitassi and Omer Reingold and Rich Zemel},\n\tyear         = 2012,\n\tbooktitle    = {Innovations in Theoretical Computer Science (ITCS)},\n\tpages        = {214--226}\n}\n@inproceedings{dwork2012fairness,\n\ttitle        = {Fairness through awareness},\n\tauthor       = {Dwork, Cynthia and Hardt, Moritz and Pitassi, Toniann and Reingold, Omer and Zemel, Richard},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 3rd innovations in theoretical computer science conference},\n\tpages        = {214--226},\n\torganization = {ACM}\n}\n@inproceedings{dwork2018decoupled,\n\ttitle        = {Decoupled classifiers for group-fair and efficient machine learning},\n\tauthor       = {Cynthia Dwork and Nicole Immorlica and Adam Tauman Kalai and Max Leiserson},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Fairness, Accountability and Transparency},\n\tpages        = {119--133}\n}\n@inproceedings{dwork2019learning,\n\ttitle        = {Learning from outcomes: Evidence-based rankings},\n\tauthor       = {Dwork, Cynthia and Kim, Michael P and Reingold, Omer and Rothblum, Guy N and Yona, Gal},\n\tyear         = 2019,\n\tbooktitle    = {2019 IEEE 60th Annual Symposium on Foundations of Computer Science (FOCS)},\n\tpages        = {106--125},\n\torganization = {IEEE}\n}\n@article{dwork2021outcome,\n\ttitle        = {Outcome Indistinguishability},\n\tauthor       = {Dwork, Cynthia and Kim, Michael P and Reingold, Omer and Rothblum, Guy N and Yona, Gal},\n\tyear         = 2021,\n\tjournal      = {STOC}\n}\n@inproceedings{DworkTTZ2014-onlineEV,\n\ttitle        = {Analyze gauss: optimal bounds for privacy-preserving principal component analysis},\n\tauthor       = {Dwork, Cynthia and Talwar, Kunal and Thakurta, Abhradeep and Zhang, Li},\n\tyear         = 2014,\n\tbooktitle    = {STOC},\n\tpages        = 
{11--20},\n\torganization = {ACM}\n}\n@inproceedings{dwzbss18,\n\ttitle        = {How Many Samples are Needed to Learn a Convolutional Neural Network?},\n\tauthor       = {Du, Simon S and Wang, Yining and Zhai, Xiyu and Balakrishnan, Sivaraman and Salakhutdinov, Ruslan and Singh, Aarti},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {arXiv preprint arXiv:1805.07883}\n}\n@article{dyer2014notes,\n\ttitle        = {Notes on noise contrastive estimation and negative sampling},\n\tauthor       = {Dyer, Chris},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1410.8251}\n}\n@inproceedings{dyer2015transition,\n\ttitle        = {Transition-based dependency parsing with stack long short-term memory},\n\tauthor       = {Chris Dyer and Miguel Ballesteros and Wang Ling and Austin Matthews and Noah A Smith},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{dyer2016recurrent,\n\ttitle        = {Recurrent neural network grammars},\n\tauthor       = {Chris Dyer and Adhiguna Kuncoro and Miguel Ballesteros and Noah A Smith},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{dyrka07pcfg,\n\ttitle        = {A probabilistic context-free grammar for the detection of binding sites from a protein sequence},\n\tauthor       = {Witold Dyrka and Jean-Christophe Nebel},\n\tyear         = 2007,\n\tjournal      = {Systems Biology, Bioinformatics and Synthetic Biology},\n\tvolume       = 1,\n\tpages        = {78--79}\n}\n@article{dziugaite2017computing,\n\ttitle        = {Computing nonvacuous generalization bounds for deep (stochastic) neural networks with many more parameters than training data},\n\tauthor       = {Dziugaite, Gintare Karolina and Roy, Daniel M},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.11008}\n}\n@article{dzps18,\n\ttitle        = 
{{Gradient Descent Provably Optimizes Over-parameterized Neural Networks}},\n\tauthor       = {Simon S. Du and Xiyu Zhai and Barnabas Poczos and Aarti Singh},\n\tyear         = 2018,\n\tjournal      = {ArXiv e-prints},\n\tarchiveprefix = {arXiv},\n\teprint       = {1810.02054},\n\tprimaryclass = {cs.LG}\n}\n@misc{e2lsh,\n\ttitle        = {{E2LSH}},\n\tauthor       = {Alexandr Andoni},\n\tyear         = 2004,\n\thowpublished = {\\url{http://www.mit.edu/~andoni/LSH/}}\n}\n@article{e90,\n\ttitle        = {Finding structure in time},\n\tauthor       = {Elman, Jeffrey L},\n\tyear         = 1990,\n\tjournal      = {Cognitive science},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 14,\n\tnumber       = 2,\n\tpages        = {179--211}\n}\n@inproceedings{EA,\n\ttitle        = {Image denoising via sparse and redundant representations over learned dictionaries},\n\tauthor       = {M. Elad and M. Aharon},\n\tyear         = 2006,\n\tbooktitle    = {IEEE Trans. on Signal Processing},\n\tpages        = {3736--3745}\n}\n@inproceedings{EAH,\n\ttitle        = {Method of optimal directions for frame design},\n\tauthor       = {K. Engan and S. Aase and J. 
Hakon-Husoy},\n\tyear         = 1999,\n\tbooktitle    = {ICASSP},\n\tpages        = {2443--2446}\n}\n@article{earl2005parallel,\n\ttitle        = {Parallel tempering: Theory, applications, and new perspectives},\n\tauthor       = {David J Earl and Michael W Deem},\n\tyear         = 2005,\n\tjournal      = {Physical Chemistry Chemical Physics},\n\tvolume       = 7,\n\tnumber       = 23,\n\tpages        = {3910--3916}\n}\n@inproceedings{ebrahimi2018adversarial,\n\ttitle        = {On Adversarial Examples for Character-Level Neural Machine Translation},\n\tauthor       = {Javid Ebrahimi and Daniel Lowd and Dejing Dou},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{ebrahimi2018hotflip,\n\ttitle        = {Hotflip: White-box adversarial examples for text classification},\n\tauthor       = {Javid Ebrahimi and Anyi Rao and Daniel Lowd and Dejing Dou},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{eckart1936approximation,\n\ttitle        = {The approximation of one matrix by another of lower rank},\n\tauthor       = {Eckart, Carl and Young, Gale},\n\tyear         = 1936,\n\tjournal      = {Psychometrika},\n\tpublisher    = {Springer},\n\tvolume       = 1,\n\tnumber       = 3,\n\tpages        = {211--218}\n}\n@inproceedings{eckhard2011global,\n\ttitle        = {On the global convergence of identification of output error models},\n\tauthor       = {Eckhard, Diego and Bazanella, Alexandre Sanfelice},\n\tyear         = 2011,\n\tbooktitle    = {Proc.~$18$th IFAC World congress}\n}\n@article{ecoffet2019go,\n\ttitle        = {Go-Explore: a New Approach for Hard-Exploration Problems},\n\tauthor       = {Adrien Ecoffet and Joost Huizinga and Joel Lehman and Kenneth O Stanley and Jeff Clune},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.10995}\n}\n@article{economos1982rate,\n\ttitle        = {Rate of aging, rate of dying and 
the mechanism of mortality},\n\tauthor       = {Angelos C Economos},\n\tyear         = 1982,\n\tjournal      = {Archives of Gerontology and Geriatrics},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {3--27}\n}\n@article{ECP1624,\n\ttitle        = {Freedman's inequality for matrix martingales},\n\tauthor       = {Joel Tropp},\n\tyear         = 2011,\n\tjournal      = {Electron. Commun. Probab.},\n\tvolume       = 16,\n\tpages        = {no. 25, 262--270},\n\tdoi          = {10.1214/ECP.v16-1624},\n\tissn         = {1083-589X},\n\turl          = {http://ecp.ejpecp.org/article/view/1624},\n\tfjournal     = {Electronic Communications in Probability},\n\tkeywords     = {Discrete-time martingale, large deviation, probability inequality, random matrix},\n\tabstract     = {Freedman's inequality is a martingale counterpart to Bernstein's inequality. This result shows that the large-deviation behavior of a martingale is controlled by the predictable quadratic variation and a uniform upper bound for the martingale difference sequence. Oliveira has recently established a natural extension of Freedman's inequality that provides tail bounds for the maximum singular value of a matrix-valued martingale. This note describes a different proof of the matrix Freedman inequality that depends on a deep theorem of Lieb from matrix analysis. This argument delivers sharp constants in the matrix Freedman inequality, and it also yields tail bounds for other types of matrix martingales. 
The new techniques are adapted from recent work by the present author.}\n}\n@inproceedings{edizel2019misspelling,\n\ttitle        = {Misspelling Oblivious Word Embeddings},\n\tauthor       = {Bora Edizel and Aleksandra Piktus and Piotr Bojanowski and Rui Ferreira and Edouard Grave and Fabrizio Silvestri},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{edmonds1965maximum,\n\ttitle        = {Maximum matching and a polyhedron with 0,1-vertices},\n\tauthor       = {Edmonds, Jack},\n\tyear         = 1965,\n\tjournal      = {Journal of Research of the National Bureau of Standards--B}\n}\n@article{edmonds1972theoretical,\n\ttitle        = {Theoretical improvements in algorithmic efficiency for network flow problems},\n\tauthor       = {Jack Edmonds and Richard M. Karp},\n\tyear         = 1972,\n\tjournal      = {Journal of the ACM (JACM)},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {248--264}\n}\n@article{edmonds2002near,\n\ttitle        = {Near-synonymy and lexical choice},\n\tauthor       = {Philip Edmonds and Graeme Hirst},\n\tyear         = 2002,\n\tjournal      = {Computational Linguistics}\n}\n@inproceedings{edunov2018understanding,\n\ttitle        = {Understanding back-translation at scale},\n\tauthor       = {Sergey Edunov and Myle Ott and Michael Auli and David Grangier},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{edwards2019imitating,\n\ttitle        = {Imitating latent policies from observation},\n\tauthor       = {Ashley Edwards and Himanshu Sahni and Yannick Schroecker and Charles Isbell},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1755--1763}\n}\n@article{efron1975efficiency,\n\ttitle        = {The efficiency of logistic regression compared to normal discriminant analysis},\n\tauthor       = {Bradley Efron},\n\tyear     
    = 1975,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 70,\n\tnumber       = 352,\n\tpages        = {892--898}\n}\n@article{efron1979bootstrap,\n\ttitle        = {Bootstrap Methods: Another Look at the Jackknife},\n\tauthor       = {Brad Efron},\n\tyear         = 1979,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 7\n}\n@inproceedings{efstathiou2014catan,\n\ttitle        = {Learning Non-Cooperative Dialogue Behaviours},\n\tauthor       = {Ioannis Efstathiou and Oliver Lemon},\n\tyear         = 2014,\n\tbooktitle    = {Special Interest Group on Discourse and Dialogue (SIGDIAL)}\n}\n@inproceedings{ehlers2017formal,\n\ttitle        = {Formal verification of piece-wise linear feed-forward neural networks},\n\tauthor       = {Ruediger Ehlers},\n\tyear         = 2017,\n\tbooktitle    = {International Symposium on Automated Technology for Verification and Analysis (ATVA)},\n\tpages        = {269--286}\n}\n@inproceedings{eidelman2012topic,\n\ttitle        = {Topic Models for Dynamic Translation Model Adaptation},\n\tauthor       = {Vladimir Eidelman and Jordan Boyd-Graber and Philip Resnik},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {115--119}\n}\n@inproceedings{eisape2020cloze,\n\ttitle        = {Cloze Distillation Improves Psychometric Predictive Power},\n\tauthor       = {Tiwalayo Eisape and Noga Zaslavsky and Roger Levy},\n\tyear         = 2020,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@inproceedings{eisenstein08topic,\n\ttitle        = {{B}ayesian Unsupervised Topic Segmentation},\n\tauthor       = {Jacob Eisenstein and Regina Barzilay},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {334--343}\n}\n@inproceedings{eisenstein09read,\n\ttitle        = {Reading to Learn: Constructing Features from Semantic Abstracts},\n\tauthor       = 
{J. Eisenstein and J. Clarke and D. Goldwasser and D. Roth},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {958--967}\n}\n@inproceedings{eisner00cubic,\n\ttitle        = {Bilexical grammars and their cubic-time parsing algorithms},\n\tauthor       = {Jason Eisner},\n\tyear         = 2000,\n\tbooktitle    = {Advances in Probabilistic and Other Parsing Technologies},\n\tpages        = {29--62}\n}\n@inproceedings{eisner96dependency,\n\ttitle        = {Three New Probabilistic Models for Dependency Parsing: An Exploration},\n\tauthor       = {Jason Eisner},\n\tyear         = 1996,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {340--345}\n}\n@article{ekeland1976convex,\n\ttitle        = {Convex analysis and variational problems},\n\tauthor       = {Ekeland, Ivar and Temam, Roger},\n\tyear         = 1976,\n\tpublisher    = {SIAM}\n}\n@book{Elad:2010:SRR:1895005,\n\ttitle        = {Sparse and Redundant Representations: From Theory to Applications in Signal and Image Processing},\n\tauthor       = {Elad, Michael},\n\tyear         = 2010,\n\tpublisher    = {Springer Publishing Company, Incorporated},\n\tisbn         = {144197010X, 9781441970107},\n\tedition      = {1st}\n}\n@article{elad2006image,\n\ttitle        = {Image denoising via sparse and redundant representations over learned dictionaries},\n\tauthor       = {Elad, Michael and Aharon, Michal},\n\tyear         = 2006,\n\tjournal      = {Image Processing, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 15,\n\tnumber       = 12,\n\tpages        = {3736--3745},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@misc{EladHazan2016-email,\n\tauthor       = {Hazan, Elad},\n\tyear         = 2016,\n\thowpublished = {private communication}\n}\n@inproceedings{eldan2016depth,\n\ttitle        = {The Power of Depth for Feedforward Neural Networks},\n\tauthor       = 
{R. Eldan and O. Shamir},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{eldan2016power,\n\ttitle        = {The power of depth for feedforward neural networks},\n\tauthor       = {Eldan, Ronen and Shamir, Ohad},\n\tyear         = 2016,\n\tbooktitle    = {Conference on learning theory},\n\tpages        = {907--940},\n\torganization = {PMLR}\n}\n@article{eldar09sure,\n\ttitle        = {Generalized {SURE} for Exponential Families: Applications to Regularization},\n\tauthor       = {Yonina C. Eldar},\n\tyear         = 2009,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 57,\n\tnumber       = 2,\n\tpages        = {471--481}\n}\n@misc{elektronikee575,\n\ttitle        = {EE575 Series - HVAC Miniature Air Velocity Transmitter},\n\tauthor       = {E+E Elektronik},\n\thowpublished = {Available at \\url{http://www.epluse.com/uploads/tx_EplusEprDownloads/datasheet_EE575_e_02.pdf}}\n}\n@article{elhabian2008moving,\n\ttitle        = {Moving object detection in spatial domain using background removal techniques-state-of-art},\n\tauthor       = {Shireen Y Elhabian and Khaled M El-Sayed and Sumaya H Ahmed},\n\tyear         = 2008,\n\tjournal      = {Recent patents on computer science},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {32--54}\n}\n@inproceedings{elkan11reinforcement,\n\ttitle        = {Reinforcement Learning with a Bilinear {Q} Function},\n\tauthor       = {Charles Elkan},\n\tyear         = 2011,\n\tbooktitle    = {Recent Advances in Reinforcement Learning - 9th European Workshop (EWRL)},\n\tseries       = {Lecture Notes in Computer Science},\n\tnumber       = 7188,\n\tpages        = {78--88}\n}\n@article{Elliott1968error,\n\ttitle        = {Error analysis of an algorithm for summing certain finite series},\n\tauthor       = {Elliott, David},\n\tyear         = 1968,\n\tjournal      = {Journal of the Australian Mathematical Society},\n\tpublisher    = {Cambridge Univ 
Press},\n\tvolume       = 8,\n\tnumber       = {02},\n\tpages        = {213--221}\n}\n@article{ellis2012kbp,\n\ttitle        = {Linguistic Resources for 2012 Knowledge Base Population Evaluations},\n\tauthor       = {Joe Ellis and Xuansong Li and Kira Griffitt and Stephanie M. Strassel},\n\tyear         = 2012,\n\tjournal      = {Text Analytics Conference}\n}\n@article{ellis2015tackbp,\n\ttitle        = {{TAC} {KBP} 2015 Slot Descriptions},\n\tauthor       = {Joe Ellis and  Jeremy Getman and Heather Simpson and Kira Griffitt and Hoa Trang Dang and Ralph Grishman and Heng Ji and Catherine DePrince and Thomas Riese and  Neil Kuster},\n\tyear         = 2015,\n\tjournal      = {Linguistic Data Consortium}\n}\n@article{ellis2016overview,\n\ttitle        = {Overview of linguistic resources for the {TAC} {KBP} 2016 evaluations: Methodologies and results},\n\tauthor       = {Joe Ellis and Jeremy Getman and Dana Fore and Neil Kuster and Zhiyi Song and Ann Bies and Stephanie Strassel},\n\tyear         = 2016,\n\tjournal      = {Text Analytics Conference}\n}\n@article{elman1990finding,\n\ttitle        = {Finding structure in time},\n\tauthor       = {Jeffrey L Elman},\n\tyear         = 1990,\n\tjournal      = {Cognitive Science},\n\tvolume       = 14,\n\tnumber       = 2,\n\tpages        = {179--211}\n}\n@article{elvidge2009poverty,\n\ttitle        = {A global poverty map derived from satellite data},\n\tauthor       = {Christopher D. Elvidge and Paul C. Sutton and Tilottama Ghosh and Benjamin T. Tuttle and Kimberly E. 
Baugh and Budhendra Bhaduri and Edward Bright},\n\tyear         = 2009,\n\tjournal      = {Computers and Geosciences},\n\tvolume       = 35\n}\n@article{elyaniv2010foundations,\n\ttitle        = {On the Foundations of Noise-free Selective Classification},\n\tauthor       = {Ran El-Yaniv and Yair Wiener},\n\tyear         = 2010,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 11\n}\n@inproceedings{elyaniv2011finance,\n\ttitle        = {Selective Prediction of Financial Trends with Hidden Markov Models},\n\tauthor       = {Ran El-Yaniv and Dmitry Pidan},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{elzinga1975central,\n\ttitle        = {A central cutting plane algorithm for the convex programming problem},\n\tauthor       = {Elzinga, Jack and Moore, Thomas G.},\n\tyear         = 1975,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {134--145}\n}\n@article{embley2016converting,\n\ttitle        = {Converting heterogeneous statistical tables on the web to searchable databases},\n\tauthor       = {David W. Embley and Mukkai S. Krishnamoorthy and George Nagy and Sharad C. Seth},\n\tyear         = 2016,\n\tjournal      = {International Journal on Document Analysis and Recognition (IJDAR)},\n\tvolume       = 19,\n\tpages        = {119--138}\n}\n@article{encode2012integrated,\n\ttitle        = {An integrated encyclopedia of {DNA} elements in the human genome},\n\tauthor       = {ENCODE Project Consortium and others},\n\tyear         = 2012,\n\tjournal      = {Nature},\n\tvolume       = 489,\n\tnumber       = 7414,\n\tpages        = {57--74}\n}\n@inproceedings{endres2013learning,\n\ttitle        = {Learning the dynamics of doors for robotic manipulation},\n\tauthor       = {F. Endres and J. Trinkle and W. 
Burgard},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{engan1999method,\n\ttitle        = {Method of optimal directions for frame design},\n\tauthor       = {Engan, Kjersti and Aase, Sven Ole and Hakon Husoy, J},\n\tyear         = 1999,\n\tbooktitle    = {Acoustics, Speech, and Signal Processing, 1999. Proceedings., 1999 IEEE International Conference on},\n\tvolume       = 5,\n\tpages        = {2443--2446},\n\torganization = {IEEE},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{english2005mixed,\n\ttitle        = {Learning Mixed Initiative Dialog Strategies by Using Reinforcement Learning on Both Conversants},\n\tauthor       = {Michael S. English and Peter A. Heeman},\n\tyear         = 2005,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{engstrom2018evaluating,\n\ttitle        = {Evaluating and understanding the robustness of adversarial logit pairing},\n\tauthor       = {Logan Engstrom and Andrew Ilyas and Anish Athalye},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.10272}\n}\n@inproceedings{engstrom2019exploring,\n\ttitle        = {Exploring the Landscape of Spatial Robustness},\n\tauthor       = {Logan Engstrom and Brandon Tran and Dimitris Tsipras and Ludwig Schmidt and Aleksander Madry},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1802--1811}\n}\n@article{engstrom2020government,\n\ttitle        = {Government by algorithm: Artificial intelligence in federal administrative agencies},\n\tauthor       = {David Freeman Engstrom and Daniel E Ho and Catherine M Sharkey and Mariano-Florentino Cuéllar},\n\tyear         = 2020,\n\tjournal      = {NYU School of Law, Public Law Research Paper},\n\tvolume       = 20\n}\n@article{engstrom2020identifying,\n\ttitle        = {Identifying Statistical Bias in Dataset 
Replication},\n\tauthor       = {Engstrom, Logan and Ilyas, Andrew and Santurkar, Shibani and Tsipras, Dimitris and Steinhardt, Jacob and Madry, Aleksander},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.09619}\n}\n@article{ensign2017runaway,\n\ttitle        = {Runaway feedback loops in predictive policing},\n\tauthor       = {Danielle Ensign and Sorelle A Friedler and Scott Neville and Carlos Scheidegger and Suresh Venkatasubramanian},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.09847}\n}\n@article{entrywise-sampling-PetroA2011,\n\ttitle        = {{A Note on Element-wise Matrix Sparsification via a Matrix-valued Bernstein Inequality}},\n\tauthor       = {Petros Drineas and Anastasios Zouzias},\n\tyear         = 2011,\n\tmonth        = jan,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1006.0407}\n}\n@misc{enwiki:1050715591,\n\ttitle        = {Hidden Markov model --- {Wikipedia}{,} The Free Encyclopedia},\n\tauthor       = {{Wikipedia contributors}},\n\tyear         = 2021,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Hidden_Markov_model&oldid=1050715591},\n\tnote         = {[Online; accessed 27-November-2021]}\n}\n@misc{enwiki:986771357,\n\ttitle        = {Hilbert–Schmidt integral operator --- {Wikipedia}{,} The Free Encyclopedia},\n\tauthor       = {{Wikipedia contributors}},\n\tyear         = 2020,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Hilbert%E2%80%93Schmidt_integral_operator&oldid=986771357},\n\tnote         = {[Online; accessed 21-July-2021]}\n}\n@techreport{epa2007epa,\n\ttitle        = {EPA Report to Congress on Server and Data Center Energy Efficiency},\n\tauthor       = {{EPA}},\n\tyear         = 2007,\n\tinstitution  = {U.S. 
Environmental Protection Agency},\n\tciteulike-article-id = 2483731,\n\tciteulike-linkout-0 = {http://www.energystar.gov/ia/partners/prod_development/downloads/EPA_Datacenter_Report_Congress_Final1.pdf},\n\tkeywords     = {data\\_center},\n\tmyurl        = {http://www.energystar.gov/ia/partners/prod_development/downloads/EPA_Datacenter_Report_Congress_Final1.pdf},\n\tposted-at    = {2008-07-20 22:23:42},\n\tpriority     = 2\n}\n@article{epsey2015development,\n\ttitle        = {Data for Development: A Needs Assessment for {SDG} Monitoring and Statistical Capacity Development},\n\tauthor       = {Jessica Espey and Eric Swanson and Shaida Badiee and Zach Christensen and Alex Fischer and Marc Levy and Greg Yetman and Alex de Sherbinin and Robert Chen and Yue Qiu and Geoffrey Greenwell and Thilo Klein and Johannes Jutting and Morten Jerven and Grant Cameron and Ana Milena Aguilar Rivera and Victoriano C. Arias and Samuel Lantei Mills and Albert Motivans},\n\tyear         = 2015,\n\tjournal      = {Sustainable Development Solutions Network}\n}\n@article{eraslan2019deep,\n\ttitle        = {Deep learning: new computational modelling techniques for genomics},\n\tauthor       = {G{\\\"o}kcen Eraslan and {\\v{Z}}iga Avsec and Julien Gagneur and Fabian J Theis},\n\tyear         = 2019,\n\tjournal      = {Nature Reviews Genetics},\n\tvolume       = 20,\n\tnumber       = 7,\n\tpages        = {389--403}\n}\n@article{Erdogan09,\n\ttitle        = {On the convergence of {ICA} algorithms with symmetric orthogonalization},\n\tauthor       = {A. T. 
Erdogan},\n\tyear         = 2009,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 57,\n\tpages        = {2209--2221}\n}\n@article{Eremenko2007uniform,\n\ttitle        = {Uniform approximation of sgn x by polynomials and entire functions},\n\tauthor       = {Eremenko, Alexandre and Yuditskii, Peter},\n\tyear         = 2007,\n\tjournal      = {Journal d'Analyse Math{\\'e}matique},\n\tpublisher    = {Springer},\n\tvolume       = 101,\n\tnumber       = 1,\n\tpages        = {313--324}\n}\n@article{Eremenko2011polynomials,\n\ttitle        = {Polynomials of the best uniform approximation to sgn (x) on two intervals},\n\tauthor       = {Eremenko, Alexandre and Yuditskii, Peter},\n\tyear         = 2011,\n\tjournal      = {Journal d'Analyse Math{\\'e}matique},\n\tpublisher    = {Springer},\n\tvolume       = 114,\n\tnumber       = 1,\n\tpages        = {285--315}\n}\n@inproceedings{erk2008structured,\n\ttitle        = {A structured vector space model for word meaning in context},\n\tauthor       = {Katrin Erk and Sebastian Padó},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {897--906}\n}\n@techreport{erol2013extended,\n\ttitle        = {The Extended Parameter Filter},\n\tauthor       = {Erol, Yusuf and Li, Lei and Ramsundar, Bharath and Russell, Stuart J.},\n\tyear         = 2013,\n\tmonth        = may,\n\tnumber       = {UCB/EECS-2013-48},\n\turl          = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-48.html},\n\tinstitution  = {EECS Department, University of California, Berkeley},\n\tabstract     = {The parameters of temporal models, such as dynamic Bayesian networks, may be modelled in a Bayesian context as static or atemporal variables that influence transition probabilities at every time step. 
Particle filters fail for models that include such variables, while methods that use Gibbs sampling of parameter variables may incur a per-sample cost that grows linearly with the length of the observation sequence. Storvik devised a method for incremental computation of exact sufficient statistics that, for some cases, reduces the per-sample cost to a constant. In this paper, we demonstrate a connection between Storvik's filter and a Kalman filter in parameter space and establish more general conditions under which Storvik's filter works. Drawing on an analogy to the extended Kalman filter, we develop and analyze, both theoretically and experimentally, a Taylor approximation to the parameter posterior that allows Storvik's method to be applied to a broader class of models. Our experiments on both synthetic examples and real applications show improvement over existing methods.}\n}\n@inproceedings{ertekin2007learning,\n\ttitle        = {Learning on the border: active learning in imbalanced data classification},\n\tauthor       = {Seyda Ertekin and Jian Huang and Leon Bottou and Lee Giles},\n\tyear         = 2007,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)}\n}\n@article{escobar95mixture,\n\ttitle        = {{B}ayesian Density Estimation and Inference Using Mixtures},\n\tauthor       = {M. D. Escobar and M. 
West},\n\tyear         = 1995,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 90,\n\tpages        = {577--588}\n}\n@article{esfahani2018data,\n\ttitle        = {Data-driven distributionally robust optimization using the Wasserstein metric: Performance guarantees and tractable reformulations},\n\tauthor       = {Peyman Mohajerin Esfahani and Daniel Kuhn},\n\tyear         = 2018,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 171,\n\tnumber       = 1,\n\tpages        = {115--166}\n}\n@article{EshofFLSV2002numerical,\n\ttitle        = {Numerical methods for the QCD overlap operator. I. Sign-function and error bounds},\n\tauthor       = {van den Eshof, Jasper and Frommer, Andreas and Lippert, Th and Schilling, Klaus and van der Vorst, Henk A.},\n\tyear         = 2002,\n\tjournal      = {Computer Physics Communications},\n\tpublisher    = {Elsevier},\n\tvolume       = 146,\n\tnumber       = 2,\n\tpages        = {203--224}\n}\n@inproceedings{esmeir2007anytime,\n\ttitle        = {Anytime induction of cost-sensitive trees},\n\tauthor       = {Saher Esmeir and Shaul Markovitch},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {425--432}\n}\n@article{esserman2013overdiagnosis,\n\ttitle        = {Overdiagnosis and overtreatment in cancer: an opportunity for improvement},\n\tauthor       = {Esserman, Laura J and Thompson, Ian M and Reid, Brian},\n\tyear         = 2013,\n\tjournal      = {Jama},\n\tpublisher    = {American Medical Association},\n\tvolume       = 310,\n\tnumber       = 8,\n\tpages        = {797--798}\n}\n@book{estermann1962complex,\n\ttitle        = {Complex numbers and functions},\n\tauthor       = {Estermann, T.},\n\tyear         = 1962,\n\tpublisher    = {Athlone Press},\n\turl          = {https://books.google.com/books?id=ITbvAAAAMAAJ},\n\tlccn         = 62006689,\n\tbdsk-url-1   = 
{https://books.google.com/books?id=ITbvAAAAMAAJ}\n}\n@article{esteva2017dermatologist,\n\ttitle        = {Dermatologist-level classification of skin cancer with deep neural networks},\n\tauthor       = {Andre Esteva and Brett Kuprel and Roberto A Novoa and Justin Ko and Susan M Swetter and Helen M Blau and Sebastian Thrun},\n\tyear         = 2017,\n\tjournal      = {Nature},\n\tvolume       = 542,\n\tnumber       = 7639,\n\tpages        = {115--118}\n}\n@article{ethayarajh2019contextual,\n\ttitle        = {How contextual are contextualized word representations? comparing the geometry of BERT, ELMo, and GPT-2 embeddings},\n\tauthor       = {Ethayarajh, Kawin},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.00512}\n}\n@article{ethayarajh2020your,\n\ttitle        = {Is Your Classifier Actually Biased? Measuring Fairness under Uncertainty with Bernstein Bounds},\n\tauthor       = {Ethayarajh, Kawin},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.12332}\n}\n@inproceedings{ettinger2017generalizable,\n\ttitle        = {Towards Linguistically Generalizable {NLP} Systems: A Workshop and Shared Task},\n\tauthor       = {Allyson Ettinger and Sudha Rao and Hal {Daum{\\'e} III} and Emily M. 
Bender},\n\tyear         = 2017,\n\tbooktitle    = {Workshop on Building Linguistically Generalizable NLP Systems}\n}\n@inproceedings{etzioni11openie,\n\ttitle        = {Open Information Extraction: the Second Generation},\n\tauthor       = {Oren Etzioni and Anthony Fader and Janara Christensen and Stephen Soderland and Mausam},\n\tyear         = 2011,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{etzioni2005unsupervised,\n\ttitle        = {Unsupervised named-entity extraction from the web: An experimental study},\n\tauthor       = {Oren Etzioni and Michael Cafarella and Doug Downey and Ana-Maria Popescu and Tal Shaked and Stephen Soderland and Daniel S Weld and Alexander Yates},\n\tyear         = 2005,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 165,\n\tnumber       = 1,\n\tpages        = {91--134}\n}\n@misc{euyu2012monomial,\n\ttitle        = {A non-negative matrix has a non-negative inverse. {What} other properties does it have?},\n\tauthor       = {EuYu},\n\tyear         = 2012,\n\thowpublished = {\\url{https://math.stackexchange.com/q/214401}}\n}\n@book{euzenat2007ontology,\n\ttitle        = {Ontology matching},\n\tauthor       = {Jerome Euzenat and Pavel Shvaiko and others},\n\tyear         = 2007,\n\tpublisher    = {Springer},\n\tvolume       = 18\n}\n@inproceedings{even2002pac,\n\ttitle        = {PAC bounds for multi-armed bandit and {M}arkov decision processes},\n\tauthor       = {Even-Dar, Eyal and Mannor, Shie and Mansour, Yishay},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Computational Learning Theory},\n\tpages        = {255--270},\n\torganization = {Springer}\n}\n@article{even2006action,\n\ttitle        = {Action elimination and stopping conditions for the multi-armed bandit and reinforcement learning problems},\n\tauthor       = {Even-Dar, Eyal and Mannor, Shie and Mansour, Yishay},\n\tyear         = 2006,\n\tjournal      = {Journal of 
machine learning research},\n\tvolume       = 7,\n\tnumber       = {Jun},\n\tpages        = {1079--1105}\n}\n@article{even2009online,\n\ttitle        = {Online Markov decision processes},\n\tauthor       = {Even-Dar, Eyal and Kakade, Sham M and Mansour, Yishay},\n\tyear         = 2009,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 34,\n\tnumber       = 3,\n\tpages        = {726--736}\n}\n@article{evfimievski2004privacy,\n\ttitle        = {Privacy preserving mining of association rules},\n\tauthor       = {Alexandre Evfimievski and Ramakrishnan Srikant and Rakesh Agrawal and Johannes Gehrke},\n\tyear         = 2004,\n\tjournal      = {Information Systems},\n\tvolume       = 29,\n\tnumber       = 4,\n\tpages        = {343--364}\n}\n@article{evgeniou05task,\n\ttitle        = {Learning multiple tasks with kernel methods},\n\tauthor       = {T. Evgeniou and C. Micchelli and M. Pontil},\n\tyear         = 2005,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 6,\n\tpages        = {615--637}\n}\n@proceedings{evgeniou2007multi,\n\ttitle        = {Advances in Neural Information Processing Systems 19, Proceedings of the Twentieth Annual Conference on Neural Information Processing Systems, Vancouver, British Columbia, Canada, December 4-7, 2006},\n\tyear         = 2007,\n\tbooktitle    = {NIPS},\n\tpublisher    = {MIT Press},\n\tisbn         = {0-262-19568-2},\n\teditor       = {Bernhard Sch{\\\"o}lkopf and John C. 
Platt and Thomas Hoffman},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{evtimov2017robust,\n\ttitle        = {Robust Physical-World Attacks on Machine Learning Models},\n\tauthor       = {Ivan Evtimov and Kevin Eykholt and Earlence Fernandes and Tadayoshi Kohno and Bo Li and Atul Prakash and Amir Rahmati and Dawn Song},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{eysenbach2018diversity,\n\ttitle        = {Diversity is all you need: Learning skills without a reward function},\n\tauthor       = {Benjamin Eysenbach and Abhishek Gupta and Julian Ibarz and Sergey Levine},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.06070}\n}\n@inproceedings{eysenbach2018leave,\n\ttitle        = {Leave no Trace: Learning to Reset for Safe and Autonomous Reinforcement Learning},\n\tauthor       = {Eysenbach, B and Gu, S and Ibarz, J and Levine, S},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1711.06782},\n\tbooktitle    = {6th International Conference on Learning Representations (ICLR 2018)},\n\torganization = {OpenReview. 
net}\n}\n@article{f17,\n\ttitle        = {An Overview of ResNet and its Variants},\n\tauthor       = {Vincent Fung},\n\tyear         = 2017,\n\tjournal      = {https://towardsdatascience.com/an-overview-of-resnet-and-its-variants-5281e2f56035}\n}\n@article{f89,\n\ttitle        = {On the approximate realization of continuous mappings by neural networks},\n\tauthor       = {Funahashi, Ken-Ichi},\n\tyear         = 1989,\n\tjournal      = {Neural networks},\n\tpublisher    = {Pergamon},\n\tvolume       = 2,\n\tnumber       = 3,\n\tpages        = {183--192}\n}\n@article{fabian1968asymptotic,\n\ttitle        = {On Asymptotic Normality in Stochastic Approximation},\n\tauthor       = {Václav Fabian},\n\tyear         = 1968,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 39,\n\tnumber       = 4,\n\tpages        = {1327--1332}\n}\n@inproceedings{fader11reverb,\n\ttitle        = {Identifying Relations for Open Information Extraction},\n\tauthor       = {Anthony Fader and Stephen Soderland and Oren Etzioni},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{fader2013paraphrase,\n\ttitle        = {Paraphrase-Driven Learning for Open Question Answering},\n\tauthor       = {Anthony Fader and Luke Zettlemoyer and Oren Etzioni},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{fader2014open,\n\ttitle        = {Open question answering over curated and extracted knowledge bases},\n\tauthor       = {Anthony Fader and Luke Zettlemoyer and Oren Etzioni},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {1156--1165}\n}\n@article{fagin1987belief,\n\ttitle        = {Belief, awareness, and limited reasoning},\n\tauthor       = {Ronald Fagin and Joseph Y. 
Halpern},\n\tyear         = 1987,\n\tjournal      = {Artificial intelligence},\n\tvolume       = 34,\n\tpages        = {39--76}\n}\n@inproceedings{fakcharoenphol2003tight,\n\ttitle        = {A tight bound on approximating arbitrary metrics by tree metrics},\n\tauthor       = {Jittat Fakcharoenphol and Satish Rao and Kunal Talwar},\n\tyear         = 2003,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {448--455}\n}\n@article{fakoor2019meta,\n\ttitle        = {Meta-{Q}-Learning},\n\tauthor       = {Rasool Fakoor and Pratik Chaudhari and Stefano Soatto and Alexander J Smola},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.00125}\n}\n@techreport{faloutsos1994fastmap,\n\ttitle        = {\n\t\tFastMap: {A} Fast Algorithm for Indexing, Data-Mining and Visualization\n\n\t\tof Traditional and Multimedia Datasets\n\t},\n\tauthor       = {Christos Faloutsos and King-Ip (David) Lin},\n\tyear         = 1994,\n\taddress      = {College Park},\n\tnumber       = {94-80},\n\tinstitution  = {Dept. of Computer Science, Univ. 
of Maryland},\n\ttype         = {CS-TR-3383 UMIACS-TR-94-132 ISR TR},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{faloutsos1999power,\n\ttitle        = {On power-law relationships of the Internet topology},\n\tauthor       = {Faloutsos, Michalis and Faloutsos, Petros and Faloutsos, Christos},\n\tyear         = 1999,\n\tbooktitle    = {SIGCOMM '99},\n\tlocation     = {Cambridge, Massachusetts, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tpages        = {251--262},\n\tdoi          = {10.1145/316188.316229},\n\tisbn         = {1-58113-135-6},\n\turl          = {http://doi.acm.org/10.1145/316188.316229},\n\tacmid        = 316229,\n\tnumpages     = 12\n}\n@article{fan1953minimax,\n\ttitle        = {Minimax theorems},\n\tauthor       = {Ky Fan},\n\tyear         = 1953,\n\tjournal      = {Proceedings of the National Academy of Sciences of the United States of America},\n\tvolume       = 39,\n\tpages        = {42--47}\n}\n@inproceedings{fan2007power,\n\ttitle        = {Power provisioning for a warehouse-sized computer},\n\tauthor       = {Fan, Xiaobo and Weber, Wolf-Dietrich and Barroso, Luiz Andre},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 34th annual international symposium on Computer\n\n\t\tarchitecture\n\t},\n\tlocation     = {San Diego, California, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {ISCA '07},\n\tpages        = {13--23},\n\tdoi          = {http://doi.acm.org/10.1145/1250662.1250665},\n\tisbn         = {978-1-59593-706-3},\n\tacmid        = 1250665,\n\tkeywords     = {energy efficiency, power modeling, power provisioning},\n\tnumpages     = 11\n}\n@inproceedings{fan2017transfer,\n\ttitle        = {Transfer Learning for Neural Semantic Parsing},\n\tauthor       = {Xing Fan and Emilio Monti and Lambert Mathias and Markus Dreyer},\n\tyear         = 2017,\n\tbooktitle    = {Workshop on Representation Learning for 
NLP}\n}\n@article{fan2018hierarchical,\n\ttitle        = {Hierarchical Neural Story Generation},\n\tauthor       = {Angela Fan and Mike Lewis and Yann Dauphin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.04833}\n}\n@inproceedings{fan2018surreal,\n\ttitle        = {SURREAL: Open-Source Reinforcement Learning Framework and Robot Manipulation Benchmark},\n\tauthor       = {Linxi Fan and Yuke Zhu and Jiren Zhu and Zihua Liu and Orien Zeng and Anchit Gupta and Joan Creus-Costa and Silvio Savarese and Li Fei-Fei},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Robot Learning}\n}\n@article{fan2019theoretical,\n\ttitle        = {A theoretical analysis of deep {Q}-learning},\n\tauthor       = {Fan, Jianqing and Wang, Zhaoran and Xie, Yuchen and Yang, Zhuoran},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.00137}\n}\n@article{fang1994inequalities,\n\ttitle        = {Inequalities for the trace of matrix product},\n\tauthor       = {Fang, Yuguang and Loparo, Kenneth A and Feng, Xiangbo},\n\tyear         = 1994,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tpublisher    = {IEEE},\n\tvolume       = 39,\n\tnumber       = 12,\n\tpages        = {2489--2490}\n}\n@inproceedings{fang2013unbiased,\n\ttitle        = {Unbiased metric learning: On the utilization of multiple datasets and web images for softening bias},\n\tauthor       = {Chen Fang and Ye Xu and Daniel N Rockmore},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {1657--1664}\n}\n@inproceedings{fang2015captions,\n\ttitle        = {From captions to visual concepts and back},\n\tauthor       = {Hao Fang and Saurabh Gupta and Forrest Iandola and Rupesh K Srivastava and Li Deng and Piotr Doll{\\'a}r and Jianfeng Gao and Xiaodong He and Margaret Mitchell and John C Platt},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition 
(CVPR)}\n}\n@inproceedings{fang2017learning,\n\ttitle        = {Learning how to Active Learn: A Deep Reinforcement Learning Approach},\n\tauthor       = {Meng Fang and Yuan Li and Trevor Cohn},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{fang2018sounding,\n\ttitle        = {Sounding Board: A User-Centric and Content-Driven Social Chatbot},\n\tauthor       = {Fang, Hao and Cheng, Hao and Sap, Maarten and Clark, Elizabeth and Holtzman, Ari and Choi, Yejin and Smith, Noah A and Ostendorf, Mari},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.10202}\n}\n@article{fang2020modeling,\n\ttitle        = {Modeling from Features: a Mean-field Framework for Over-parameterized Deep Neural Networks},\n\tauthor       = {Fang, Cong and Lee, Jason D and Yang, Pengkun and Zhang, Tong},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.01452}\n}\n@inproceedings{farahmand2017value,\n\ttitle        = {Value-aware loss function for model-based reinforcement learning},\n\tauthor       = {Farahmand, Amir-massoud and Barreto, Andre and Nikovski, Daniel},\n\tyear         = 2017,\n\tbooktitle    = {Artificial Intelligence and Statistics}\n}\n@inproceedings{farhadi2010attribute,\n\ttitle        = {Attribute-centric recognition for cross-category generalization},\n\tauthor       = {A. Farhadi and I. Endres and D. Hoiem},\n\tyear         = 2010,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{farhadi2010every,\n\ttitle        = {Every picture tells a story: Generating sentences from images},\n\tauthor       = {A. Farhadi and M. Hejrati and M. A. Sadeghi and P. Young and C. Rashtchian and J. Hockenmaier and D. 
Forsyth},\n\tyear         = 2010,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {15--29}\n}\n@inproceedings{farshchian2019adversarial,\n\ttitle        = {Adversarial Domain Adaptation for Stable Brain-Machine Interfaces},\n\tauthor       = {Ali Farshchian and Juan A. Gallego and Joseph P. Cohen and Yoshua Bengio and Lee E. Miller and Sara A. Solla},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{fasola2013using,\n\ttitle        = {Using Semantic Fields to Model Dynamic Spatial Relations in a Robot Architecture for Natural Language Instruction of Service Robots},\n\tauthor       = {J. Fasola and M. J Matari{\\'c}},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{fasola2014interpreting,\n\ttitle        = {Interpreting Instruction Sequences in Spatial Language Discourse with Pragmatics towards Natural Human-Robot Interaction},\n\tauthor       = {J. Fasola and M. 
J Matari{\\'c}},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {6667--6672}\n}\n@article{FastICA,\n\ttitle        = {Fast and robust fixed-point algorithms for independent component analysis},\n\tauthor       = {Hyvarinen, A.},\n\tyear         = 1999,\n\tjournal      = {Neural Networks, IEEE Transactions on},\n\tvolume       = 10,\n\tnumber       = 3,\n\tpages        = {626--634}\n}\n@article{FastPCAGarber,\n\ttitle        = {Fast and Simple PCA via Convex Optimization},\n\tauthor       = {Garber, Dan and Hazan, Elad},\n\tyear         = 2015,\n\tmonth        = sep,\n\tjournal      = {arXiv preprint arXiv:1509.05647},\n\tvolumn       = {abs/1509.05647}\n}\n@inproceedings{fathi2011combining,\n\ttitle        = {Combining Self Training and Active Learning for Video Segmentation},\n\tauthor       = {Alireza Fathi and Maria-Florina Balcan and Xiaofeng Ren and James M. Rehg},\n\tyear         = 2011,\n\tbooktitle    = {British Machine Vision Conference (BMVC)}\n}\n@inproceedings{faury2020improved,\n\ttitle        = {Improved optimistic algorithms for logistic bandits},\n\tauthor       = {Faury, Louis and Abeille, Marc and Calauz{\\`e}nes, Cl{\\'e}ment and Fercoq, Olivier},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {3052--3060},\n\torganization = {PMLR}\n}\n@article{fawzi2017classification,\n\ttitle        = {Classification regions of deep neural networks},\n\tauthor       = {Fawzi, Alhussein and Moosavi-Dezfooli, Seyed-Mohsen and Frossard, Pascal and Soatto, Stefano},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.09552}\n}\n@article{fawzi2018analysis,\n\ttitle        = {Analysis of classifiers' robustness to adversarial perturbations},\n\tauthor       = {Alhussein Fawzi and Omar Fawzi and Pascal Frossard},\n\tyear         = 2018,\n\tjournal      = {Machine Learning},\n\tvolume       = 107,\n\tnumber       = 
3,\n\tpages        = {481--508}\n}\n@inproceedings{fazel2001rank,\n\ttitle        = {A rank minimization heuristic with application to minimum order system approximation},\n\tauthor       = {Fazel, Maryam and Hindi, Haitham and Boyd, Stephen P},\n\tyear         = 2001,\n\tbooktitle    = {Proc.~American Control Conference},\n\tvolume       = 6,\n\tpages        = {4734--4739},\n\torganization = {IEEE}\n}\n@inproceedings{fazel2004rank,\n\ttitle        = {Rank minimization and applications in system theory},\n\tauthor       = {Fazel, Maryam and Hindi, Haitham and Boyd, S},\n\tyear         = 2004,\n\tbooktitle    = {Proc.~American Control Conference},\n\tvolume       = 4,\n\tpages        = {3273--3278},\n\torganization = {IEEE}\n}\n@article{fazlyab2019safety,\n\ttitle        = {Safety Verification and Robustness Analysis of Neural Networks via Quadratic Constraints and Semidefinite Programming},\n\tauthor       = {Mahyar Fazlyab and Manfred Morari and George J Pappas},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.01287}\n}\n@inproceedings{fb16,\n\ttitle        = {Topology and geometry of half-rectified network optimization},\n\tauthor       = {Freeman, C. Daniel and Bruna, Joan},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1611.01540},\n\tbooktitle    = {ICLR},\n\turl          = {https://arxiv.org/abs/1611.01540}\n}\n@misc{feamsterbgp,\n\ttitle        = {BGP Monitor - The Datapository Project, http://www.datapository.net/bgpmon/},\n\tauthor       = {N. Feamster and D. Andersen and H. Balakrishnan and F. 
Kaashoek},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{fearnley2015learning,\n\ttitle        = {Learning equilibria of games via payoff queries.},\n\tauthor       = {Fearnley, John and Gairing, Martin and Goldberg, Paul W and Savani, Rahul},\n\tyear         = 2015,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 16,\n\tpages        = {1305--1344}\n}\n@article{fearnley2016finding,\n\ttitle        = {Finding approximate Nash equilibria of bimatrix games via payoff queries},\n\tauthor       = {Fearnley, John and Savani, Rahul},\n\tyear         = 2016,\n\tjournal      = {ACM Transactions on Economics and Computation (TEAC)},\n\tpublisher    = {ACM},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = 25\n}\n@article{feder1994relations,\n\ttitle        = {Relations between entropy and error probability},\n\tauthor       = {Meir Feder and Neri Merhav},\n\tyear         = 1994,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 40,\n\tpages        = {259--266}\n}\n@inproceedings{fedus2018maskgan,\n\ttitle        = {MaskGAN: Better Text Generation via Filling in the},\n\tauthor       = {William Fedus and Ian Goodfellow and Andrew M. 
Dai},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{feige2000finding,\n\ttitle        = {Finding and certifying a large hidden clique in a semirandom graph},\n\tauthor       = {Uriel Feige and Robert Krauthgamer},\n\tyear         = 2000,\n\tjournal      = {Random Structures and Algorithms},\n\tvolume       = 16,\n\tnumber       = 2,\n\tpages        = {195--208}\n}\n@article{feige2001heuristics,\n\ttitle        = {Heuristics for semirandom graph problems},\n\tauthor       = {Uriel Feige and Joe Kilian},\n\tyear         = 2001,\n\tjournal      = {Journal of Computer and System Sciences},\n\tvolume       = 63,\n\tnumber       = 4,\n\tpages        = {639--671}\n}\n@article{FeigeKrauthgamer02,\n\ttitle        = {A Polylogarithmic Approximation of the Minimum Bisection},\n\tauthor       = {Feige, Uriel and Krauthgamer, Robert},\n\tyear         = 2002,\n\tmonth        = apr,\n\tjournal      = {SIAM J. Comput.},\n\tpublisher    = {Society for Industrial and Applied Mathematics},\n\tvolume       = 31,\n\tnumber       = 4,\n\tissue_date   = 2002,\n\tnumpages     = 29\n}\n@article{feinberg2014value,\n\ttitle        = {The value iteration algorithm is not strongly polynomial for discounted dynamic programming},\n\tauthor       = {Feinberg, Eugene A and Huang, Jefferson},\n\tyear         = 2014,\n\tjournal      = {Operations Research Letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 42,\n\tnumber       = 2,\n\tpages        = {130--131}\n}\n@article{feinman2017detecting,\n\ttitle        = {Detecting Adversarial Samples from Artifacts},\n\tauthor       = {Reuben Feinman and Ryan R Curtin and Saurabh Shintre and Andrew B Gardner},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.00410}\n}\n@inproceedings{feizabadi2014crowdsourcing,\n\ttitle        = {Crowdsourcing Annotation of Non-Local Semantic Roles},\n\tauthor       = {Parvin Sadat Feizabadi and Sebastian Pado},\n\tyear         = 
2014,\n\tbooktitle    = {European Association for Computational Linguistics (EACL)},\n\tpages        = {226--230}\n}\n@article{feizi2017porcupine,\n\ttitle        = {Porcupine neural networks: (almost) all local optima are global},\n\tauthor       = {Feizi, Soheil and Javadi, Hamid and Zhang, Jesse and Tse, David},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.02196}\n}\n@article{feldman04embodied,\n\ttitle        = {Embodied Meaning in a Neural Theory of Language},\n\tauthor       = {J. Feldman and S. Narayanan},\n\tyear         = 2004,\n\tjournal      = {Brain and Language},\n\tvolume       = 89,\n\tpages        = {385--392}\n}\n@inproceedings{feldman05product,\n\ttitle        = {Learning mixtures of product distributions over discrete domains},\n\tauthor       = {Jon Feldman and Ryan O'Donnell and Rocco A. Servedio},\n\tyear         = 2005,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {501--510}\n}\n@article{feldman2009agnostic,\n\ttitle        = {On agnostic learning of parities, monomials, and halfspaces},\n\tauthor       = {Vitaly Feldman and Parikshit Gopalan and Subhash Khot and Ashok Kumar Ponnuswami},\n\tyear         = 2009,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 39,\n\tnumber       = 2,\n\tpages        = {606--645}\n}\n@inproceedings{feldman2015,\n\ttitle        = {Certifying and removing disparate impact},\n\tauthor       = {Michael Feldman and Sorelle Friedler and John Moeller and Carlos Scheidegger and Suresh Venkatasubramanian},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {259--268}\n}\n@book{fellbaum1998wordnet,\n\ttitle        = {WordNet: An Electronic Lexical Database},\n\tauthor       = {Christiane Fellbaum},\n\tyear         = 1998,\n\tpublisher    = {MIT Press}\n}\n@article{felsenstein96phylohmm,\n\ttitle        = {A hidden {M}arkov model approach to variation among sites in 
rate of evolution},\n\tauthor       = {J. Felsenstein and G. A. Churchill},\n\tyear         = 1996,\n\tjournal      = {Molecular Biology and Evolution},\n\tvolume       = 13,\n\tpages        = {93--104}\n}\n@inproceedings{feng2014cross,\n\ttitle        = {Cross-modal Retrieval with Correspondence Autoencoder},\n\tauthor       = {Fangxiang Feng and Xiaojie Wang and Ruifan Li},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 22Nd ACM International Conference on Multimedia},\n\tpages        = {7--16}\n}\n@inproceedings{feng2017api,\n\ttitle        = {Component-Based Synthesis for Complex APIs},\n\tauthor       = {Yu Feng and Ruben Martins and Yuepeng Wang and Isil Dillig and Tomas W. Reps},\n\tyear         = 2017,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@inproceedings{feng2018pathologies,\n\ttitle        = {Pathologies of Neural Models Make Interpretations Difficult},\n\tauthor       = {Shi Feng and Eric Wallace and Alvin {Grissom II} and Mohit Iyyer and Pedro Rodriguez and Jordan Boyd-Graber},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{feng2019misleading,\n\ttitle        = {Misleading Failures of Partial-input Baselines},\n\tauthor       = {Shi Feng and Eric Wallace and Jordan Boyd-Graber},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{feng2019robot,\n\ttitle        = {Robot-Assisted Feeding: Generalizing Skewering Strategies across Food Items on a Realistic Plate},\n\tauthor       = {Ryan Feng and Young-sun Kim and Gilwoo Lee and E. Gordon and Matt Schmittle and Shivaum Kumar and T. Bhattacharjee and S. 
Srinivasa},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.02350}\n}\n@article{feng2019selective,\n\ttitle        = {Selective prediction-set models with coverage guarantees},\n\tauthor       = {Jean Feng and Arjun Sondhi and Jessica Perry and Noah Simon},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.05473}\n}\n@inproceedings{feng2020provably,\n\ttitle        = {Provably Efficient Exploration for RL with Unsupervised Learning},\n\tauthor       = {Feng, Fei and Wang, Ruosong and Yin, Wotao and Du, Simon S and Yang, Lin F},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.06898},\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{Fercoq2013,\n\ttitle        = {Accelerated, Parallel, and Proximal Coordinate Descent},\n\tauthor       = {Fercoq, Olivier and Richt\\'{a}rik, Peter},\n\tyear         = 2015,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 25,\n\tnumber       = 4,\n\tpages        = {1997--2023},\n\tnote         = {First appeared on ArXiv 1312.5799 in 2013}\n}\n@inproceedings{Fercoq2014fast,\n\ttitle        = {Fast distributed coordinate descent for non-strongly convex losses},\n\tauthor       = {Fercoq, Olivier and Qu, Zheng and Richt{\\'a}rik, Peter and Tak{\\'a}c, Martin},\n\tyear         = 2014,\n\tbooktitle    = {MLSP},\n\tpages        = {1--6},\n\torganization = {IEEE}\n}\n@article{FercoqRichtarik2013smooth,\n\ttitle        = {Smooth minimization of nonsmooth functions with parallel coordinate descent methods},\n\tauthor       = {{Fercoq}, Olivier and {Richt{\\'a}rik}, Peter},\n\tyear         = 2013,\n\tmonth        = sep,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1309.5885}\n}\n@article{ferguson73dp,\n\ttitle        = {A {B}ayesian Analysis of Some Nonparametric Problems},\n\tauthor       = {T. S. 
Ferguson},\n\tyear         = 1973,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 1,\n\tpages        = {209--230}\n}\n@article{ferguson74prior,\n\ttitle        = {Prior distributions on spaces of probability measures},\n\tauthor       = {T. S. Ferguson},\n\tyear         = 1974,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 2,\n\tpages        = {615--629}\n}\n@article{ferro2012bias,\n\ttitle        = {A bias-corrected decomposition of the Brier score},\n\tauthor       = {C. A. T. Ferro and Thomas E. Fricker},\n\tyear         = 2012,\n\tjournal      = {Quarterly Journal of the Royal Meteorological Society},\n\tvolume       = 138,\n\tnumber       = 668,\n\tpages        = {1954--1960}\n}\n@article{ferrucci2013watson,\n\ttitle        = {Building {W}atson: An Overview of the {D}eep{QA} Project},\n\tauthor       = {David Ferrucci and Eric Brown and Jennifer Chu-Carroll and James Fan and David Gondek and Aditya A. Kalyanpur and Adam Lally and J. William Murdock and Eric Nyberg and John Prager and Nico Schlaefer and Chris Welty},\n\tyear         = 2013,\n\tjournal      = {AI Magazine},\n\tvolume       = 31,\n\tnumber       = 3,\n\tpages        = {59--79}\n}\n@inproceedings{feser2015synthesizing,\n\ttitle        = {Synthesizing Data Structure Transformations from Input-Output Examples},\n\tauthor       = {John K. 
Feser and Swarat Chaudhuri and Isil Dillig},\n\tyear         = 2015,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@inproceedings{fetaya2016unsupervised,\n\ttitle        = {Unsupervised Ensemble Learning with Dependent Classifiers},\n\tauthor       = {Ethan Fetaya and Boaz Nadler and Ariel Jaffe and Yuval Kluger and Tingting Jiang},\n\tyear         = 2016,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {351--360}\n}\n@inproceedings{fevry2018unsupervised,\n\ttitle        = {Unsupervised Sentence Compression using Denoising Auto-Encoders},\n\tauthor       = {Thibault Fevry and Jason Phang},\n\tyear         = 2018,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {413--422}\n}\n@inproceedings{FGKP,\n\ttitle        = {New Results for Learning Noisy Parities and Halfspaces},\n\tauthor       = {Feldman, Vitaly and Gopalan, Parikshit and Khot, Subhash and Ponnuswami, Ashok Kumar},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 47th Annual IEEE Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {FOCS '06},\n\tpages        = {563--574},\n\tisbn         = {0-7695-2720-5},\n\tnumpages     = 12\n}\n@article{ficler2017controlling,\n\ttitle        = {Controlling Linguistic Style Aspects in Natural Language Generation},\n\tauthor       = {Jessica Ficler and Yoav Goldberg},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.02633}\n}\n@inproceedings{figurnov2018implicit,\n\ttitle        = {Implicit reparameterization gradients},\n\tauthor       = {Figurnov, Mikhail and Mohamed, Shakir and Mnih, Andriy},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@book{filar2012competitive,\n\ttitle        = {Competitive Markov decision processes},\n\tauthor       = {Filar, Jerzy and Vrieze, Koos},\n\tyear  
       = 2012,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{filippi2010parametric,\n\ttitle        = {Parametric bandits: The generalized linear case},\n\tauthor       = {Filippi, Sarah and Cappe, Olivier and Garivier, Aur{\\'e}lien and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {586--594}\n}\n@inproceedings{filippova2013overcoming,\n\ttitle        = {Overcoming the Lack of Parallel Data in Sentence Compression},\n\tauthor       = {Katja Filippova and Yasemin Altun},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{filmer2011asset,\n\ttitle        = {Assessing Asset Indices},\n\tauthor       = {Deon Filmer and Kinnon Scott},\n\tyear         = 2011,\n\tjournal      = {Demography},\n\tvolume       = 49\n}\n@inproceedings{finegan2018improving,\n\ttitle        = {Improving text-to-sql evaluation methodology},\n\tauthor       = {Catherine Finegan-Dollak and Jonathan K Kummerfeld and Li Zhang and Karthik Ramanathan and Sesh Sadasivam and Rui Zhang and Dragomir Radev},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{finkel07inftree,\n\ttitle        = {The Infinite Tree},\n\tauthor       = {J. R. Finkel and T. Grenager and C. Manning},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {272--279}\n}\n@inproceedings{finkel08crf,\n\ttitle        = {Efficient, Feature-based, Conditional Random Field Parsing},\n\tauthor       = {J. R. Finkel and A. Kleeman and C. 
Manning},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)}\n}\n@inproceedings{finkel2005incorporating,\n\ttitle        = {Incorporating non-local information into information extraction systems by {G}ibbs sampling},\n\tauthor       = {Jenny Rose Finkel and Trond Grenager and Christopher Manning},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {363--370}\n}\n@article{finlayson2021clinician,\n\ttitle        = {The Clinician and Dataset Shift in Artificial Intelligence},\n\tauthor       = {Samuel G. Finlayson and Adarsh Subbaswamy and Karandeep Singh and John Bowers and Annabel Kupke and Jonathan Zittrain and Isaac S. Kohane and Suchi Saria},\n\tyear         = 2021,\n\tjournal      = {New England Journal of Medicine},\n\tvolume       = 385,\n\tnumber       = 3,\n\tpages        = {283--286}\n}\n@inproceedings{finley2008training,\n\ttitle        = {Training structural {SVM}s when exact inference is intractable},\n\tauthor       = {Thomas Finley and Thorsten Joachims},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {304--311}\n}\n@inproceedings{finn2003active,\n\ttitle        = {Active learning selection strategies for information extraction},\n\tauthor       = {Aidan Finn and Nicolas Kushmerick},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the International Workshop on Adaptive Text Extraction and Mining (ATEM-03)},\n\tpages        = {18--25}\n}\n@inproceedings{finn2016guided,\n\ttitle        = {Guided cost learning: Deep inverse optimal control via policy optimization},\n\tauthor       = {C. Finn and S. Levine and P. 
Abbeel},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {49--58}\n}\n@inproceedings{finn2016unsupervised,\n\ttitle        = {Unsupervised learning for physical interaction through video prediction},\n\tauthor       = {Chelsea Finn and Ian Goodfellow and Sergey Levine},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {64--72}\n}\n@inproceedings{finn2017deep,\n\ttitle        = {Deep visual foresight for planning robot motion},\n\tauthor       = {Chelsea Finn and Sergey Levine},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{finn2017modelagnostic,\n\ttitle        = {Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks},\n\tauthor       = {Chelsea Finn and Pieter Abbeel and Sergey Levine},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{finn2017one,\n\ttitle        = {One-Shot Visual Imitation Learning via Meta-Learning},\n\tauthor       = {C. Finn and T. Yu and T. Zhang and P. Abbeel and S. Levine},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.04905}\n}\n@inproceedings{finucane2010ltlmop,\n\ttitle        = {LTLMoP: Experimenting with language, temporal logic and robot control},\n\tauthor       = {C. Finucane and G. Jing and H. 
Kress-Gazit},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@article{fiorini2013combinatorial,\n\ttitle        = {Combinatorial bounds on nonnegative rank and extended formulations},\n\tauthor       = {Fiorini, Samuel and Kaibel, Volker and Pashkovich, Kanstantsin and Theis, Dirk Oliver},\n\tyear         = 2013,\n\tjournal      = {Discrete Mathematics},\n\tpublisher    = {Elsevier}\n}\n@inproceedings{firat2016multi,\n\ttitle        = {Multi-Way, Multilingual Neural Machine Translation with a Shared Attention Mechanism},\n\tauthor       = {Orhan Firat and  Kyunghyun Cho and  Yoshua Bengio},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@book{firth1957a,\n\ttitle        = {A synopsis of linguistic theory},\n\tauthor       = {John Rupert Firth},\n\tyear         = 1957\n}\n@inproceedings{fisac2019bridging,\n\ttitle        = {Bridging {Hamilton-Jacobi} Safety Analysis and Reinforcement Learning},\n\tauthor       = {Jaime F. Fisac and Neil F. Lugovoy and Vicenç Rúbies Royo and S. Ghosh and C. 
Tomlin},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{fisch2019mrqa,\n\ttitle        = {{MRQA} 2019 Shared Task: Evaluating Generalization in Reading Comprehension},\n\tauthor       = {Adam Fisch and Alon Talmor and Robin Jia and Minjoon Seo and Eunsol Choi and Danqi Chen},\n\tyear         = 2019,\n\tbooktitle    = {Workshop on Machine Reading for Question Answering (MRQA)}\n}\n@techreport{fishburn1970utility,\n\ttitle        = {Utility theory for decision making},\n\tauthor       = {Fishburn, Peter C},\n\tyear         = 1970,\n\tinstitution  = {Research analysis corp McLean VA}\n}\n@article{fishburn1979two,\n\ttitle        = {Two-piece von Neumann-Morgenstern utility functions},\n\tauthor       = {Fishburn, Peter C and Kochenberger, Gary A},\n\tyear         = 1979,\n\tjournal      = {Decision Sciences},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 10,\n\tnumber       = 4,\n\tpages        = {503--518}\n}\n@article{fisher2012example,\n\ttitle        = {Example-based Synthesis of 3{D} Object Arrangements},\n\tauthor       = {Matthew Fisher and Daniel Ritchie and Manolis Savva and Thomas Funkhouser and Pat Hanrahan},\n\tyear         = 2012,\n\tjournal      = {ACM SIGGRAPH Asia},\n\tvolume       = 12\n}\n@inproceedings{fitzgerald2013learning,\n\ttitle        = {Learning Distributions over Logical Forms for Referring Expression Generation},\n\tauthor       = {Nicholas FitzGerald and Yoav Artzi and Luke S. Zettlemoyer},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1914--1925}\n}\n@inproceedings{FJK,\n\ttitle        = {Learning Linear Transformations},\n\tauthor       = {Alan M. Frieze and Mark Jerrum and Ravindran Kannan},\n\tyear         = 1996,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{FJK96,\n\ttitle        = {Learning Linear Transformations},\n\tauthor       = {A. M. Frieze and M. Jerrum and R. 
Kannan},\n\tyear         = 1996,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{flanigan2014discriminative,\n\ttitle        = {A discriminative graph-based parser for the abstract meaning representation},\n\tauthor       = {Jeffrey Flanigan and Sam Thomson and Jaime G Carbonell and Chris Dyer and Noah A Smith},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{flash1985coordination,\n\ttitle        = {\n\t\tThe coordination of arm movements: an experimentally confirmed mathematical\n\n\t\tmodel.\n\t},\n\tauthor       = {Flash, T. and Hogan, N.},\n\tyear         = 1985,\n\tmonth        = jul,\n\tjournal      = {J Neurosci},\n\tvolume       = 5,\n\tnumber       = 7,\n\tpages        = {1688--1703},\n\tissn         = {0270-6474},\n\tabstract     = {\n\t\tThis paper presents studies of the coordination of voluntary human\n\n\t\tarm movements. A mathematical model is formulated which is shown\n\n\t\tto predict both the qualitative features and the quantitative details\n\n\t\tobserved experimentally in planar, multijoint arm movements. Coordination\n\n\t\tis modeled mathematically by defining an objective function, a measure\n\n\t\tof performance for any possible movement. The unique trajectory which\n\n\t\tyields the best performance is determined using dynamic optimization\n\n\t\ttheory. In the work presented here, the objective function is the\n\n\t\tsquare of the magnitude of jerk (rate of change of acceleration)\n\n\t\tof the hand integrated over the entire movement. This is equivalent\n\n\t\tto assuming that a major goal of motor coordination is the production\n\n\t\tof the smoothest possible movement of the hand. Experimental observations\n\n\t\tof human subjects performing voluntary unconstrained movements in\n\n\t\ta horizontal plane are presented. 
They confirm the following predictions\n\n\t\tof the mathematical model: unconstrained point-to-point motions are\n\n\t\tapproximately straight with bell-shaped tangential velocity profiles;\n\n\t\tcurved motions (through an intermediate point or around an obstacle)\n\n\t\thave portions of low curvature joined by portions of high curvature;\n\n\t\tat points of high curvature, the tangential velocity is reduced;\n\n\t\tthe durations of the low-curvature portions are approximately equal.\n\n\t\tThe theoretical analysis is based solely on the kinematics of movement\n\n\t\tindependent of the dynamics of the musculoskeletal system and is\n\n\t\tsuccessful only when formulated in terms of the motion of the hand\n\n\t\tin extracorporal space. The implications with respect to movement\n\n\t\torganization are discussed.\n\t},\n\tciteulike-article-id = 701244,\n\tkeywords     = {arm, coordination, jerk, ngd, smoothness},\n\tmyurl        = {http://www.jneurosci.org/cgi/content/abstract/5/7/1688},\n\tpriority     = 2\n}\n@inproceedings{flaxman2005online,\n\ttitle        = {Online convex optimization in the bandit setting: gradient descent without a gradient},\n\tauthor       = {Flaxman, Abraham D and Kalai, Adam Tauman and McMahan, H Brendan},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the sixteenth annual ACM-SIAM symposium on Discrete algorithms},\n\tpages        = {385--394},\n\torganization = {Society for Industrial and Applied Mathematics}\n}\n@article{fle70,\n\ttitle        = {A new approach to variable metric algorithms},\n\tauthor       = {Fletcher, Roger},\n\tyear         = 1970,\n\tjournal      = {The computer journal},\n\tpublisher    = {Br Computer Soc},\n\tvolume       = 13,\n\tnumber       = 3,\n\tpages        = {317--322}\n}\n@article{Fleischer2000,\n\ttitle        = {{Approximating Fractional Multicommodity Flow Independent of the Number of Commodities}},\n\tauthor       = {Fleischer, Lisa K.},\n\tyear         = 2000,\n\tmonth        = 
jan,\n\tjournal      = {SIAM Journal on Discrete Mathematics},\n\tvolume       = 13,\n\tnumber       = 4,\n\tpages        = {505--520},\n\tdoi          = {10.1137/S0895480199355754},\n\tissn         = {0895-4801},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Fleischer - 2000 - Approximating Fractional Multicommodity Flow Independent of the Number of Commodities.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/Flow}\n}\n@inproceedings{fleischman07intention,\n\ttitle        = {Representing Intentions in a Cognitive Model of Language Acquisition: Effects of Phrase Structure on Situated Verb Learning},\n\tauthor       = {M. Fleischman and D. Roy},\n\tyear         = 2007,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{fleischman2005intentional,\n\ttitle        = {Intentional context in situated natural language learning},\n\tauthor       = {M. Fleischman and D. Roy},\n\tyear         = 2005,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {104--111}\n}\n@article{fodor1988connectionism,\n\ttitle        = {Connectionism and Cognitive Architecture: A Critical Analysis},\n\tauthor       = {Jerry A. Fodor and Zenon W. 
Pylyshyn},\n\tyear         = 1988,\n\tjournal      = {Cognition},\n\tvolume       = 28,\n\tpages        = {3--71}\n}\n@inproceedings{foerster2016learning,\n\ttitle        = {Learning to communicate with deep multi-agent reinforcement learning},\n\tauthor       = {Jakob Foerster and Yannis M Assael and Nando de Freitas and Shimon Whiteson},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2137--2145}\n}\n@inproceedings{foerster2018counterfactual,\n\ttitle        = {Counterfactual multi-agent policy gradients},\n\tauthor       = {Foerster, Jakob and Farquhar, Gregory and Afouras, Triantafyllos and Nardelli, Nantas and Whiteson, Shimon},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the AAAI Conference on Artificial Intelligence},\n\tvolume       = 32,\n\tnumber       = 1\n}\n@inproceedings{fong2017interpretable,\n\ttitle        = {Interpretable explanations of black boxes by meaningful perturbation},\n\tauthor       = {Ruth C Fong and Andrea Vedaldi},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {3429--3437}\n}\n@article{fonteneau2013batch,\n\ttitle        = {Batch mode reinforcement learning based on the synthesis of artificial trajectories},\n\tauthor       = {Fonteneau, Raphael and Murphy, Susan A and Wehenkel, Louis and Ernst, Damien},\n\tyear         = 2013,\n\tjournal      = {Annals of operations research},\n\tpublisher    = {Springer},\n\tvolume       = 208,\n\tnumber       = 1,\n\tpages        = {383--416}\n}\n@article{foreh2003honesty,\n\ttitle        = {When is honesty the best policy? 
The effect of stated company intent on consumer skepticism},\n\tauthor       = {Foreh, Mark R and Grier, Sonya},\n\tyear         = 2003,\n\tjournal      = {Journal of consumer psychology},\n\tpublisher    = {Elsevier},\n\tvolume       = 13,\n\tnumber       = 3,\n\tpages        = {349--356}\n}\n@article{foret2020sharpness,\n\ttitle        = {Sharpness-Aware Minimization for Efficiently Improving Generalization},\n\tauthor       = {Foret, Pierre and Kleiner, Ariel and Mobahi, Hossein and Neyshabur, Behnam},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.01412}\n}\n@inproceedings{foret2021sharpnessaware,\n\ttitle        = {Sharpness-aware Minimization for Efficiently Improving Generalization},\n\tauthor       = {Pierre Foret and Ariel Kleiner and Hossein Mobahi and Behnam Neyshabur},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{fort2019deep,\n\ttitle        = {Deep ensembles: A loss landscape perspective},\n\tauthor       = {Fort, Stanislav and Hu, Huiyi and Lakshminarayanan, Balaji},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.02757}\n}\n@inproceedings{FOS05,\n\ttitle        = {Learning Mixtures of Product Distributions over Discrete Domains},\n\tauthor       = {J. Feldman and R. O'Donnell and R. Servedio},\n\tyear         = 2005,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{FOS06,\n\ttitle        = {{PAC} Learning Mixtures of Axis-Aligned {G}aussians with No Separation Assumption},\n\tauthor       = {J. Feldman and R. O'Donnell and R. 
Servedio},\n\tyear         = 2006,\n\tbooktitle    = {COLT}\n}\n@inproceedings{foster04xslt,\n\ttitle        = {Techniques for text planning with {XSLT}},\n\tauthor       = {Mary Ellen Foster and Michael White},\n\tyear         = 2004,\n\tbooktitle    = {Workshop on NLP and XML: RDF/RDFS and OWL in Language Technology},\n\tpages        = {1--8}\n}\n@inproceedings{foster2011news,\n\ttitle        = {From news to comment: Resources and benchmarks for parsing the language of {Web} 2.0},\n\tauthor       = {Jennifer Foster and Ozlem Cetinoglu and Joachim Wagner and Joseph Le Roux and Joakim Nivre and Deirdre Hogan and Josef VanGenabith},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)}\n}\n@inproceedings{foster2018practical,\n\ttitle        = {Practical Contextual Bandits with Regression Oracles},\n\tauthor       = {Foster, Dylan and Agarwal, Alekh and Dudik, Miroslav and Luo, Haipeng and Schapire, Robert},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1539--1548}\n}\n@inproceedings{foster2020beyond,\n\ttitle        = {Beyond {UCB}: Optimal and Efficient Contextual Bandits with Regression Oracles},\n\tauthor       = {Foster, Dylan and Rakhlin, Alexander},\n\tyear         = 2020,\n\tmonth        = {13--18 Jul},\n\tbooktitle    = {Proceedings of the 37th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 119,\n\tpages        = {3199--3210},\n\tabstract     = {A fundamental challenge in contextual bandits is to develop flexible, general-purpose algorithms with computational requirements no worse than classical supervised learning tasks such as classification and regression. 
Algorithms based on regression have shown promising empirical success, but theoretical guarantees have remained elusive except in special cases. We provide the first universal and optimal reduction from contextual bandits to online regression. We show how to transform any oracle for online regression with a given value function class into an algorithm for contextual bandits with the induced policy class, with no overhead in runtime or memory requirements. We characterize the minimax rates for contextual bandits with general, potentially nonparametric function classes, and show that our algorithm is minimax optimal whenever the oracle obtains the optimal rate for regression. Compared to previous results, our algorithm requires no distributional assumptions beyond realizability, and works even when contexts are chosen adversarially.}\n}\n@article{foster2020instance,\n\ttitle        = {Instance-dependent complexity of contextual bandits and reinforcement learning: A disagreement-based perspective},\n\tauthor       = {Foster, Dylan J and Rakhlin, Alexander and Simchi-Levi, David and Xu, Yunzong},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.03104}\n}\n@misc{foster98asymptoticcalibration,\n\ttitle        = {Asymptotic calibration},\n\tauthor       = {Dean P. Foster and Rakesh V. Vohra},\n\tyear         = 1998\n}\n@article{fourierpca,\n\ttitle        = {Fourier PCA},\n\tauthor       = {N. Goyal and S. Vempala and Y. 
Xiao},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1306.5825}\n}\n@inproceedings{fox16taming,\n\ttitle        = {Taming the Noise in Reinforcement Learning via Soft Updates},\n\tauthor       = {Roy Fox and Ari Pakman and Naftali Tishby},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the Thirty-Second Conference on Uncertainty in Artificial Intelligence (UAI-16)}\n}\n@inproceedings{foygel2011learning,\n\ttitle        = {Learning with the weighted trace-norm under arbitrary sampling distributions},\n\tauthor       = {Foygel, Rina and Salakhutdinov, Ruslan and Shamir, Ohad and Srebro, Nathan},\n\tyear         = 2011,\n\tbooktitle    = {Proc. of NIPS}\n}\n@article{fralick1967learning,\n\ttitle        = {Learning to recognize patterns without a teacher},\n\tauthor       = {Stanley C. Fralick},\n\tyear         = 1967,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 13\n}\n@manual{francis1979brown,\n\ttitle        = {Brown Corpus Manual},\n\tauthor       = {W. Nelson Francis and Henry Kucera},\n\tyear         = 1979\n}\n@article{frank09intentions,\n\ttitle        = {Using Speakers' Referential Intentions to Model Early Cross-Situational Word Learning},\n\tauthor       = {Michael C. Frank and Noah D. Goodman and Joshua B. Tenenbaum},\n\tyear         = 2009,\n\tjournal      = {Psychological Science},\n\tvolume       = 20,\n\tnumber       = 5,\n\tpages        = {578--585}\n}\n@article{frank2012pragmatics,\n\ttitle        = {Predicting Pragmatic Reasoning in Language Games},\n\tauthor       = {M.C. Frank and N. D. Goodman},\n\tyear         = 2012,\n\tjournal      = {Science},\n\tvolume       = 336,\n\tpages        = {998--998}\n}\n@article{frank2014inferring,\n\ttitle        = {Inferring word meanings by assuming that speakers are informative},\n\tauthor       = {M.C. Frank and N. D. 
Goodman},\n\tyear         = 2014,\n\tjournal      = {Cognitive Psychology},\n\tvolume       = 75,\n\tpages        = {80--96}\n}\n@book{franke2009signal,\n\ttitle        = {Signal to act: Game theory in pragmatics},\n\tauthor       = {Michael Franke},\n\tyear         = 2009,\n\tpublisher    = {Institute for Logic, Language and Computation}\n}\n@misc{freebase2013dump,\n\ttitle        = {{F}reebase Data Dumps (2013-06-09)},\n\tauthor       = {Google},\n\tyear         = 2013,\n\thowpublished = {\\url{https://developers.google.com/freebase/data}}\n}\n@article{Freedman,\n\ttitle        = {Freedman's inequality for matrix martingales},\n\tauthor       = {Tropp, Joel and others},\n\tyear         = 2011,\n\tjournal      = {Electronic Communications in Probability},\n\tpublisher    = {The Institute of Mathematical Statistics and the Bernoulli Society},\n\tvolume       = 16,\n\tpages        = {262--270}\n}\n@article{freedman1975tail,\n\ttitle        = {On tail probabilities for martingales},\n\tauthor       = {Freedman, David A},\n\tyear         = 1975,\n\tjournal      = {the Annals of Probability},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {100--118}\n}\n@article{freedman2004graphical,\n\ttitle        = {Graphical models for causation, and the identification problem},\n\tauthor       = {David A Freedman},\n\tyear         = 2004,\n\tjournal      = {Evaluation Review},\n\tvolume       = 28,\n\tnumber       = 4,\n\tpages        = {267--293}\n}\n@inproceedings{freitag1998information,\n\ttitle        = {Information extraction from {HTML}: Application of a general machine learning approach},\n\tauthor       = {Dayne Freitag},\n\tyear         = 1998,\n\tbooktitle    = {AAAI/IAAI},\n\tpages        = {517--523}\n}\n@article{frenay2014classification,\n\ttitle        = {Classification in the presence of label noise: a survey},\n\tauthor       = {Beno{\\^\\i}t Fr{\\'e}nay and Michel Verleysen},\n\tyear         = 
2014,\n\tjournal      = {IEEE Transactions on Neural Networks and Learning Systems},\n\tvolume       = 25,\n\tpages        = {845--869}\n}\n@article{french1999catastrophic,\n\ttitle        = {Catastrophic forgetting in connectionist networks},\n\tauthor       = {Robert M French},\n\tyear         = 1999,\n\tjournal      = {Trends in cognitive sciences},\n\tvolume       = 3\n}\n@inproceedings{french2018selfensembling,\n\ttitle        = {Self-ensembling for visual domain adaptation},\n\tauthor       = {Geoff French and Michal Mackiewicz and Mark Fisher},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{frenkel2014minkowski,\n\ttitle        = {Minkowski's inequality and sums of squares},\n\tauthor       = {P{\\'e}ter E. Frenkel and P{\\'e}ter Horv{\\'a}th},\n\tyear         = 2014,\n\tjournal      = {Central European Journal of Mathematics},\n\tvolume       = 12,\n\tnumber       = 3,\n\tpages        = {510--516}\n}\n@inproceedings{freund1995adaboost,\n\ttitle        = {A decision-theoretic generalization of on-line learning and an application to boosting},\n\tauthor       = {Y. Freund and R. 
Schapire},\n\tyear         = 1995,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{freund1995desicion,\n\ttitle        = {A desicion-theoretic generalization of on-line learning and an application to boosting},\n\tauthor       = {Freund, Yoav and Schapire, Robert E},\n\tyear         = 1995,\n\tbooktitle    = {Computational learning theory},\n\tpages        = {23--37},\n\torganization = {Springer}\n}\n@article{freund2004sensitivity,\n\ttitle        = {A sensitivity result for semidefinite programs},\n\tauthor       = {Roland W Freund and Florian Jarre},\n\tyear         = 2004,\n\tjournal      = {Operations Research Letters},\n\tvolume       = 32,\n\tpages        = {126--132}\n}\n@inproceedings{fried2018speakerfollower,\n\ttitle        = {Speaker-Follower Models for Vision-and-Language Navigation},\n\tauthor       = {Daniel Fried and Ronghang Hu and Volkan Cirik and Anna Rohrbach and Jacob Andreas and Louis-Philippe Morency and Taylor Berg-Kirkpatrick and Kate Saenko and Dan Klein and Trevor Darrell},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{fried2018unified,\n\ttitle        = {Unified Pragmatic Models for Generating and Following Instructions},\n\tauthor       = {Daniel Fried and Jacob Andreas and Dan Klein},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{friedler2016possibility,\n\ttitle        = {On the (im) possibility of fairness},\n\tauthor       = {Friedler, Sorelle A and Scheidegger, Carlos and Venkatasubramanian, Suresh},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.07236}\n}\n@inproceedings{friedman00bayesian,\n\ttitle        = {Being {B}ayesian about {B}ayesian Network Structure: A {B}ayesian Approach to Structure Discovery in {B}ayesian Networks},\n\tauthor       = {N. Friedman and D. 
Koller},\n\tyear         = 2000,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {201--210}\n}\n@book{friedman2001elements,\n\ttitle        = {The elements of statistical learning},\n\tauthor       = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},\n\tyear         = 2001,\n\tpublisher    = {Springer series in statistics New York, NY, USA: Springer series in statistics New York, NY, USA:},\n\tvolume       = 1,\n\tnumber       = 10\n}\n@article{friedman2010regularization,\n\ttitle        = {Regularization paths for generalized linear models via coordinate descent},\n\tauthor       = {Jerome Friedman and Trevor Hastie and Rob Tibshirani},\n\tyear         = 2010,\n\tjournal      = {Journal of Statistical Software},\n\tvolume       = 33,\n\tnumber       = 1,\n\tpages        = {1--22}\n}\n@inproceedings{friedmann2011subexponential,\n\ttitle        = {Subexponential lower bounds for randomized pivoting rules for the simplex algorithm},\n\tauthor       = {Friedmann, Oliver and Hansen, Thomas Dueholm and Zwick, Uri},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the forty-third annual ACM symposium on Theory of computing},\n\tpages        = {283--292},\n\torganization = {ACM}\n}\n@inproceedings{frieze1996learning,\n\ttitle        = {Learning linear transformations},\n\tauthor       = {Frieze, Alan and Jerrum, Mark and Kannan, Ravi},\n\tyear         = 1996,\n\tbooktitle    = {focs},\n\tpages        = 359,\n\torganization = {IEEE}\n}\n@book{frisch1934statistical,\n\ttitle        = {Statistical confluence analysis by means of complete regression systems},\n\tauthor       = {Ragnar Frisch},\n\tyear         = 1934,\n\tpublisher    = {Universitetets {\\O}konomiske Instituut},\n\tvolume       = 5\n}\n@inproceedings{frostig15competing,\n\ttitle        = {Competing with the empirical risk minimizer in a single pass},\n\tauthor       = {Roy Frostig and Rong Ge and Sham M. 
Kakade and Aaron Sidford},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{frostig15unregularizing,\n\ttitle        = {Un-regularizing: approximate proximal point and faster stochastic algorithms for empirical risk minimization},\n\tauthor       = {Roy Frostig and Rong Ge and Sham M. Kakade and Aaron Sidford},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{frostig2014lowrank,\n\ttitle        = {Simple {MAP} inference via low-rank relaxations},\n\tauthor       = {Roy Frostig and Sida I. Wang and Percy Liang and Chris Manning},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{frostig2014subconstant,\n\ttitle        = {A sub-constant improvement in approximating the positive semidefinite {G}rothendieck problem},\n\tauthor       = {Roy Frostig and Sida I. Wang},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1408.2270}\n}\n@inproceedings{frostig2015regularizing,\n\ttitle        = {Un-regularizing: approximate proximal point and faster stochastic algorithms for empirical risk minimization},\n\tauthor       = {Frostig, Roy and Ge, Rong and Kakade, Sham M and Sidford, Aaron},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},\n\tvolume       = 37,\n\tpages        = {1--28},\n\turl          = {http://arxiv.org/abs/1506.07512},\n\tabstract     = {We develop a family of accelerated stochastic algorithms that minimize sums of convex functions. Our algorithms improve upon the fastest running time for empirical risk minimization (ERM), and in particular linear least-squares regression, across a wide range of problem settings. To achieve this, we establish a framework based on the classical proximal point algorithm. 
Namely, we provide several algorithms that reduce the minimization of a strongly convex function to approximate minimizations of regularizations of the function. Using these results, we accelerate recent fast stochastic algorithms in a black-box fashion. Empirically, we demonstrate that the resulting algorithms exhibit notions of stability that are advantageous in practice. Both in theory and in practice, the provided algorithms reap the computational benefits of adding a large strongly convex regularization term, without incurring a corresponding bias to the original problem.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1506.07512},\n\teprint       = {1506.07512},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Frostig et al. - 2015 - Un-regularizing approximate proximal point and faster stochastic algorithms for empirical risk minimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@inproceedings{FrostigMMS2016,\n\ttitle        = {{Principal Component Projection Without Principal Component Analysis}},\n\tauthor       = {Frostig, Roy and Musco, Cameron and Musco, Christopher and Sidford, Aaron},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@misc{FrostigMMS2016-pcr-krylov,\n\ttitle        = {{Code \\verb\"kpcr.m\"}},\n\tauthor       = {Frostig, Roy and Musco, Cameron and Musco, Christopher and Sidford, Aaron},\n\tyear         = 2015,\n\tnote         = {Accessed: 2016-07, \\url{http://www.chrismusco.com/kpcr.m}}\n}\n@inproceedings{FRU12,\n\ttitle        = {Using Regression for Spectral Estimation of HMMs},\n\tauthor       = {Jordan Rodu and Dean P. Foster and Weichen Wu and Lyle H. 
Ungar},\n\tyear         = 2013,\n\tbooktitle    = {Statistical Language and Speech Processing},\n\tpages        = {212--223}\n}\n@article{fruchterman1991graph,\n\ttitle        = {Graph drawing by force-directed placement},\n\tauthor       = {Thomas MJ Fruchterman and Edward M Reingold},\n\tyear         = 1991,\n\tjournal      = {Software: Practice and experience},\n\tvolume       = 21,\n\tnumber       = 11,\n\tpages        = {1129--1164}\n}\n@inproceedings{fruit2018efficient,\n\ttitle        = {Efficient Bias-Span-Constrained Exploration-Exploitation in Reinforcement Learning},\n\tauthor       = {Fruit, Ronan and Pirotta, Matteo and Lazaric, Alessandro and Ortner, Ronald},\n\tyear         = 2018,\n\tbooktitle    = {ICML 2018-The 35th International Conference on Machine Learning},\n\tvolume       = 80,\n\tpages        = {1578--1586}\n}\n@inproceedings{fruit2018near,\n\ttitle        = {Near optimal exploration-exploitation in non-communicating markov decision processes},\n\tauthor       = {Fruit, Ronan and Pirotta, Matteo and Lazaric, Alessandro},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2994--3004}\n}\n@article{fruit2019improved,\n\ttitle        = {Improved Analysis of UCRL2B},\n\tauthor       = {Fruit, Ronan and Pirotta, Matteo and Lazaric, Alessandro},\n\tyear         = 2019,\n\tjournal      = {Available at rlgammazero. github. io/docs/ucrl2b\\_improved. 
pdf}\n}\n@article{fry2017comparison,\n\ttitle        = {Comparison of sociodemographic and health-related characteristics of {UK Biobank} participants with those of the general population},\n\tauthor       = {Anna Fry and Thomas J Littlejohns and Cathie Sudlow and Nicola Doherty and Ligia Adamska and Tim Sprosen and Rory Collins and Naomi E Allen},\n\tyear         = 2017,\n\tjournal      = {American Journal of Epidemiology},\n\tvolume       = 186,\n\tnumber       = 9,\n\tpages        = {1026--1034}\n}\n@inproceedings{fsm10,\n\ttitle        = {Error propagation for approximate policy and value iteration},\n\tauthor       = {Farahmand, Amir-massoud and Szepesv{\\'a}ri, Csaba and Munos, R{\\'e}mi},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {568--576}\n}\n@inproceedings{fu2005scaling,\n\ttitle        = {Scaling and time warping in time series querying},\n\tauthor       = {\n\t\tFu, Ada Wai-chee and Keogh, Eamonn and Lau, Leo Yung Hang and Ratanamahatana,\n\n\t\tChotirat Ann\n\t},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the 31st international conference on Very large data\n\n\t\tbases\n\t},\n\tlocation     = {Trondheim, Norway},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '05},\n\tpages        = {649--660},\n\tisbn         = {1-59593-154-6},\n\tacmid        = 1083668,\n\tnumpages     = 12\n}\n@inproceedings{fu2017domain,\n\ttitle        = {Domain Adaptation for Relation Extraction with Domain Adversarial Neural Network},\n\tauthor       = {Lisheng Fu and Thien Huu Nguyen and Bonan Min and Ralph Grishman},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the The 8th International Joint Conference on Natural Language Processing},\n\tpages        = {425--429}\n}\n@article{fu2017ex2,\n\ttitle        = {EX2: Exploration with Exemplar Models for Deep Reinforcement Learning},\n\tauthor       = {Justin Fu and John D. 
Co-Reyes and Sergey Levine},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{fu2018style,\n\ttitle        = {Style Transfer in Text: Exploration and Evaluation},\n\tauthor       = {Zhenxin Fu and Xiaoye Tan and Nanyun Peng and Dongyan Zhao and Rui Yan},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{fu2018variational,\n\ttitle        = {Variational Inverse Control with Events: A General Framework for Data-Driven Reward Definition},\n\tauthor       = {Justin Fu and Avi Singh and Dibya Ghosh and Larry Yang and Sergey Levine},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{fu2019lang2goals,\n\ttitle        = {From Language to Goals: Inverse Reinforcement Learning for Vision-Based Instruction Following},\n\tauthor       = {Justin Fu and Anoop Korattikara and Sergey Levine and Sergio Guadarrama},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{fu2020d4rl,\n\ttitle        = {D4rl: Datasets for deep data-driven reinforcement learning},\n\tauthor       = {Fu, Justin and Kumar, Aviral and Nachum, Ofir and Tucker, George and Levine, Sergey},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.07219}\n}\n@book{fudenberg1991game,\n\ttitle        = {Game theory},\n\tauthor       = {Fudenberg, Drew and Tirole, Jean},\n\tyear         = 1991,\n\tpublisher    = {MIT Press, Cambridge, MA},\n\tpages        = {xxiv+579},\n\tisbn         = {0-262-06141-4},\n\tmrclass      = {90-02 (90D10 90D20 90D40 90D80)},\n\tmrnumber     = 1124618,\n\tmrreviewer   = {Fran\\c{c}oise Forges}\n}\n@inproceedings{fujimoto2018addressing,\n\ttitle        = {Addressing function approximation error in actor-critic methods},\n\tauthor       = {Fujimoto, Scott and Hoof, Herke and Meger, David},\n\tyear         = 2018,\n\tbooktitle    = {International 
Conference on Machine Learning},\n\tpages        = {1587--1596},\n\torganization = {PMLR}\n}\n@inproceedings{fujimoto2019off,\n\ttitle        = {Off-policy deep reinforcement learning without exploration},\n\tauthor       = {Fujimoto, Scott and Meger, David and Precup, Doina},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2052--2062},\n\torganization = {PMLR}\n}\n@article{fujimoto2021minimalist,\n\ttitle        = {A Minimalist Approach to Offline Reinforcement Learning},\n\tauthor       = {Fujimoto, Scott and Gu, Shixiang Shane},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.06860}\n}\n@inproceedings{fujiwara2008spiral,\n\ttitle        = {\n\t\tSPIRAL: efficient and exact model identification for hidden {M}arkov\n\n\t\tmodels\n\t},\n\tauthor       = {Fujiwara, Yasuhiro and Sakurai, Yasushi and Yamamuro, Masashi},\n\tyear         = 2008,\n\tbooktitle    = {\n\t\tProceeding of the 14th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tlocation     = {Las Vegas, Nevada, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '08},\n\tpages        = {247--255},\n\tdoi          = {http://doi.acm.org/10.1145/1401890.1401924},\n\tisbn         = {978-1-60558-193-4},\n\tacmid        = 1401924,\n\tkeywords     = {Hidden Markov model, likelihood, upper bound},\n\tnumpages     = 9\n}\n@inproceedings{fukui2016multimodal,\n\ttitle        = {Multimodal compact bilinear pooling for visual question answering and visual grounding},\n\tauthor       = {Akira Fukui and Dong Huk Park and Daylen Yang and Anna Rohrbach and Trevor Darrell and Marcus Rohrbach},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{fukumizu2000statistical,\n\ttitle        = {Statistical active learning in multilayer perceptrons},\n\tauthor       = {Kenji Fukumizu},\n\tyear         = 
2000,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 11,\n\tnumber       = 1,\n\tpages        = {17--26}\n}\n@book{fukunaga1990introduction,\n\ttitle        = {Introduction to statistical pattern recognition (2nd ed.)},\n\tauthor       = {Fukunaga, Keinosuke},\n\tyear         = 1990,\n\tpublisher    = {Academic Press Professional, Inc.},\n\taddress      = {San Diego, CA, USA},\n\tisbn         = {0-12-269851-7}\n}\n@article{fukushima1988neocognitron,\n\ttitle        = {Neocognitron: A hierarchical neural network capable of visual pattern recognition},\n\tauthor       = {Kunihiko Fukushima},\n\tyear         = 1988,\n\tjournal      = {Neural networks},\n\tvolume       = 1,\n\tnumber       = 2,\n\tpages        = {119--130}\n}\n@book{fuller2009measurement,\n\ttitle        = {Measurement error models},\n\tauthor       = {Wayne A Fuller},\n\tyear         = 2009,\n\tpublisher    = {John Wiley \\& Sons},\n\tvolume       = 305\n}\n@book{fumarola2011extracting,\n\ttitle        = {Extracting general lists from web documents: A hybrid approach},\n\tauthor       = {Fabio Fumarola and Tim Weninger and Rick Barber and Donato Malerba and Jiawei Han},\n\tyear         = 2011,\n\tpublisher    = {Modern Approaches in Applied Intelligence Springer},\n\tpages        = {285--294}\n}\n@inproceedings{fung1989weighing,\n\ttitle        = {Weighing and Integrating Evidence for Stochastic Simulation in {B}ayesian Networks},\n\tauthor       = {Robert Fung and Kuo-Chu Chang},\n\tyear         = 1989,\n\tbooktitle    = {Proceedings of the Fifth Conference Annual Conference on Uncertainty in Artificial Intelligence (UAI-89)},\n\tpublisher    = {Elsevier Science},\n\taddress      = {New York, NY},\n\tpages        = {112--117}\n}\n@inproceedings{fung1994backward,\n\ttitle        = {Backward Simulation in {B}ayesian Networks},\n\tauthor       = {\n\t\tRobert M. 
Fung and\n\n\t\tBrendan Del Favero\n\t},\n\tyear         = 1994,\n\tbooktitle    = {UAI},\n\tpages        = {227--234},\n\tee           = {http://uai.sis.pitt.edu/displayArticleDetails.jsp?mmnu=1{\\&}smnu=2{\\&}article_id=508{\\&}proceeding_id=10}\n}\n@article{furukawa1975assessment,\n\ttitle        = {Assessment of biological age by multiple regression analysis},\n\tauthor       = {Toshiyuki Furukawa and Michitoshi Inoue and Fumihiko Kajiya and Hiroshi Inada and Seiichi Takasugi and Sugao Fukui and Hiroshi Takeda and Hiroshi Abe},\n\tyear         = 1975,\n\tjournal      = {Journal of Gerontology},\n\tvolume       = 30,\n\tnumber       = 4,\n\tpages        = {422--434}\n}\n@techreport{fuster2017predictably,\n\ttitle        = {Predictably Unequal? The Effects of Machine Learning on Credit Markets},\n\tauthor       = {Andreas Fuster and Paul Goldsmith-Pinkham and Tarun Ramadorai and Ansgar Walther},\n\tyear         = 2017,\n\tinstitution  = {CEPR Discussion Papers}\n}\n@inproceedings{G,\n\ttitle        = {Robustness Analysis of HottTopixx, a Linear Programming Model for Factoring Nonnegative Matrices},\n\tauthor       = {N. 
Gillis},\n\tyear         = 2012,\n\tnote         = {http://arxiv.org/abs/1211.6687}\n}\n@misc{gabrilovich2013facc1,\n\ttitle        = {FACC1: Freebase annotation of ClueWeb corpora},\n\tauthor       = {Evgeniy Gabrilovich and Michael Ringgaard and Amarnag Subramanya},\n\tyear         = 2013,\n\thowpublished = {\\url{http://lemurproject.org/clueweb09/}}\n}\n@inproceedings{gadermayr2018gradual,\n\ttitle        = {Gradual Domain Adaptation for Segmenting Whole Slide Images Showing Pathological Variability},\n\tauthor       = {Gadermayr, Michael and Eschweiler, Dennis and Klinkhammer, Barbara Mara and Boor, Peter and Merhof, Dorit},\n\tyear         = 2018,\n\tbooktitle    = {Image and Signal Processing}\n}\n@inproceedings{gaikwad2015daemo,\n\ttitle        = {Daemo: A Self-Governed Crowdsourcing Marketplace},\n\tauthor       = {Snehal Neil Gaikwad and Durim Morina and Rohit Nistala and Megha Agarwal and Alison Cossette and Radhika Bhanu and Saiph Savage and Vishwajeet Narwal and Karan Rajpal and Jeff Regino and others},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 28th Annual ACM Symposium on User Interface Software \\& Technology},\n\tpages        = {101--102}\n}\n@inproceedings{gal2016dropout,\n\ttitle        = {Dropout as a {Bayesian} Approximation: Representing Model Uncertainty in Deep Learning},\n\tauthor       = {Yarin Gal and Zoubin Ghahramani},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{gal2017dbal,\n\ttitle        = {Deep {B}ayesian Active Learning with Image Data},\n\tauthor       = {Yarin Gal and R. Islam and Zoubin Ghahramani},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{gallant1988connectionist,\n\ttitle        = {Connectionist expert systems},\n\tauthor       = {Stephen I. 
Gallant},\n\tyear         = 1988,\n\tbooktitle    = {Communications of the ACM}\n}\n@article{gallego2015online,\n\ttitle        = {Online Resource Allocation with Customer Choice},\n\tauthor       = {Guillermo Gallego and Anran Li and Van-Anh Truong and Xinshang Wang},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.01837}\n}\n@inproceedings{galley04translation,\n\ttitle        = {What's in a translation rule?},\n\tauthor       = {Michel Galley and Mark Hopkins and Kevin Knight and Daniel Marcu},\n\tyear         = 2004,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {273--280}\n}\n@article{galley2015delta,\n\ttitle        = {deltaBLEU: A Discriminative Metric for Generation Tasks with Intrinsically Diverse Targets},\n\tauthor       = {Michel Galley and Chris Brockett and Alessandro Sordoni and Yangfeng Ji and Michael Auli and Chris Quirk and Margaret Mitchell and Jianfeng Gao and Bill Dolan},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.06863}\n}\n@article{Gallo1989,\n\ttitle        = {A Fast Parametric Maximum Flow Algorithm and Applications},\n\tauthor       = {Gallo, Giorgio and Grigoriadis, Michael D. 
and Tarjan, Robert E.},\n\tyear         = 1989,\n\tmonth        = feb,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 18,\n\tnumber       = 1,\n\tpages        = {30--55}\n}\n@article{gama2014survey,\n\ttitle        = {A Survey on Concept Drift Adaptation},\n\tauthor       = {Gama, João and Žliobaitė, Indrė and Bifet, Albert and Pechenizkiy, Mykola and Bouchachia, Hamid},\n\tyear         = 2014,\n\tjournal      = {ACM Computing Surveys (CSUR)},\n\tvolume       = 46\n}\n@inproceedings{gamon2013identifying,\n\ttitle        = {Identifying salient entities in web pages},\n\tauthor       = {Michael Gamon and Tae Yano and Xinying Song and Johnson Apacible and Patrick Pantel},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)}\n}\n@article{gan2017equivalence,\n\ttitle        = {Equivalence of predictors under real and over-parameterized linear models},\n\tauthor       = {Shengjun Gan and Yuqin Sun and Yongge Tian},\n\tyear         = 2017,\n\tjournal      = {Communications in Statistics-Theory and Methods},\n\tvolume       = 46,\n\tnumber       = 11,\n\tpages        = {5368--5383}\n}\n@inproceedings{gan2017style,\n\ttitle        = {StyleNet: Generating Attractive Visual Captions with Styles},\n\tauthor       = {Chuang Gan and Zhe Gan and Xiaodong He and Jianfeng Gao and Li Deng},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{ganchev08multi,\n\ttitle        = {Multi-View Learning over Structured and Non-Identical Outputs},\n\tauthor       = {Kuzman Ganchev and João Graça and John Blitzer and Ben Taskar},\n\tyear         = 2008,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@article{ganchev10posterior,\n\ttitle        = {Posterior Regularization for Structured Latent Variable Models},\n\tauthor       = {Kuzman Ganchev and João Graça and Jennifer Gillenwater and Ben Taskar},\n\tyear         = 2010,\n\tjournal      = 
{Journal of Machine Learning Research (JMLR)},\n\tvolume       = 11,\n\tpages        = {2001--2049}\n}\n@article{gandy2011tensor,\n\ttitle        = {Tensor completion and low-n-rank tensor recovery via convex optimization},\n\tauthor       = {Gandy, Silvia and Recht, Benjamin and Yamada, Isao},\n\tyear         = 2011,\n\tjournal      = {Inverse Problems},\n\tpublisher    = {IOP Publishing},\n\tvolume       = 27,\n\tnumber       = 2,\n\tpages        = {025010}\n}\n@book{ganesalingam2013math,\n\ttitle        = {The Language of Mathematics},\n\tauthor       = {Mohan Ganesalingam},\n\tyear         = 2013,\n\tpublisher    = {Springer-Verlag}\n}\n@inproceedings{ganin2015domain,\n\ttitle        = {Unsupervised Domain Adaptation by Backpropagation},\n\tauthor       = {Yaroslav Ganin and Victor Lempitsky},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1180--1189}\n}\n@article{ganin2016domain,\n\ttitle        = {Domain-adversarial training of neural networks},\n\tauthor       = {Ganin, Yaroslav and Ustinova, Evgeniya and Ajakan, Hana and Germain, Pascal and Larochelle, Hugo and Laviolette, Fran{\\c{c}}ois and Marchand, Mario and Lempitsky, Victor},\n\tyear         = 2016,\n\tjournal      = {The journal of machine learning research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {2096--2030}\n}\n@inproceedings{ganitkevitch2013ppdb,\n\ttitle        = {{PPDB}: The paraphrase database},\n\tauthor       = {Juri Ganitkevitch and Benjamin Van Durme and Chris Callison-Burch},\n\tyear         = 2013,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {758--764}\n}\n@inproceedings{gao08comparison,\n\ttitle        = {A comparison of {B}ayesian estimators for unsupervised Hidden {M}arkov Model {POS} taggers},\n\tauthor       = {Jianfeng Gao and Mark Johnson},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {344--352}\n}\n@article{gao2008classifying,\n\ttitle        = {\n\t\tClassifying Data Streams with Skewed Class Distributions and Concept\n\n\t\tDrifts\n\t},\n\tauthor       = {Gao, Jing and Ding, B. and Fan, Wei and Han, Jiawei and Yu, P. S.},\n\tyear         = 2008,\n\tjournal      = {Internet Computing},\n\tvolume       = 12,\n\tnumber       = 6,\n\tpages        = {37--49},\n\tdoi          = {10.1109/MIC.2008.119},\n\tissn         = {1089-7801},\n\tabstract     = {\n\t\tClassification is an important data analysis tool that uses a model\n\n\t\tbuilt from historical data to predict class labels for new observations.\n\n\t\tMore and more applications are featuring data streams, rather than\n\n\t\tfinite stored data sets, which are a challenge for traditional classification\n\n\t\talgorithms. Concept drifts and skewed distributions, two common properties\n\n\t\tof data stream applications, make the task of learning in streams\n\n\t\tdifficult. 
The authors aim to develop a new approach to classify\n\n\t\tskewed data streams that uses an ensemble of models to match the\n\n\t\tdistribution over under-samples of negatives and repeated samples\n\n\t\tof positives.\n\t},\n\tkeywords     = {\n\t\tdata analysis, pattern classification, concept drifts, data analysis\n\n\t\ttool, data streams classification, skewed distributions, classification\n\n\t\talgorithms, concept drifts, data mining, data stream, model averaging,\n\n\t\tskewed distributions\n\t},\n\towner        = {leili},\n\ttimestamp    = {2010.02.05}\n}\n@inproceedings{gao2011active,\n\ttitle        = {Active classification based on value of classifier},\n\tauthor       = {Tianshi Gao and Daphne Koller},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1062--1070}\n}\n@article{gao2019convergence,\n\ttitle        = {Convergence of Adversarial Training in Overparametrized Networks},\n\tauthor       = {Gao, Ruiqi and Cai, Tianle and Li, Haochuan and Wang, Liwei and Hsieh, Cho-Jui and Lee, Jason D},\n\tyear         = 2019,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@article{gao2020making,\n\ttitle        = {Making Pre-trained Language Models Better Few-shot Learners},\n\tauthor       = {Gao, Tianyu and Fisch, Adam and Chen, Danqi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.15723}\n}\n@article{gao2021making,\n\ttitle        = {Making Pre-trained Language Models Better Few-shot Learners},\n\tauthor       = {Tianyu Gao and Adam Fisch and Danqi Chen},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@article{gao2021provably,\n\ttitle        = {A Provably Efficient Algorithm for Linear Markov Decision Process with Low Switching Cost},\n\tauthor       = {Gao, Minbo and Xie, Tianle and Du, Simon S and Yang, Lin F},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint 
arXiv:2101.00494}\n}\n@inproceedings{GarberHazan-et-al-2016-ICML,\n\ttitle        = {Robust Shift-and-Invert Preconditioning: Faster and More Sample Efficient Algorithms for Eigenvector Computation},\n\tauthor       = {Dan Garber and Elad Hazan and Chi Jin and Kakade, Sham M. and Cameron Musco and Praneeth Netrapalli and Aaron Sidford},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{GarberHazanMa2015-onlineEV,\n\ttitle        = {Online learning of eigenvectors},\n\tauthor       = {Garber, Dan and Hazan, Elad and Ma, Tengyu},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},\n\tpages        = {560--568}\n}\n@inproceedings{GarberHJKMNS16,\n\ttitle        = {Faster Eigenvector Computation via Shift-and-Invert Preconditioning},\n\tauthor       = {Dan Garber and Elad Hazan and Chi Jin and Sham M. Kakade and Cameron Musco and Praneeth Netrapalli and Aaron Sidford},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 33nd International Conference on Machine Learning, {ICML} 2016, New York City, NY, USA, June 19-24, 2016},\n\tpages        = {2626--2634},\n\turl          = {http://jmlr.org/proceedings/papers/v48/garber16.html},\n\tcrossref     = {DBLP:conf/icml/2016},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:16 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/GarberHJKMNS16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{garder2019qaforamt,\n\ttitle        = {Question Answering is a Format; When is it Useful?},\n\tauthor       = {Matt Gardner and Jonathan Berant and Hannaneh Hajishirzi and Alon Talmor and Sewon Min},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.11291}\n}\n@article{gardiner2016security,\n\ttitle        = {On the Security of Machine Learning in Malware {C\\&C} Detection: A Survey},\n\tauthor       = {Joseph Gardiner and Shishir Nagaraja},\n\tyear         = 2016,\n\tjournal 
     = {ACM Computing Surveys (CSUR)},\n\tvolume       = 49,\n\tnumber       = 3\n}\n@inproceedings{gardner2014incorporating,\n\ttitle        = {Incorporating vector space similarity in random walk inference over knowledge bases},\n\tauthor       = {Matt Gardner and Partha Talukdar and Jayant Krishnamurthy and Tom Mitchell},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{gardner2017open,\n\ttitle        = {Open-Vocabulary Semantic Parsing with both Distributional Statistics and Formal Knowledge},\n\tauthor       = {Matt Gardner and Jayant Krishnamurthy},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{gardner2018allennlp,\n\ttitle        = {{AllenNLP}: A deep semantic natural language processing platform},\n\tauthor       = {Matt Gardner and Joel Grus and Mark Neumann and Oyvind Tafjord and Pradeep Dasigi and Nelson Liu and Matthew Peters and Michael Schmitz and Luke Zettlemoyer},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.07640}\n}\n@article{gardner2020contrast,\n\ttitle        = {Evaluating {NLP} Models via Contrast Sets},\n\tauthor       = {Matt Gardner and Yoav Artzi and Victoria Basmova and Jonathan Berant and Ben Bogin and Sihao Chen and Pradeep Dasigi and Dheeru Dua and Yanai Elazar and Ananth Gottumukkala and Nitish Gupta and Hanna Hajishirzi and Gabriel Ilharco and Daniel Khashabi and Kevin Lin and Jiangming Liu and Nelson F. Liu and Phoebe Mulcaire and Qiang Ning and Sameer Singh and Noah A. Smith and Sanjay Subramanian and Reut Tsarfaty and Eric Wallace and Ally Zhang and Ben Zhou},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.02709}\n}\n@inproceedings{garg2014on,\n\ttitle        = {On Communication Cost of Distributed Statistical Estimation and Dimensionality},\n\tauthor       = {Ankit Garg and Tengyu Ma and Huy L. 
Nguyen},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS), 2014},\n\turl          = {http://papers.nips.cc/paper/5442-on-communication-cost-of-distributed-statistical-estimation-and-dimensionality},\n\tcrossref     = {DBLP:conf/nips/2014},\n\ttimestamp    = {Wed, 10 Dec 2014 21:34:12 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/GargMN14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{garg2017extractor,\n\ttitle        = {Extractor-Based Time-Space Lower Bounds for Learning},\n\tauthor       = {Sumegha Garg and Ran Raz and Avishay Tal},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{garg2018robust,\n\ttitle        = {A Spectral View of Adversarially Robust Features},\n\tauthor       = {Shivam Garg and Vatsal Sharan and Brian Hu Zhang and Gregory Valiant},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{garg2019counterfactual,\n\ttitle        = {Counterfactual fairness in text classification through robustness},\n\tauthor       = {Sahaj Garg and Vincent Perot and Nicole Limtiaco and Ankur Taly and Ed H Chi and Alex Beutel},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {219--226}\n}\n@inproceedings{garg2020tanda,\n\ttitle        = {{TANDA}: Transfer and Adapt Pre-Trained Transformer Models for Answer Sentence Selection},\n\tauthor       = {Siddhant Garg and Thuy Vu and Alessandro Moschitti},\n\tyear         = 2020,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{garg2020unified,\n\ttitle        = {A Unified View of Label Shift Estimation},\n\tauthor       = {Saurabh Garg and Yifan Wu and Sivaraman Balakrishnan and Zachary C Lipton},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.07554}\n}\n@article{GargK2007,\n\ttitle        = 
{{Faster and Simpler Algorithms for Multicommodity Flow and Other Fractional Packing Problems}},\n\tauthor       = {Garg, Naveen and K\\\"{o}nemann, Jochen},\n\tyear         = 2007,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {IEEE Comput. Soc},\n\tvolume       = 37,\n\tnumber       = 2,\n\tpages        = {630--652},\n\tdoi          = {10.1137/S0097539704446232},\n\tisbn         = {0-8186-9172-7},\n\tissn         = {0097-5397},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Garg, K\\\"{o}nemann - 2007 - Faster and Simpler Algorithms for Multicommodity Flow and Other Fractional Packing Problems.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP,Algorithms/Multiplicative Weight/Flow}\n}\n@book{garofalakis2009data,\n\ttitle        = {Data Stream Management: Processing High-Speed Data Streams},\n\tauthor       = {Minos Garofalakis and Johannes Gehrke and Rajeev Rastogi},\n\tyear         = 2009,\n\tpublisher    = {Springer},\n\tisbn         = 9783540286073,\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{garrette2013learning,\n\ttitle        = {Learning a Part-of-Speech Tagger from Two Hours of Annotation},\n\tauthor       = {Dan Garrette and Jason Baldridge},\n\tyear         = 2013,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {138--147}\n}\n@inproceedings{gartner2009coresets,\n\ttitle        = {Coresets for polytope distance},\n\tauthor       = {G{\\\"a}rtner, Bernd and Jaggi, Martin},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 25th annual symposium on computational geometry},\n\tpages        = {33--42},\n\torganization = {ACM}\n}\n@inproceedings{GCY92,\n\ttitle        = {One sense per discourse},\n\tauthor       = {W. A. Gale and K. W. Church and D. 
Yarowsky},\n\tyear         = 1992,\n\tbooktitle    = {4th DARPA Speech and Natural Language Workshop}\n}\n@inproceedings{Ge,\n\ttitle        = {Learning topic models--going beyond SVD},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Moitra, Ankur},\n\tyear         = 2012,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2012 IEEE 53rd Annual Symposium on},\n\tpages        = {1--10},\n\torganization = {IEEE},\n\tfile         = {:..\\\\Originals\\\\LDA.pdf:PDF},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.26}\n}\n@inproceedings{ge05scissor,\n\ttitle        = {A Statistical Semantic Parser that Integrates Syntax and Semantics},\n\tauthor       = {Ruifang Ge and Raymond J. Mooney},\n\tyear         = 2005,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {9--16}\n}\n@article{ge2015decomposing,\n\ttitle        = {Decomposing Overcomplete 3rd Order Tensors using Sum-of-Squares Algorithms},\n\tauthor       = {Ge, Rong and Ma, Tengyu},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1504.05287}\n}\n@inproceedings{ge2015escaping,\n\ttitle        = {Escaping from saddle points—online stochastic gradient for tensor decomposition},\n\tauthor       = {Ge, Rong and Huang, Furong and Jin, Chi and Yuan, Yang},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory},\n\tseries       = {COLT 2015},\n\tpages        = {797--842}\n}\n@inproceedings{ge2016efficient,\n\ttitle        = {Efficient Algorithms for Large-scale Generalized Eigenvector Computation and Canonical Correlation Analysis},\n\tauthor       = {Ge, Rong and Jin, Chi and Kakade, Sham M and Netrapalli, Praneeth and Sidford, Aaron},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{ge2016matrix,\n\ttitle        = {Matrix completion has no spurious local minimum},\n\tauthor       = {Ge, Rong and Lee, Jason D and Ma, Tengyu},\n\tyear         = 2016,\n\tjournal      = {Neural Information Processing Systems 
(NIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2973--2981}\n}\n@article{ge2016on,\n\ttitle        = {{On the optimization landscape of tensor decomposition}},\n\tauthor       = {Rong Ge and Tengyu Ma},\n\tyear         = 2016,\n\tjournal      = {manuscript},\n\tkeywords     = {Statistics - Machine Learning, Computer Science - Learning, Mathematics - Optimization and Control},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2016arXiv160507110K},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{ge2017learning,\n\ttitle        = {Learning One-hidden-layer Neural Networks with Landscape Design},\n\tauthor       = {Ge, Rong and Lee, Jason D and Ma, Tengyu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.00501}\n}\n@article{ge2017neural,\n\ttitle        = {Learning One-hidden-layer Neural Networks with Landscape Design},\n\tauthor       = {Rong Ge and Jason D. Lee and Tengyu Ma},\n\tyear         = 2017,\n\tbooktitle    = {ICLR},\n\tpublisher    = {manuscript},\n\turl          = {http://arxiv.org/abs/1711.00501}\n}\n@article{ge2017no,\n\ttitle        = {No Spurious Local Minima in Nonconvex Low Rank Problems: A Unified Geometric Analysis},\n\tauthor       = {Ge, Rong and Jin, Chi and Zheng, Yi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.00708}\n}\n@article{ge2017on,\n\ttitle        = {{On the Optimization Landscape of Tensor Decompositions}},\n\tauthor       = {{Ge}, R. 
and {Ma}, T.},\n\tyear         = 2017,\n\tmonth        = jun,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Springer},\n\tpages        = {3653--3663},\n\tarchiveprefix = {arXiv},\n\teprint       = {1706.05598},\n\tprimaryclass = {cs.LG},\n\tkeywords     = {Computer Science - Learning, Computer Science - Data Structures and Algorithms, Mathematics - Optimization and Control, Mathematics - Probability, Statistics - Machine Learning},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2017arXiv170605598G},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{ge2018simulated,\n\ttitle        = {Simulated tempering {Langevin Monte Carlo II}: An improved proof using soft {Markov} chain decomposition},\n\tauthor       = {Ge, Rong and Lee, Holden and Risteski, Andrej},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.00793}\n}\n@article{ge2019step,\n\ttitle        = {{The Step Decay Schedule: A Near Optimal, Geometrically Decaying Learning Rate Procedure}},\n\tauthor       = {{Ge}, Rong and {Kakade}, Sham M. 
and {Kidambi}, Rahul and {Netrapalli}, Praneeth},\n\tyear         = 2019,\n\tmonth        = apr,\n\tjournal      = {arXiv e-prints},\n\tpages        = {arXiv:1904.12838},\n\tkeywords     = {Computer Science - Machine Learning, Mathematics - Optimization and Control, Statistics - Machine Learning},\n\teid          = {arXiv:1904.12838},\n\tarchiveprefix = {arXiv},\n\teprint       = {1904.12838},\n\tprimaryclass = {cs.LG},\n\tadsurl       = {https://ui.adsabs.harvard.edu/abs/2019arXiv190412838G},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{gebru2018datasheets,\n\ttitle        = {Datasheets for Datasets},\n\tauthor       = {Timnit Gebru and Jamie Morgenstern and Briana Vecchione and Jennifer Wortman Vaughan and Hanna Wallach and Hal Daumé III and Kate Crawford},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.09010}\n}\n@book{geer2000empirical,\n\ttitle        = {Empirical Processes in M-estimation},\n\tauthor       = {Van de Geer, Sara},\n\tyear         = 2000,\n\tpublisher    = {Cambridge University Press}\n}\n@inproceedings{GeHJY15,\n\ttitle        = {Escaping From Saddle Points - Online Stochastic Gradient for Tensor Decomposition},\n\tauthor       = {Rong Ge and Furong Huang and Chi Jin and Yang Yuan},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of The 28th Conference on Learning Theory, {COLT} 2015, Paris, France, July 3-6, 2015},\n\tpages        = {797--842}\n}\n@inproceedings{GeHK15,\n\ttitle        = {Learning Mixtures of Gaussians in High Dimensions},\n\tauthor       = {Rong Ge and Qingqing Huang and Sham M. 
Kakade},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Forty-Seventh Annual {ACM} on Symposium on Theory of Computing, {STOC} 2015, Portland, OR, USA, June 14-17, 2015},\n\tpages        = {761--770},\n\tdoi          = {10.1145/2746539.2746616},\n\turl          = {http://doi.acm.org/10.1145/2746539.2746616},\n\tcrossref     = {DBLP:conf/stoc/2015},\n\ttimestamp    = {Wed, 10 Jun 2015 17:20:57 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/stoc/GeHK15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tpp           = {761-770}\n}\n@article{gehman2020realtoxicityprompts,\n\ttitle        = {Realtoxicityprompts: Evaluating neural toxic degeneration in language models},\n\tauthor       = {Samuel Gehman and Suchin Gururangan and Maarten Sap and Yejin Choi and Noah A Smith},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.11462}\n}\n@article{gehring2017convolutional,\n\ttitle        = {Convolutional Sequence to Sequence Learning},\n\tauthor       = {Jonas Gehring and Michael Auli and David Grangier and Denis Yarats and Yann N Dauphin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.03122}\n}\n@inproceedings{gehrmann2018bottom,\n\ttitle        = {Bottom-Up Abstractive Summarization},\n\tauthor       = {Sebastian Gehrmann and Yuntian Deng and Alexander M. 
Rush},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{geifman2017selective,\n\ttitle        = {Selective classification for deep neural networks},\n\tauthor       = {Yonatan Geifman and Ran El-Yaniv},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{geifman2018bias,\n\ttitle        = {Bias-Reduced Uncertainty Estimation for Deep Neural Classifiers},\n\tauthor       = {Yonatan Geifman and Guy Uziel and Ran El-Yaniv},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{geifman2019selectivenet,\n\ttitle        = {SelectiveNet: A Deep Neural Network with an Integrated Reject Option},\n\tauthor       = {Yonatan Geifman and Ran El-Yaniv},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{geiger01stratified,\n\ttitle        = {Stratified exponential families: graphical models and model selection},\n\tauthor       = {Dan Geiger and David Heckerman and Henry King and Christopher Meek},\n\tyear         = 2001,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 29,\n\tpages        = {505--529}\n}\n@inproceedings{geiger05structured,\n\ttitle        = {Structured Variational Inference Procedures and their Realizations},\n\tauthor       = {Dan Geiger and Christopher Meek},\n\tyear         = 2005,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{geiger2012kitti,\n\ttitle        = {Are we ready for autonomous driving? 
{T}he {KITTI} vision benchmark suite},\n\tauthor       = {Andreas Geiger and Philip Lenz and Raquel Urtasun},\n\tyear         = 2012,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3354--3361}\n}\n@inproceedings{geiger2019posing,\n\ttitle        = {Posing Fair Generalization Tasks for Natural Language Inference},\n\tauthor       = {Atticus Geiger and Ignacio Cases and Lauri Karttunen and Chris Potts},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{geirhos2018generalisation,\n\ttitle        = {Generalisation in humans and deep neural networks},\n\tauthor       = {Robert Geirhos and Carlos RM Temme and Jonas Rauber and Heiko H Sch{\\\"u}tt and Matthias Bethge and Felix A Wichmann},\n\tyear         = 2018,\n\tjournal      = {Advances in neural information processing systems},\n\tvolume       = 31,\n\tpages        = {7538--7550}\n}\n@article{geirhos2020shortcut,\n\ttitle        = {Shortcut Learning in Deep Neural Networks},\n\tauthor       = {Robert Geirhos and J{\\\"o}rn-Henrik Jacobsen and Claudio Michaelis and Richard Zemel and Wieland Brendel and Matthias Bethge and Felix A Wichmann},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.07780}\n}\n@article{GeJKNS2016-CCA,\n\ttitle        = {{Efficient Algorithms for Large-scale Generalized Eigenvector Computation and Canonical Correlation Analysis}},\n\tauthor       = {Rong Ge and Chi Jin and Sham M. 
Kakade and Praneeth Netrapalli and Aaron Sidford},\n\tyear         = 2016,\n\tmonth        = apr,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1604.03930}\n}\n@article{gelman1992single,\n\ttitle        = {A single series from the {G}ibbs sampler provides a false sense of security},\n\tauthor       = {Andrew Gelman and Donald B Rubin},\n\tyear         = 1992,\n\tjournal      = {Bayesian statistics},\n\tvolume       = 4,\n\tpages        = {625--631}\n}\n@book{gelman1995bayesian,\n\ttitle        = {Bayesian data analysis},\n\tauthor       = {Andrew Gelman and John B Carlin and Hal S Stern and Donald B Rubin},\n\tyear         = 1995,\n\tpublisher    = {Chapman and Hall/CRC}\n}\n@article{gelman1998simulating,\n\ttitle        = {Simulating normalizing constants: From importance sampling to bridge sampling to path sampling},\n\tauthor       = {A Gelman and XL Meng},\n\tyear         = 1998,\n\tjournal      = {Statistical science},\n\tvolume       = 13,\n\tnumber       = 2,\n\tpages        = {163--185}\n}\n@inproceedings{gelman2007police,\n\ttitle        = {An Analysis of the New York City Police Department’s “Stop-and-Frisk” Policy in the Context of Claims of Racial Bias},\n\tauthor       = {Andrew Gelman and Jeffrey Fagan and Alex Kiss},\n\tyear         = 2007,\n\tbooktitle    = {Journal of the American Statistical Association}\n}\n@article{geman1984stochastic,\n\ttitle        = {Stochastic relaxation, {G}ibbs distributions, and the {B}ayesian restoration of images},\n\tauthor       = {Stuart Geman and Donald Geman},\n\tyear         = 1984,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 6,\n\tpages        = {721--741}\n}\n@inproceedings{gemulla2011large,\n\ttitle        = {Large-scale matrix factorization with distributed stochastic gradient descent},\n\tauthor       = {Gemulla, Rainer and Nijkamp, Erik and Haas, Peter J and Sismanis, Yannis},\n\tyear         = 
2011,\n\tbooktitle    = {Proceedings of the 17th ACM SIGKDD international conference on Knowledge discovery and data mining},\n\tpages        = {69--77},\n\torganization = {ACM}\n}\n@article{GeneticAlgorithm1950,\n\ttitle        = {Computing machinery and intelligence},\n\tauthor       = {Turing, Alan M.},\n\tyear         = 1950,\n\tjournal      = {Mind},\n\tpublisher    = {JSTOR},\n\tpages        = {433--460}\n}\n@inproceedings{gentile2014online,\n\ttitle        = {Online clustering of bandits},\n\tauthor       = {Gentile, Claudio and Li, Shuai and Zappella, Giovanni},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {757--765}\n}\n@book{gerritsen2015addiction,\n\ttitle        = {Self-report of Smoking Cessation in Cardiac Patients},\n\tauthor       = {Marielle Gerritsen and Nadine Berndt and Lilian Lechner and Hein de Vries and Aart Mudde and Catherine Bolmand},\n\tyear         = 2015,\n\tpublisher    = {Journal of Addiction Medicine},\n\tpages        = {308--316}\n}\n@inproceedings{gershman2015phrase,\n\ttitle        = {Phrase similarity in humans and machines},\n\tauthor       = {Samuel J. Gershman and Joshua B. Tenenbaum},\n\tyear         = 2015,\n\tbooktitle    = {CogSci}\n}\n@inproceedings{geva2018long,\n\ttitle        = {Learning to Search in Long Documents using Document Structure},\n\tauthor       = {Mor Geva and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{geva2019annotator,\n\ttitle        = {Are We Modeling the Task or the Annotator? 
An Investigation of Annotator Bias in Natural Language Understanding Datasets},\n\tauthor       = {Mor Geva and Yoav Goldberg and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{geva2019discofuse,\n\ttitle        = {Disco{F}use: A Large-Scale Dataset for Discourse-based Sentence Fusion},\n\tauthor       = {Mor Geva and Eric Malmi and Idan Szpektor and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{geva2020injecting,\n\ttitle        = {Injecting Numerical Reasoning Skills into Language Models},\n\tauthor       = {Mor Geva and Ankit Gupta and Jonathan Berant},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{geyer1995annealing,\n\ttitle        = {Annealing {M}arkov chain {M}onte {C}arlo with applications to ancestral inference},\n\tauthor       = {Charles J. Geyer and Elizabeth A. 
Thompson},\n\tyear         = 1995,\n\tjournal      = {Journal of the American Statistical Association},\n\tvolume       = 90,\n\tpages        = {909--920}\n}\n@article{ghadimilan,\n\ttitle        = {Stochastic first-and zeroth-order methods for nonconvex stochastic programming},\n\tauthor       = {Ghadimi, Saeed and Lan, Guanghui},\n\tyear         = 2013,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 23,\n\tnumber       = 4,\n\tpages        = {2341--2368}\n}\n@article{GhadimiLan2015,\n\ttitle        = {{Accelerated gradient methods for nonconvex nonlinear and stochastic programming}},\n\tauthor       = {Ghadimi, Saeed and Lan, Guanghui},\n\tyear         = 2015,\n\tmonth        = feb,\n\tjournal      = {Mathematical Programming},\n\tpages        = {1--26},\n\tdoi          = {10.1007/s10107-015-0871-8},\n\tissn         = {0025-5610},\n\turl          = {http://arxiv.org/abs/1310.3787 http://link.springer.com/10.1007/s10107-015-0871-8},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1310.3787},\n\teprint       = {1310.3787},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Ghadimi, Lan - 2013 - Accelerated gradient methods for nonconvex nonlinear and stochastic programming.pdf:pdf},\n\tkeywords     = {62l20,68q25,90c15,90c25,accelerated gradient,ams 2000 subject classification,complexity,nonconvex optimization,stochastic programming},\n\tmendeley-groups = {Optimization/Gradient Descent Theory/Nonconvex,Optimization/Non-Convex}\n}\n@inproceedings{ghafoorian2017transfer,\n\ttitle        = {Transfer learning for domain adaptation in mri: Application in brain lesion segmentation},\n\tauthor       = {Ghafoorian, Mohsen and Mehrtash, Alireza and Kapur, Tina and Karssemeijer, Nico and Marchiori, Elena and Pesteie, Mehran and Guttmann, Charles RG and de Leeuw, Frank-Erik and Tempany, Clare M and Van Ginneken, Bram and Wells III, William M.},\n\tyear         = 2017,\n\tbooktitle    = {International conference on medical image 
computing and computer-assisted intervention},\n\tpages        = {516--524},\n\torganization = {Springer}\n}\n@inproceedings{ghahramani1994supervised,\n\ttitle        = {Supervised learning from incomplete data via an {EM} approach},\n\tauthor       = {Zoubin Ghahramani and Michael I. Jordan},\n\tyear         = 1994,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Morgan Kaufmann Publishers, Inc.},\n\tvolume       = 6,\n\tpages        = {120--127},\n\turl          = {citeseer.ist.psu.edu/ghahramani94supervised.html},\n\teditor       = {Jack D. Cowan and Gerald Tesauro and Joshua Alspector},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@techreport{ghahramani1996parameter,\n\ttitle        = {Parameter Estimation for Linear Dynamical Systems},\n\tauthor       = {Ghahramani, Zoubin and Hinton, Geoffrey E.},\n\tyear         = 1996,\n\tmonth        = feb,\n\tnumber       = {CRG-TR-96-2},\n\tabstract     = {\n\t\tLinear systems have been used extensively in engineering to model\n\n\t\tand control the behavior of dynamical systems. In this note, we present\n\n\t\tthe Expectation Maximization (EM) algorithm for estimating the parameters\n\n\t\tof linear systems (Shumway and Stoffer, 1982). We also point out\n\n\t\tthe relationship between linear dynamical systems, factor analysis,\n\n\t\tand hidden Markov models. Introduction\n\n\t\tThe goal of this note is to introduce the EM algorithm for estimating\n\n\t\tthe parameters of linear dynamical systems...\n\t},\n\tkeywords     = {dynamical, linear, systems}\n}\n@inproceedings{ghahramani1999variational,\n\ttitle        = {Variational Inference for {B}ayesian Mixtures of Factor Analysers},\n\tauthor       = {Z. Ghahramani and M. 
J Beal},\n\tyear         = 1999,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{ghahramani97fhmm,\n\ttitle        = {Factorial Hidden {M}arkov Models},\n\tauthor       = {Zoubin Ghahramani and Michael Jordan},\n\tyear         = 1997,\n\tjournal      = {Machine Learning},\n\tvolume       = 29,\n\tpages        = {245--273}\n}\n@inproceedings{GharanTrevisan12,\n\ttitle        = {Approximating the Expansion Profile and Almost Optimal Local Graph Clustering},\n\tauthor       = {Gharan, Shayan Oveis and Trevisan, Luca},\n\tyear         = 2012,\n\tseries       = {FOCS},\n\tpages        = {187--196}\n}\n@inproceedings{ghassemi2019health,\n\ttitle        = {Practical guidance on artificial intelligence for health-care data},\n\tauthor       = {Marzyeh Ghassemi and Tristan Naumann and Peter Schulam and Andrew L Beam and Irene Y Chen and Rajesh Ranganath},\n\tyear         = 2019,\n\tbooktitle    = {The Lancet Digital Health},\n\tpages        = {157--159}\n}\n@inproceedings{ghavamzadeh2011finite,\n\ttitle        = {Finite-sample analysis of Lasso-TD},\n\tauthor       = {Ghavamzadeh, Mohammad and Lazaric, Alessandro and Munos, R{\\'e}mi and Hoffman, Matt},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{ghazvininejad2016poem,\n\ttitle        = {Generating Topical Poetry},\n\tauthor       = {Marjan Ghazvininejad and Xing Shi and Yejin Choi and Kevin Knight},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{ghazvininejad2019mask,\n\ttitle        = {Mask-{P}redict: Parallel decoding of conditional masked language models},\n\tauthor       = {Marjan Ghazvininejad and Omer Levy and Yinhan Liu and Luke Zettlemoyer},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{GhLI,\n\ttitle        = {Optimal stochastic approximation algorithms for strongly 
convex stochastic composite optimization {I}: {A} generic algorithmic framework},\n\tauthor       = {Ghadimi, Saeed and Lan, Guanghui},\n\tyear         = 2012,\n\tjournal      = {SIAM J. Optim.},\n\tvolume       = 22,\n\tnumber       = 4,\n\tpages        = {1469--1492},\n\tdoi          = {10.1137/110848864},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/110848864},\n\tfjournal     = {SIAM Journal on Optimization},\n\tmrclass      = {62L20 (68W25 90C15 90C25)},\n\tmrnumber     = 3023780\n}\n@article{GhLII,\n\ttitle        = {Optimal stochastic approximation algorithms for strongly convex stochastic composite optimization, {II}: {S}hrinking procedures and optimal algorithms},\n\tauthor       = {Ghadimi, Saeed and Lan, Guanghui},\n\tyear         = 2013,\n\tjournal      = {SIAM J. Optim.},\n\tvolume       = 23,\n\tnumber       = 4,\n\tpages        = {2061--2089},\n\tdoi          = {10.1137/110848876},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/110848876},\n\tfjournal     = {SIAM Journal on Optimization},\n\tmrclass      = {62L20 (68Q25 68W25 90C25)},\n\tmrnumber     = 3118261\n}\n@article{ghorbani2019data,\n\ttitle        = {Data Shapley: Equitable Valuation of Data for Machine Learning},\n\tauthor       = {Amirata Ghorbani and James Zou},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.02868}\n}\n@inproceedings{ghorbani2019limitations,\n\ttitle        = {Limitations of Lazy Training of Two-layers Neural Network},\n\tauthor       = {Ghorbani, Behrooz and Mei, Song and Misiakiewicz, Theodor and Montanari, Andrea},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {9108--9118}\n}\n@inproceedings{ghorbani2019towards,\n\ttitle        = {Towards automatic concept-based explanations},\n\tauthor       = {Amirata Ghorbani and James Wexler and James Y Zou and Been Kim},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural 
Information Processing Systems (NeurIPS)},\n\tpages        = {9277--9286}\n}\n@inproceedings{ghosh2011moderates,\n\ttitle        = {Who moderates the moderators?: crowdsourcing abuse detection in user-generated content},\n\tauthor       = {Arpita Ghosh and Satyen Kale and Preston McAfee},\n\tyear         = 2011,\n\tbooktitle    = {12th ACM conference on Electronic commerce},\n\tpages        = {167--176}\n}\n@article{gibbs2003international,\n\ttitle        = {The international HapMap project},\n\tauthor       = {Gibbs, Richard A and Belmont, John W and Hardenbol, Paul and Willis, Thomas D and Yu, Fuli and Yang, Huanming and Ch'ang, Lan-Yang and Huang, Wei and Liu, Bin and Shen, Yan and others},\n\tyear         = 2003,\n\tjournal      = {Nature},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 426,\n\tnumber       = 6968,\n\tpages        = {789--796}\n}\n@inproceedings{gidaris2018rotation,\n\ttitle        = {Unsupervised Representation Learning by Predicting Image Rotations},\n\tauthor       = {Spyros Gidaris and Praveer Singh and Nikos Komodakis},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{gilbert2001surfing,\n\ttitle        = {\n\t\tSurfing Wavelets on Streams: One-Pass Summaries for Approximate Aggregate\n\n\t\tQueries\n\t},\n\tauthor       = {\n\t\tGilbert, Anna C. and Kotidis, Yannis and Muthukrishnan, S. and Strauss,\n\n\t\tMartin\n\t},\n\tyear         = 2001,\n\tbooktitle    = {\n\t\tProceedings of the 27th International Conference on Very Large Data\n\n\t\tBases\n\t},\n\tpublisher    = {Morgan Kaufmann Publishers Inc.},\n\taddress      = {San Francisco, CA, USA},\n\tseries       = {VLDB '01},\n\tpages        = {79--88},\n\tisbn         = {1-55860-804-4},\n\tacmid        = 672174,\n\tnumpages     = 10\n}\n@article{gildea02semantic,\n\ttitle        = {Automatic Labeling of Semantic Roles},\n\tauthor       = {D. Gildea and D. 
Jurafsky},\n\tyear         = 2002,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 28,\n\tpages        = {245--288}\n}\n@inproceedings{gildea99topic,\n\ttitle        = {Topic-based language models using {EM}},\n\tauthor       = {Dan Gildea and Thomas Hofmann},\n\tyear         = 1999,\n\tbooktitle    = {Eurospeech}\n}\n@inproceedings{giles2001overfitting,\n\ttitle        = {Overfitting in Neural Nets: Backpropagation, Conjugate Gradient, and Early Stopping},\n\tauthor       = {Giles, Rich Caruana Steve Lawrence Lee},\n\tyear         = 2001,\n\tbooktitle    = {Advances in Neural Information Processing Systems 13: Proceedings of the 2000 Conference},\n\tvolume       = 13,\n\tpages        = 402\n}\n@book{giles2008communication,\n\ttitle        = {Communication accommodation theory},\n\tauthor       = {Howard Giles},\n\tyear         = 2008,\n\tpublisher    = {Sage Publications, Inc}\n}\n@article{gilks1992adaptive,\n\ttitle        = {Adaptive rejection sampling for Gibbs sampling},\n\tauthor       = {Gilks, W. R. and Wild, P.},\n\tyear         = 1992,\n\tjournal      = {Applied Statistics},\n\tvolume       = 41,\n\tpages        = {337--348}\n}\n@article{gilks2001following,\n\ttitle        = {Following a Moving Target -- {M}onte {C}arlo Inference for Dynamic Bayesian Models},\n\tauthor       = {Gilks, Walter R. and Berzuini, Carlo},\n\tyear         = 2001,\n\tjournal      = {Journal of the Royal Statistical Society. 
Series B (Statistical Methodology)},\n\tvolume       = 63,\n\tnumber       = 1,\n\tpages        = {127--146}\n}\n@article{gill2002snopt,\n\ttitle        = {{SNOPT}: An {SQP} algorithm for large-scale constrained optimization},\n\tauthor       = {Philip E Gill and Walter Murray and Michael A Saunders},\n\tyear         = 2002,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {979--1006}\n}\n@inproceedings{gillick2019learning,\n\ttitle        = {Learning Dense Representations for Entity Retrieval},\n\tauthor       = {Daniel Gillick and Sayali Kulkarni and Larry Lansing and Alessandro Presta and Jason Baldridge and Eugene Ie and Diego Garcia-Olano},\n\tyear         = 2019,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@article{gillis2014and,\n\ttitle        = {The why and how of nonnegative matrix factorization},\n\tauthor       = {Gillis, Nicolas},\n\tyear         = 2014,\n\tjournal      = {Regularization, Optimization, Kernels, and Support Vector Machines},\n\tpublisher    = {Chapman \\& Hall},\n\tvolume       = 12,\n\tnumber       = 257\n}\n@misc{GillisVavasis,\n\ttitle        = {Fast and Robust Recursive Algorithms for Separable Nonnegative Matrix Factorization},\n\tauthor       = {N. Gillis and S. 
Vavasis},\n\tyear         = 2012,\n\tnote         = {http://arxiv.org/abs/1208.1237}\n}\n@article{gilmer2018adversarial,\n\ttitle        = {Adversarial spheres},\n\tauthor       = {Justin Gilmer and Luke Metz and Fartash Faghri and Samuel S Schoenholz and Maithra Raghu and Martin Wattenberg and Ian Goodfellow},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.02774}\n}\n@book{GilSeguraTemme2007,\n\ttitle        = {{Numerical Methods for Special Functions}},\n\tauthor       = {Gil, Amparo and Segura, Javier and Temme, Nico M.},\n\tyear         = 2007,\n\tmonth        = jan,\n\tpublisher    = {Society for Industrial and Applied Mathematics},\n\tpages        = 405,\n\tdoi          = {10.1137/1.9780898717822},\n\tisbn         = {978-0-89871-634-4},\n\tissn         = {0029599X},\n\turl          = {http://epubs.siam.org/doi/abs/10.1137/1.9780898717822 http://epubs.siam.org/doi/book/10.1137/1.9780898717822},\n\tabstract     = {Special functions arise in many problems of pure and applied mathematics, mathematical statistics, physics, and engineering. This book provides an up-to-date overview of numerical methods for computing special functions and discusses when to use these methods depending on the function and the range of parameters. Not only are standard and simple parameter domains considered, but methods valid for large and complex parameters are described as well. The first part of the book (basic methods) covers convergent and divergent series, Chebyshev expansions, numerical quadrature, and recurrence relations. Its focus is on the computation of special functions; however, it is suitable for general numerical courses. Pseudoalgorithms are given to help students write their own algorithms. In addition to these basic tools, the authors discuss other useful and efficient methods, such as methods for computing zeros of special functions, uniform asymptotic expansions, Pad{\\'{e}} approximations, and sequence transformations. 
The book also provides specific algorithms for computing several special functions (like Airy functions and parabolic cylinder functions, among others).},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Gil, Segura, Temme - 2007 - Numerical Methods for Special Functions.pdf:pdf},\n\tmendeley-groups = {Books/Book-Optimization}\n}\n@inproceedings{gimpel2010softmax,\n\ttitle        = {Softmax-margin {CRF}s: Training log-linear models with cost functions},\n\tauthor       = {Kevin Gimpel and Noah A. Smith},\n\tyear         = 2010,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {733--736}\n}\n@inproceedings{gimpel2013systematic,\n\ttitle        = {A systematic exploration of diversity in machine translation},\n\tauthor       = {Kevin Gimpel and Dhruv Batra and Chris Dyer and Gregory Shakhnarovich},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1100--1111}\n}\n@article{ginosar2017portraits,\n\ttitle        = {A Century of Portraits: A Visual Historical Record of American High School Yearbooks},\n\tauthor       = {Shiry Ginosar and Kate Rakelly and Sarah M. Sachs and Brian Yin and Crystal Lee and Philipp Krähenbühl and Alexei A. 
Efros},\n\tyear         = 2017,\n\tjournal      = {IEEE Transactions on Computational Imaging},\n\tvolume       = 3\n}\n@inproceedings{ginzburg2015understanding,\n\ttitle        = {Understanding Laughter},\n\tauthor       = {Jonathan Ginzburg and Ellen Breithholtz and Robin Cooper and Julian Hough and Ye Tian},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 20th Amsterdam Colloquium}\n}\n@inproceedings{giordani09sql,\n\ttitle        = {Semantic Mapping Between Natural Language Questions and {SQL} Queries via Syntactic Pairing},\n\tauthor       = {Alessandra Giordani and Alessandro Moschitti},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Applications of Natural Language to Information Systems},\n\tpages        = {207--221}\n}\n@article{giordano2019higher,\n\ttitle        = {A Higher-Order {Swiss Army} Infinitesimal Jackknife},\n\tauthor       = {Ryan Giordano and Michael I Jordan and Tamara Broderick},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.12116}\n}\n@inproceedings{giordano2019swiss,\n\ttitle        = {A {Swiss Army} Infinitesimal Jackknife},\n\tauthor       = {Ryan Giordano and William Stephenson and Runjing Liu and Michael Jordan and Tamara Broderick},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {1139--1147}\n}\n@article{gislason2006landcover,\n\ttitle        = {Random Forests for land cover classification},\n\tauthor       = {Pall Oskar Gislason and Jon Atli Benediktsson and Johannes R. 
Sveinsson},\n\tyear         = 2006,\n\tjournal      = {Pattern Recognition Letters},\n\tvolume       = 27,\n\tnumber       = 4,\n\tpages        = {294--300}\n}\n@article{gissin2019implicit,\n\ttitle        = {The Implicit Bias of Depth: How Incremental Learning Drives Generalization},\n\tauthor       = {Gissin, Daniel and Shalev-Shwartz, Shai and Daniely, Amit},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.12051}\n}\n@misc{github2021repos,\n\ttitle        = {Repository search for public repositories},\n\tauthor       = {Github},\n\tyear         = 2021,\n\thowpublished = {\\url{https://github.com/search?q=is:public}}\n}\n@article{gittens2011spectral,\n\ttitle        = {The spectral norm error of the naive {Nystr{\\\"o}m} extension},\n\tauthor       = {Gittens, Alex},\n\tyear         = 2011,\n\tjournal      = {arXiv preprint arXiv:1110.5305}\n}\n@article{gittens2013revisiting,\n\ttitle        = {Revisiting the {Nystr{\\\"o}m} method for improved large-scale machine learning},\n\tauthor       = {Gittens, Alex and Mahoney, Michael W},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1303.1849}\n}\n@article{giulianelli2018under,\n\ttitle        = {Under the hood: Using diagnostic classifiers to investigate and improve how language models track agreement information},\n\tauthor       = {Giulianelli, Mario and Harding, Jack and Mohnert, Florian and Hupkes, Dieuwke and Zuidema, Willem},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.08079}\n}\n@inproceedings{gkkt17,\n\ttitle        = {Reliably learning the {R}e{LU} in polynomial time},\n\tauthor       = {Goel, Surbhi and Kanade, Varun and Klivans, Adam and Thaler, Justin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1611.10258},\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{gkm18,\n\ttitle        = {Learning One Convolutional Layer with Overlapping Patches},\n\tauthor       = {Goel, Surbhi and Klivans, Adam and 
Meka, Raghu},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.02547},\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {arXiv preprint arXiv:1802.02547}\n}\n@article{gladyshev1965stochastic,\n\ttitle        = {On Stochastic Approximation},\n\tauthor       = {G. E. Gladyshev},\n\tyear         = 1965,\n\tjournal      = {Theory of Probability and its Applications},\n\tvolume       = 10,\n\tnumber       = 2,\n\tpages        = {275--278}\n}\n@article{glasserman1995sensitivity,\n\ttitle        = {Sensitivity analysis for base-stock levels in multiechelon production-inventory systems},\n\tauthor       = {Glasserman, Paul and Tayur, Sridhar},\n\tyear         = 1995,\n\tjournal      = {Management Science},\n\tpublisher    = {INFORMS},\n\tvolume       = 41,\n\tnumber       = 2,\n\tpages        = {263--281}\n}\n@inproceedings{GLMY11,\n\ttitle        = {Large-Scale Community Detection on YouTube for Topic Discovery and Exploration},\n\tauthor       = {Ullas Gargi and Wenjun Lu and Vahab S. 
Mirrokni and Sangho Yoon},\n\tyear         = 2011,\n\tbooktitle    = {AAAI Conference on Weblogs and Social Media}\n}\n@inproceedings{globerson2006nightmare,\n\ttitle        = {Nightmare at test time: Robust learning by feature deletion},\n\tauthor       = {Amir Globerson and Sam Roweis},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {353--360}\n}\n@article{globerson2007euclidean,\n\ttitle        = {Euclidean Embedding of Co-occurrence Data},\n\tauthor       = {Globerson, Amir and Chechik, Gal and Pereira, Fernando and Tishby, Naftali},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research}\n}\n@inproceedings{globerson2007exponentiated,\n\ttitle        = {Exponentiated gradient algorithms for log-linear structured prediction},\n\tauthor       = {Globerson, Amir and Koo, Terry Y and Carreras, Xavier and Collins, Michael},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 24th international conference on Machine learning},\n\tpages        = {305--312},\n\torganization = {ACM}\n}\n@inproceedings{globerson2007fixing,\n\ttitle        = {Fixing max-product: Convergent message passing algorithms for {MAP} {LP}-relaxations},\n\tauthor       = {Amir Globerson and Tommi Jaakkola},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{glockner2018breaking,\n\ttitle        = {Breaking {NLI} Systems with Sentences that Require Simple Lexical Inferences},\n\tauthor       = {Max Glockner and Vered Shwartz and Yoav Goldberg},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{glorot2010understanding,\n\ttitle        = {Understanding the difficulty of training deep feedforward neural networks},\n\tauthor       = {Glorot, Xavier and Bengio, Yoshua},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the Thirteenth International Conference on 
Artificial Intelligence and Statistics},\n\tpages        = {249--256}\n}\n@inproceedings{glorot2011deep,\n\ttitle        = {Deep sparse rectifier neural networks},\n\tauthor       = {Xavier Glorot and Antoine Bordes and Yoshua Bengio},\n\tyear         = 2011,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {315--323}\n}\n@inproceedings{glorot2011domain,\n\ttitle        = {Domain adaptation for large-scale sentiment classification: A deep learning approach},\n\tauthor       = {Xavier Glorot and Antoine Bordes and Yoshua Bengio},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{GLPR12,\n\ttitle        = {The inverse moment problem for convex polytopes},\n\tauthor       = {N. Gravin and J. Lasserre and D. Pasechnik and S. Robins},\n\tyear         = 2012,\n\tjournal      = {Discrete and Computational Geometry},\n\tnote         = {To appear}\n}\n@misc{GM,\n\ttitle        = {On the Optimization Landscape of Tensor Decompositions},\n\tauthor       = {Rong Ge and Tengyu Ma},\n\tyear         = 2016,\n\talteditor    = {editor},\n\tdate         = {},\n\toptsubtitle  = {subtitle},\n\topttitleaddon = {titleaddon},\n\toptlanguage  = {language},\n\topthowpublished = {howpublished},\n\topttype      = {type},\n\toptversion   = {version},\n\toptnote      = {note},\n\toptorganization = {organization},\n\toptlocation  = {location},\n\toptdate      = {date},\n\toptmonth     = {month},\n\toptaddendum  = {addendum},\n\toptpubstate  = {pubstate},\n\toptdoi       = {doi},\n\topteprint    = {eprint},\n\topteprintclass = {eprintclass},\n\topteprinttype = {eprinttype},\n\topturl       = {url},\n\topturldate   = {urldate}\n}\n@inproceedings{gmh13,\n\ttitle        = {Speech recognition with deep recurrent neural networks},\n\tauthor       = {Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey},\n\tyear         = 2013,\n\tbooktitle    = {{IEEE} International Conference on Acoustics, Speech 
and Signal Processing (ICASSP)},\n\tpages        = {6645--6649},\n\torganization = {IEEE}\n}\n@article{GMMICA2013,\n\ttitle        = {{The More, the Merrier: the Blessing of Dimensionality for Learning Large Gaussian Mixtures}},\n\tauthor       = {J. Anderson and M. Belkin and N. Goyal and L. Rademacher and J. Voss},\n\tyear         = 2013,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1311.2891}\n}\n@inproceedings{GMS,\n\ttitle        = {Approximation of functions over redundant dictionaries using coherence},\n\tauthor       = {A. Gilbert and S. Muthukrishnan and M. Strauss},\n\tyear         = 2003,\n\tbooktitle    = {SODA}\n}\n@inproceedings{GN,\n\ttitle        = {Sparse representations in unions of bases},\n\tauthor       = {R. Gribonval and M. Nielsen},\n\tyear         = 2003,\n\tbooktitle    = {IEEE Transactions on Information Theory},\n\tpages        = {3320--3325}\n}\n@article{gneiting2005weather,\n\ttitle        = {Weather Forecasting with Ensemble Methods},\n\tauthor       = {Tilmann Gneiting and Adrian E. 
Raftery},\n\tyear         = 2005,\n\tjournal      = {Science},\n\tvolume       = 310\n}\n@article{gneiting2007probabilistic,\n\ttitle        = {Probabilistic forecasts, calibration and sharpness},\n\tauthor       = {Gneiting, Tilmann and Balabdaoui, Fadoua and Raftery, Adrian E},\n\tyear         = 2007,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 69,\n\tnumber       = 2,\n\tpages        = {243--268}\n}\n@article{gneiting2007strictly,\n\ttitle        = {Strictly proper scoring rules, prediction, and estimation},\n\tauthor       = {Gneiting, Tilmann and Raftery, Adrian E},\n\tyear         = 2007,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {Taylor \\& Francis},\n\tvolume       = 102,\n\tnumber       = 477,\n\tpages        = {359--378}\n}\n@article{godambe87quasi,\n\ttitle        = {Quasi-likelihood and Optimal estimation},\n\tauthor       = {V. P. Godambe and C. C. Heyde},\n\tyear         = 1987,\n\tjournal      = {International Statistical Review},\n\tvolume       = 55,\n\tpages        = {231--244}\n}\n@inproceedings{goel2016stopandfrisk,\n\ttitle        = {Precinct or Prejudice? Understanding Racial Disparities in New York City's Stop-and-Frisk Policy},\n\tauthor       = {Sharad Goel and Justin M. 
Rao and Ravi Shroff},\n\tyear         = 2016,\n\tbooktitle    = {The Annals of Applied Statistics}\n}\n@article{goel2017eigenvalue,\n\ttitle        = {Eigenvalue Decay Implies Polynomial-Time Learnability for Neural Networks},\n\tauthor       = {Goel, Surbhi and Klivans, Adam},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.03708}\n}\n@article{goel2017learning,\n\ttitle        = {Learning Depth-Three Neural Networks in Polynomial Time},\n\tauthor       = {Goel, Surbhi and Klivans, Adam},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.06010}\n}\n@article{goel2020model,\n\ttitle        = {Model Patching: Closing the Subgroup Performance Gap with Data Augmentation},\n\tauthor       = {Karan Goel and Albert Gu and Yixuan Li and Christopher R{\\'e}},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.06775}\n}\n@article{goemans1995improved,\n\ttitle        = {Improved approximation algorithms for maximum cut and satisfiability problems using semidefinite programming},\n\tauthor       = {M. Goemans and D. Williamson},\n\tyear         = 1995,\n\tjournal      = {Journal of the ACM (JACM)},\n\tvolume       = 42,\n\tnumber       = 6,\n\tpages        = {1115--1145}\n}\n@article{gogate2011samplesearch,\n\ttitle        = {SampleSearch: Importance sampling in presence of determinism},\n\tauthor       = {Gogate, Vibhav and Dechter, Rina},\n\tyear         = 2011,\n\tmonth        = feb,\n\tjournal      = {Artif. 
Intell.},\n\tpublisher    = {Elsevier Science Publishers Ltd.},\n\taddress      = {Essex, UK},\n\tvolume       = 175,\n\tnumber       = 2,\n\tpages        = {694--729},\n\tdoi          = {10.1016/j.artint.2010.10.009},\n\tissn         = {0004-3702},\n\tacmid        = 1924819,\n\tissue_date   = {February, 2011},\n\tkeywords     = {Approximate inference, Bayesian networks, Constraint satisfaction, Importance sampling, Markov chain Monte Carlo, Markov networks, Model counting, Probabilistic inference, Satisfiability},\n\tnumpages     = 36\n}\n@book{gohberg2006indefinite,\n\ttitle        = {Indefinite linear algebra and applications},\n\tauthor       = {Gohberg, Israel and Lancaster, Peter and Rodman, Leiba},\n\tyear         = 2006,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{gol70,\n\ttitle        = {A family of variable-metric methods derived by variational means},\n\tauthor       = {Goldfarb, Donald},\n\tyear         = 1970,\n\tjournal      = {Mathematics of computation},\n\tvolume       = 24,\n\tnumber       = 109,\n\tpages        = {23--26}\n}\n@article{Goldberg1998,\n\ttitle        = {Beyond the flow decomposition barrier},\n\tauthor       = {Goldberg, Andrew V. 
and Rao, Satish},\n\tyear         = 1998,\n\tmonth        = sep,\n\tjournal      = {Journal of the ACM},\n\tvolume       = 45,\n\tnumber       = 5,\n\tpages        = {783--797}\n}\n@inproceedings{goldberg2010efficient,\n\ttitle        = {An efficient algorithm for easy-first non-directional dependency parsing},\n\tauthor       = {Yoav Goldberg and Michael Elhadad},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {742--750}\n}\n@article{goldberg2013training,\n\ttitle        = {Training Deterministic Parsers with Non-Deterministic Oracles},\n\tauthor       = {Yoav Goldberg and Joakim Nivre},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1\n}\n@article{goldberg2016bounds,\n\ttitle        = {Bounds for the query complexity of approximate equilibria},\n\tauthor       = {Goldberg, Paul W and Roth, Aaron},\n\tyear         = 2016,\n\tjournal      = {ACM Transactions on Economics and Computation (TEAC)},\n\tpublisher    = {ACM},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = 24\n}\n@article{goldenshluger1997spatially,\n\ttitle        = {On spatially adaptive estimation of nonparametric regression},\n\tauthor       = {Alexander Goldenshluger and Arkadi Nemirovski},\n\tyear         = 1997,\n\tjournal      = {Mathematical Methods of Statistics},\n\tvolume       = 6,\n\tpages        = {135--170}\n}\n@inproceedings{goldman2018weakly,\n\ttitle        = {Weakly-supervised Semantic Parsing with Abstract Examples},\n\tauthor       = {Omer Goldman and Veronika Latcinnik and Udi Naveh and Amir Globerson and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{goldwasser11confidence,\n\ttitle        = {Confidence Driven Unsupervised Semantic Parsing},\n\tauthor       = {Dan Goldwasser and Roi Reichart and James Clarke and Dan Roth},\n\tyear         = 
2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1486--1495}\n}\n@inproceedings{goldwasser11instructions,\n\ttitle        = {Learning From Natural Instructions},\n\tauthor       = {Dan Goldwasser and Dan Roth},\n\tyear         = 2011,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {1794--1800}\n}\n@inproceedings{goldwater05interpolate,\n\ttitle        = {Interpolating Between Types and Tokens by Estimating Power-Law Generators},\n\tauthor       = {Sharon Goldwater and Tom Griffiths and Mark Johnson},\n\tyear         = 2005,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{goldwater06segmentation,\n\ttitle        = {Contextual Dependencies in Unsupervised Word Segmentation},\n\tauthor       = {Sharon Goldwater and Tom Griffiths and Mark Johnson},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{goldwater07pos,\n\ttitle        = {A Fully {B}ayesian Approach to Unsupervised Part-of-Speech Tagging},\n\tauthor       = {S. Goldwater and T. 
Griffiths},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{goldwater09segmentation,\n\ttitle        = {A {B}ayesian Framework for Word Segmentation: Exploring the Effects of Context},\n\tauthor       = {Sharon Goldwater and Tom Griffiths and Mark Johnson},\n\tyear         = 2009,\n\tjournal      = {Cognition},\n\tvolume       = 112,\n\tpages        = {21--54}\n}\n@inproceedings{golland2010pragmatics,\n\ttitle        = {A Game-theoretic Approach to Generating Spatial Descriptions},\n\tauthor       = {Dave Golland and Percy Liang and Dan Klein},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {410--419}\n}\n@inproceedings{golovin2010near,\n\ttitle        = {Near-optimal {B}ayesian active learning with noisy observations},\n\tauthor       = {Daniel Golovin and Andreas Krause and Debajyoti Ray},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {766--774}\n}\n@inproceedings{golowich2018size,\n\ttitle        = {Size-independent sample complexity of neural networks},\n\tauthor       = {Golowich, Noah and Rakhlin, Alexander and Shamir, Ohad},\n\tyear         = 2018,\n\tbooktitle    = {Conference On Learning Theory},\n\tpages        = {297--299},\n\torganization = {PMLR}\n}\n@book{Golub&VanLoan:book,\n\ttitle        = {Matrix Computations},\n\tauthor       = {G.H. Golub and C.F. Van Loan},\n\tyear         = 1990,\n\tpublisher    = {The Johns Hopkins University Press},\n\taddress      = {Baltimore, Maryland}\n}\n@book{golub1996matrix,\n\ttitle        = {Matrix computations (3rd ed.)},\n\tauthor       = {Golub, Gene H. 
and Van Loan, Charles F.},\n\tyear         = 1996,\n\tpublisher    = {Johns Hopkins University Press},\n\taddress      = {Baltimore, MD, USA},\n\tisbn         = {0801854148},\n\tciteulike-article-id = 2122238,\n\tciteulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=248979},\n\tciteulike-linkout-1 = {http://portal.acm.org/citation.cfm?id=248979},\n\tkeywords     = {algebra, book, computation, numerical},\n\towner        = {leili},\n\tposted-at    = {2008-03-30 22:15:25},\n\tpriority     = 2,\n\ttimestamp    = {2011.07.28}\n}\n@book{golub2012matrix,\n\ttitle        = {Matrix computations},\n\tauthor       = {Golub, Gene H. and Van Loan, Charles F.},\n\tyear         = 2012,\n\tpublisher    = {JHU Press},\n\tvolume       = 3,\n\tpages        = 784,\n\tisbn         = 1421407949,\n\tedition      = {4th},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Desktop/2013 Matrix Computations 4th.pdf:pdf},\n\tmendeley-groups = {Books/Book-Optimization}\n}\n@inproceedings{golub2017transfer,\n\ttitle        = {Two-Stage Synthesis Networks for Transfer Learning in Machine Comprehension},\n\tauthor       = {David Golub and Po-Sen Huang and Xiaodong He and Li Deng},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{gondek2012framework,\n\ttitle        = {A framework for merging and ranking of answers in {DeepQA}},\n\tauthor       = {D. C. Gondek and A. Lally and A. Kalyanpur and J. W. Murdock and P. A. Duboue and L. Zhang and Y. Pan and Z. M. Qiu and C. 
Welty},\n\tyear         = 2012,\n\tjournal      = {{IBM} Journal of Research and Development},\n\tvolume       = 56\n}\n@inproceedings{gong2012geodesic,\n\ttitle        = {Geodesic flow kernel for unsupervised domain adaptation},\n\tauthor       = {Gong, Boqing and Shi, Yuan and Sha, Fei and Grauman, Kristen},\n\tyear         = 2012,\n\tbooktitle    = {2012 IEEE Conference on Computer Vision and Pattern Recognition},\n\tpages        = {2066--2073},\n\torganization = {IEEE}\n}\n@inproceedings{gong2013reshaping,\n\ttitle        = {Reshaping Visual Datasets for Domain Adaptation},\n\tauthor       = {Boqing Gong and Kristen Grauman and Fei Sha},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{gong2018ruminating,\n\ttitle        = {Ruminating Reader: Reasoning with Gated Multi-Hop Attention},\n\tauthor       = {Yichen Gong and Samuel R. Bowman},\n\tyear         = 2018,\n\tbooktitle    = {Workshop on Machine Reading for Question Answering (MRQA)}\n}\n@inproceedings{gong2019context,\n\ttitle        = {Context-Sensitive Malicious Spelling Error Correction},\n\tauthor       = {Hongyu Gong and Yuchen Li and Suma Bhat and Pramod Viswanath},\n\tyear         = 2019,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {2771--2777}\n}\n@inproceedings{gonzalez2010google,\n\ttitle        = {Google fusion tables: web-centered data management and collaboration},\n\tauthor       = {Hector Gonzalez and Alon Y Halevy and Christian S Jensen and Anno Langen and Jayant Madhavan and Rebecca Shapley and Warren Shen and Jonathan Goldberg-Kidon},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 2010 ACM SIGMOD International Conference on Management of data},\n\tpages        = {1061--1066}\n}\n@inproceedings{goodfellow2013maxout,\n\ttitle        = {Maxout Networks},\n\tauthor       = {Ian Goodfellow and David Warde-farley and Mehdi Mirza and Aaron Courville and Yoshua Bengio},\n\tyear         = 
2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1319--1327}\n}\n@inproceedings{goodfellow2014gan,\n\ttitle        = {Generative Adversarial Nets},\n\tauthor       = {Ian J. Goodfellow and Jean Pouget-Abadie and Mehdi Mirza and Bing Xu and David Warde-Farley and Sherjil Ozair and Aaron Courville and Yoshua Bengio},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{goodfellow2014generative,\n\ttitle        = {Generative adversarial nets},\n\tauthor       = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},\n\tyear         = 2014,\n\tbooktitle    = {Advances in neural information processing systems}\n}\n@inproceedings{goodfellow2015explaining,\n\ttitle        = {Explaining and harnessing adversarial examples},\n\tauthor       = {Ian J Goodfellow and Jonathon Shlens and Christian Szegedy},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{goodfellow2016cleverhans,\n\ttitle        = {cleverhans v2.0.0: an adversarial machine learning library},\n\tauthor       = {Ian Goodfellow and Nicolas Papernot and Patrick McDaniel},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@book{goodfellow2016deep,\n\ttitle        = {Deep learning},\n\tauthor       = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},\n\tyear         = 2016,\n\tpublisher    = {MIT press}\n}\n@inproceedings{goodman08church,\n\ttitle        = {Church: a language for generative models},\n\tauthor       = {N. D. Goodman and V. K. Mansinghka and D. Roy and K. Bonawitz and J. B. Tenenbaum},\n\tyear         = 2008,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@article{goodman08rule,\n\ttitle        = {A rational analysis of rule-based concept learning},\n\tauthor       = {N. D. 
Goodman and J. B. Tenenbaum and J. Feldman and T. L. Griffiths},\n\tyear         = 2008,\n\tjournal      = {Cognitive Science},\n\tvolume       = 32,\n\tpages        = {108--154}\n}\n@article{goodman1963statistical,\n\ttitle        = {Statistical Analysis Based on a Certain Multivariate Complex Gaussian Distribution (An Introduction)},\n\tauthor       = {Goodman, N. R.},\n\tyear         = 1963,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 34,\n\tnumber       = 1,\n\tpages        = {152--177},\n\tcopyright    = {Copyright © 1963 Institute of Mathematical Statistics},\n\tjstor_articletype = {research-article},\n\tjstor_formatteddate = {Mar., 1963},\n\tlanguage     = {English},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{goodman2014concepts,\n\ttitle        = {Concepts in a probabilistic language of thought},\n\tauthor       = {Noah D Goodman and Joshua B Tenenbaum and Tobias Gerstenberg},\n\tyear         = 2014,\n\tbooktitle    = {The Conceptual Mind: New Directions in the Study of Concepts}\n}\n@book{goodman2015prob,\n\ttitle        = {Probabilistic Semantics and Pragmatics: Uncertainty in Language and Thought},\n\tauthor       = {Noah Goodman and Daniel Lassiter},\n\tyear         = 2015,\n\tpublisher    = {The Handbook of Contemporary Semantic Theory, 2nd Edition Wiley-Blackwell}\n}\n@article{goodman2016eu,\n\ttitle        = {European Union regulations on algorithmic decision-making and a ``right to explanation''},\n\tauthor       = {Bryce Goodman and Seth Flaxman},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.08813}\n}\n@inproceedings{goodman2016noise,\n\ttitle        = {Noise reduction and targeted exploration in imitation learning for abstract meaning representation parsing},\n\tauthor       = {James Goodman and Andreas Vlachos and Jason Naradowsky},\n\tyear         = 2016,\n\tbooktitle    = 
{Association for Computational Linguistics (ACL)}\n}\n@article{goodman2016pragmatic,\n\ttitle        = {Pragmatic language interpretation as probabilistic inference},\n\tauthor       = {Noah D Goodman and Michael C Frank},\n\tyear         = 2016,\n\tjournal      = {Trends in Cognitive Sciences},\n\tvolume       = 20,\n\tnumber       = 11,\n\tpages        = {818--829}\n}\n@article{goodpaster2006loss,\n\ttitle        = {The loss of skeletal muscle strength, mass, and quality in older adults: the health, aging and body composition study},\n\tauthor       = {Bret H Goodpaster and Seok Won Park and Tamara B Harris and Steven B Kritchevsky and Michael Nevitt and Ann V Schwartz and Eleanor M Simonsick and Frances A Tylavsky and Marjolein Visser and Anne B Newman},\n\tyear         = 2006,\n\tjournal      = {The Journals of Gerontology Series A: Biological Sciences and Medical Sciences},\n\tvolume       = 61,\n\tnumber       = 10,\n\tpages        = {1059--1064}\n}\n@article{goodrich2013teleoperation,\n\ttitle        = {Teleoperation and Beyond for Assistive Humanoid Robots},\n\tauthor       = {M. A. Goodrich and J. Crandall and E. Barakova},\n\tyear         = 2013,\n\tjournal      = {Reviews of Human Factors and Ergonomics},\n\tvolume       = 9,\n\tpages        = {175--226}\n}\n@inproceedings{goodwin2020probing,\n\ttitle        = {Probing Linguistic Systematicity},\n\tauthor       = {Emily Goodwin and Koustuv Sinha and Timothy J. 
O'Donnell},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{googlenet15,\n\ttitle        = {Going deeper with convolutions},\n\tauthor       = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},\n\tyear         = 2015,\n\tmonth        = jun,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {1--9}\n}\n@inproceedings{gopalan2011domain,\n\ttitle        = {Domain adaptation for object recognition: An unsupervised approach},\n\tauthor       = {Raghuraman Gopalan and Ruonan Li and Rama Chellappa},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {999--1006}\n}\n@article{gopinath2016human,\n\ttitle        = {Human-in-the-loop optimization of shared autonomy in assistive robotics},\n\tauthor       = {Deepak Gopinath and Siddarth Jain and Brenna D Argall},\n\tyear         = 2016,\n\tjournal      = {IEEE Robotics and Automation Letters (RA-L)},\n\tvolume       = 2,\n\tpages        = {247--254}\n}\n@book{gordis1969reliability,\n\ttitle        = {The Inaccuracy in Using Interviews to Estimate Patient Reliability in Taking Medications at Home},\n\tauthor       = {Leon Gordis and Milton Markowitz and Abraham M. Lilienfeld},\n\tyear         = 1969,\n\tpublisher    = {Medical Care},\n\tpages        = {49--54}\n}\n@article{gordon1993novel,\n\ttitle        = {Novel approach to nonlinear/non-{G}aussian {B}ayesian state estimation},\n\tauthor       = {Neil J. Gordon and David J. Salmond and Adrian F. M. 
Smith},\n\tyear         = 1993,\n\tjournal      = {IEE Proceedings F (Radar and Signal Processing)},\n\tvolume       = 140,\n\tnumber       = 2,\n\tpages        = {107--113}\n}\n@inproceedings{gordon2018iqa,\n\ttitle        = {{IQA}: Visual Question Answering in Interactive Environments},\n\tauthor       = {Daniel Gordon and Aniruddha Kembhavi and Mohammad Rastegari and Joseph Redmon and Dieter Fox and Ali Farhadi},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{goreinov1997pseudo,\n\ttitle        = {Pseudo-skeleton approximations by matrices of maximal volume},\n\tauthor       = {Goreinov, Sergei A and Zamarashkin, Nikolai Leonidovich and Tyrtyshnikov, Evgenii Evgen'evich},\n\tyear         = 1997,\n\tjournal      = {Mathematical Notes},\n\tpublisher    = {Springer},\n\tvolume       = 62,\n\tnumber       = 4,\n\tpages        = {515--519}\n}\n@article{goreinov1997theory,\n\ttitle        = {A theory of pseudoskeleton approximations},\n\tauthor       = {Goreinov, Sergei A and Tyrtyshnikov, Eugene E and Zamarashkin, Nickolai L},\n\tyear         = 1997,\n\tjournal      = {Linear Algebra and Its Applications},\n\tpublisher    = {Elsevier},\n\tvolume       = 261,\n\tnumber       = 1,\n\tpages        = {1--21}\n}\n@article{gorman2011prosodylab,\n\ttitle        = {Prosodylab-aligner: A tool for forced alignment of laboratory speech},\n\tauthor       = {Kyle Gorman and Jonathan Howell and Michael Wagner},\n\tyear         = 2011,\n\tjournal      = {Canadian Acoustics},\n\tvolume       = 39,\n\tnumber       = 3,\n\tpages        = {192--193}\n}\n@article{gorniak07situated,\n\ttitle        = {Situated Language Understanding as Filtering Perceived Affordances},\n\tauthor       = {P. Gorniak and D. 
Roy},\n\tyear         = 2007,\n\tjournal      = {Cognitive Science},\n\tvolume       = 31,\n\tpages        = {197--231}\n}\n@article{gorniak2004grounded,\n\ttitle        = {Grounded semantic composition for visual scenes},\n\tauthor       = {Peter Gorniak and Deb Roy},\n\tyear         = 2004,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 21,\n\tpages        = {429--470}\n}\n@inproceedings{gorodissky2019white,\n\ttitle        = {White-to-Black: Efficient Distillation of Black-Box Adversarial Attacks},\n\tauthor       = {Or Gorodissky and Yotam Gil and Yoav Chai and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{govindaraju2013understanding,\n\ttitle        = {Understanding tables in context using standard {NLP} toolkits},\n\tauthor       = {Vidhya Govindaraju and Ce Zhang and Christopher R{\\'e}},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{gowal2018effectiveness,\n\ttitle        = {On the Effectiveness of Interval Bound Propagation for Training Verifiably Robust Models},\n\tauthor       = {Sven Gowal and Krishnamurthy Dvijotham and Robert Stanforth and Rudy Bunel and Chongli Qin and Jonathan Uesato and Timothy Mann and Pushmeet Kohli},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.12715}\n}\n@inproceedings{gowal2019scalable,\n\ttitle        = {Scalable Verified Training for Provably Robust Image Classification},\n\tauthor       = {Sven Gowal and Krishnamurthy Dvijotham and Robert Stanforth and Rudy Bunel and Chongli Qin and Jonathan Uesato and Relja Arandjelovi{\\'c} and Timothy Mann and Pushmeet Kohli},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{goyal2014fourier,\n\ttitle        = {Fourier PCA and robust tensor decomposition},\n\tauthor       = {Goyal, Navin and Vempala, 
Santosh and Xiao, Ying},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 46th Annual ACM Symposium on Theory of Computing},\n\tpages        = {584--593},\n\torganization = {ACM}\n}\n@article{goyal2017accurate,\n\ttitle        = {Accurate, large minibatch sgd: Training imagenet in 1 hour},\n\tauthor       = {Goyal, Priya and Doll{\\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.02677}\n}\n@inproceedings{goyal2017making,\n\ttitle        = {Making the {V} in {VQA} matter: Elevating the role of image understanding in Visual Question Answering},\n\tauthor       = {Yash Goyal and Tejas Khot and Douglas Summers-Stay and Dhruv Batra and Devi Parikh},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{goyal2018continuous,\n\ttitle        = {A continuous relaxation of beam search for end-to-end training of neural sequence models},\n\tauthor       = {Kartik Goyal and Graham Neubig and Chris Dyer and Taylor Berg-Kirkpatrick},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{goyal2019explaining,\n\ttitle        = {Explaining classifiers with causal concept effect ({CaCE})},\n\tauthor       = {Yash Goyal and Uri Shalit and Been Kim},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.07165}\n}\n@inproceedings{goyal2019shaping,\n\ttitle        = {Using Natural Language for Reward Shaping in Reinforcement Learning},\n\tauthor       = {Prasoon Goyal and Scott Niekum and Raymond J. 
Mooney},\n\tyear         = 2019,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{goyal2021self,\n\ttitle        = {Self-supervised pretraining of visual features in the wild},\n\tauthor       = {Goyal, Priya and Caron, Mathilde and Lefaudeux, Benjamin and Xu, Min and Wang, Pengchao and Pai, Vivek and Singh, Mannat and Liptchinsky, Vitaliy and Misra, Ishan and Joulin, Armand and others},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.01988}\n}\n@article{GPR67,\n\ttitle        = {The method of projections for finding the common point of convex sets},\n\tauthor       = {Gubin, LG and Polyak, BT and Raik, EV},\n\tyear         = 1967,\n\tjournal      = {USSR Comput. Math. Math. Phys.},\n\tpublisher    = {Elsevier},\n\tvolume       = 7,\n\tnumber       = 6,\n\tpages        = {1--24},\n\tfjournal     = {USSR Computational Mathematics and Mathematical Physics}\n}\n@inproceedings{graca08em,\n\ttitle        = {Expectation Maximization and Posterior Constraints},\n\tauthor       = {João Graça and Kuzman Ganchev and Ben Taskar},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {569--576}\n}\n@article{graetz2018education,\n\ttitle        = {Mapping local variation in educational attainment across Africa},\n\tauthor       = {Nicholas Graetz and Joseph Friedman and Aaron Osgood-Zimmerman and Roy Burstein and Molly H. Biehl and Chloe Shields and Jonathan F. Mosser and Daniel C. Casey and Aniruddha Deshpande and Lucas Earl and Robert C. Reiner and Sarah E. Ray and Nancy Fullman and Aubrey J. Levine and Rebecca W. Stubbs and Benjamin K. Mayala and Joshua Longbottom and Annie J. Browne and Samir Bhatt and Daniel J. Weiss and Peter W. Gething and Ali H. Mokdad and Stephen S. Lim and Christopher J. L. Murray and Emmanuela Gakidou and Simon I. 
Hay},\n\tyear         = 2018,\n\tjournal      = {Nature},\n\tvolume       = 555\n}\n@incollection{graf2005parallel,\n\ttitle        = {Parallel Support Vector Machines: The Cascade SVM},\n\tauthor       = {\n\t\tHans Peter {Graf} and Eric {Cosatto} and L\\'{e}on {Bottou} and Igor\n\n\t\t{Dourdanovic} and Vladimir {Vapnik}\n\t},\n\tyear         = 2005,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA},\n\tpages        = {521--528},\n\teditor       = {Lawrence K. Saul and Yair Weiss and {L\\'{e}on} Bottou}\n}\n@article{graf97predicate,\n\ttitle        = {Construction of abstract state graphs with {PVS}},\n\tauthor       = {Susanne Graf and Hassen Saidi},\n\tyear         = 1997,\n\tjournal      = {Computer Aided Verification},\n\tvolume       = 1254,\n\tpages        = {72--83}\n}\n@manual{graff2003gigawords,\n\ttitle        = {{E}nglish {G}igaword LDC2003T05},\n\tauthor       = {David Graff and Christopher Cieri},\n\tyear         = 2003\n}\n@inproceedings{grandvalet05entropy,\n\ttitle        = {Entropy Regularization},\n\tauthor       = {Yves Grandvalet and Yoshua Bengio},\n\tyear         = 2005,\n\tbooktitle    = {Semi-Supervised Learning}\n}\n@article{graves2013generating,\n\ttitle        = {Generating sequences with recurrent neural networks},\n\tauthor       = {Alex Graves},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1308.0850}\n}\n@article{GravinEtal:ConvexPolytopes,\n\ttitle        = {{The inverse moment problem for convex polytopes}},\n\tauthor       = {Nick Gravin and Jean Lasserre and Dmitrii Pasechnik and Sinai Robins},\n\tyear         = 2011,\n\tmonth        = jun,\n\tjournal      = {arXiv preprint arXiv:1106.5723}\n}\n@article{Gray05,\n\ttitle        = {Toeplitz and Circulant Matrices: A Review.},\n\tauthor       = {Gray, Robert M.},\n\tyear         = 2005,\n\tjournal      = {Foundations and Trends in Communications and Information Theory},\n\tvolume  
     = 2,\n\tnumber       = 3,\n\turl          = {http://dblp.uni-trier.de/db/journals/ftcit/ftcit2.html#Gray05},\n\tadded-at     = {2008-05-21T00:00:00.000+0200},\n\tbiburl       = {http://www.bibsonomy.org/bibtex/207697e274947ffbcce7cf3bff5b428b5/dblp},\n\tdate         = {2008-05-21},\n\tdescription  = {dblp},\n\tee           = {http://dx.doi.org/10.1561/0100000006},\n\tinterhash    = {46ba3a0286283541309110ada4316612},\n\tintrahash    = {07697e274947ffbcce7cf3bff5b428b5},\n\tkeywords     = {dblp},\n\ttimestamp    = {2008-05-22T11:44:22.000+0200},\n\tbdsk-url-1   = {http://dblp.uni-trier.de/db/journals/ftcit/ftcit2.html#Gray05}\n}\n@article{gray2019craftassist,\n\ttitle        = {CraftAssist: A Framework for Dialogue-enabled Interactive Agents},\n\tauthor       = {Jonathan Gray and Kavya Srinet and Yacine Jernite and Haonan Yu and Zhuoyuan Chen and Demi Guo and Siddharth Goyal and C. Lawrence Zitnick and Arthur Szlam},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.08584}\n}\n@inproceedings{graziani2018regression,\n\ttitle        = {Regression concept vectors for bidirectional explanations in histopathology},\n\tauthor       = {Mara Graziani and Vincent Andrearczyk and Henning M{\\\"u}ller},\n\tyear         = 2018,\n\tbooktitle    = {Understanding and Interpreting Machine Learning in Medical Image Computing Applications},\n\tpages        = {124--132}\n}\n@article{grechkin2017ezlearn,\n\ttitle        = {EZLearn: Exploiting Organic Supervision in Large-Scale Data Annotation},\n\tauthor       = {Maxim Grechkin and Hoifung Poon and Bill Howe},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.08600}\n}\n@inproceedings{green06biomedical,\n\ttitle        = {Generation of biomedical arguments for lay readers},\n\tauthor       = {Nancy Green},\n\tyear         = 2006,\n\tbooktitle    = {International Natural Language Generation Conference},\n\tpages        = {114--121}\n}\n@article{green1995reversible,\n\ttitle        = {Reversible 
jump {M}arkov chain {M}onte {C}arlo computation and {B}ayesian model determination},\n\tauthor       = {PJ Green},\n\tyear         = 1995,\n\tjournal      = {Biometrika},\n\tvolume       = 82,\n\tnumber       = 4,\n\tpages        = {711--732}\n}\n@inproceedings{green2013efficacy,\n\ttitle        = {The efficacy of human post-editing for language translation},\n\tauthor       = {Spence Green and Jeffrey Heer and Christopher D Manning},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)},\n\tpages        = {439--448}\n}\n@article{greenberg1996consistency,\n\ttitle        = {Consistency, redundancy, and implied equalities in linear systems},\n\tauthor       = {Harvey J Greenberg},\n\tyear         = 1996,\n\tjournal      = {Annals of Mathematics and Artificial Intelligence},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {37--83}\n}\n@inproceedings{greenberg1996insights,\n\ttitle        = {Insights into spoken language gleaned from phonetic transcription of the {S}witchboard corpus},\n\tauthor       = {Steven Greenberg and Joy Hollenback and Dan Ellis},\n\tyear         = 1996,\n\tbooktitle    = {International Conference on Spoken Language Processing (ICSLP)}\n}\n@inproceedings{greenewald2017action,\n\ttitle        = {Action centered contextual bandits},\n\tauthor       = {Kristjan Greenewald and Ambuj Tewari and Susan Murphy and Predrag Klasnja},\n\tyear         = 2017,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {5977--5985}\n}\n@article{greensmith2004variance,\n\ttitle        = {Variance reduction techniques for gradient estimates in reinforcement learning},\n\tauthor       = {Evan Greensmith and Peter L Bartlett and Jonathan Baxter},\n\tyear         = 2004,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 5,\n\tpages        = {1471--1530}\n}\n@inproceedings{grefenstette2011experimental,\n\ttitle        = {Experimental support for 
a categorical compositional distributional model of meaning},\n\tauthor       = {Edward Grefenstette and Mehrnoosh Sadrzadeh},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1394--1404}\n}\n@article{grefenstette2013calculus,\n\ttitle        = {Towards a formal distributional semantics: Simulating logical calculi with tensors},\n\tauthor       = {Edward Grefenstette},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1304.5823}\n}\n@inproceedings{grefenstette2014deep,\n\ttitle        = {A Deep Architecture for Semantic Parsing},\n\tauthor       = {Edward Grefenstette and Phil Blunsom and Nando de Freitas and Karl Moritz Hermann},\n\tyear         = 2014,\n\tbooktitle    = {ACL Workshop on Semantic Parsing},\n\tpages        = {22--27}\n}\n@article{grefenstette2019higher,\n\ttitle        = {Generalized Inner Loop Meta-Learning},\n\tauthor       = {Edward Grefenstette and Brandon Amos and Denis Yarats and Phu Mon Htut and Artem Molchanov and F. 
Meier and Douwe Kiela and Kyunghyun Cho and Soumith Chintala},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.01727}\n}\n@article{gregor2016variational,\n\ttitle        = {Variational intrinsic control},\n\tauthor       = {Karol Gregor and Danilo Jimenez Rezende and Daan Wierstra},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.07507}\n}\n@article{greiner2002learning,\n\ttitle        = {Learning cost-sensitive active classifiers},\n\tauthor       = {Russell Greiner and Adam J Grove and Dan Roth},\n\tyear         = 2002,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 139,\n\tnumber       = 2,\n\tpages        = {137--174}\n}\n@article{greiner2011causal,\n\ttitle        = {Causal effects of perceived immutable characteristics},\n\tauthor       = {D James Greiner and Donald B Rubin},\n\tyear         = 2011,\n\tjournal      = {Review of Economics and Statistics},\n\tvolume       = 93,\n\tnumber       = 3,\n\tpages        = {775--785}\n}\n@inproceedings{grenager05segmentation,\n\ttitle        = {Unsupervised learning of field segmentation models for information extraction},\n\tauthor       = {Trond Grenager and Dan Klein and Christopher D. Manning},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {371--378}\n}\n@inproceedings{grenager06verb,\n\ttitle        = {Unsupervised discovery of a statistical verb lexicon},\n\tauthor       = {Trond Grenager and Christopher D. 
Manning},\n\tyear         = 2006,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{gretton2006kernel,\n\ttitle        = {A kernel method for the two-sample-problem},\n\tauthor       = {Gretton, Arthur and Borgwardt, Karsten M and Rasch, Malte and Sch{\\\"o}lkopf, Bernhard and Smola, Alex J},\n\tyear         = 2006,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {513--520}\n}\n@incollection{gretton2008covariate,\n\ttitle        = {Covariate Shift by Kernel Mean Matching},\n\tauthor       = {Arthur Gretton and Alex Smola and Jiayuan Huang and Marcel Schmittfull and Karsten Borgwardt and Bernhard Sch{\\\"o}lkopf},\n\tyear         = 2008,\n\tbooktitle    = {Dataset Shift in Machine Learning}\n}\n@article{gretton2012kernel,\n\ttitle        = {A Kernel Two-Sample Test},\n\tauthor       = {Gretton, Arthur and Borgwardt, Karsten M. and Rasch, Malte J. and Sch\\\"{o}lkopf, Bernhard and Smola, Alexander},\n\tyear         = 2012,\n\tmonth        = mar,\n\tjournal      = {J. Mach. Learn. Res.},\n\tpublisher    = {JMLR.org},\n\tvolume       = 13,\n\tnumber       = {null},\n\tpages        = {723--773},\n\tissn         = {1532-4435},\n\tissue_date   = {3/1/2012},\n\tnumpages     = 51\n}\n@article{greven2011longitudinal,\n\ttitle        = {Longitudinal functional principal component analysis},\n\tauthor       = {Sonja Greven and Ciprian Crainiceanu and Brian Caffo and Daniel Reich},\n\tyear         = 2011,\n\tjournal      = {Recent Advances in Functional Data Analysis and Related Topics}\n}\n@article{grice75maxims,\n\ttitle        = {Logic and Conversation},\n\tauthor       = {H. P. 
Grice},\n\tyear         = 1975,\n\tjournal      = {Syntax and Semantics},\n\tvolume       = 3,\n\tpages        = {41--58}\n}\n@inproceedings{griffiths06ibp,\n\ttitle        = {Infinite Latent Feature Models and the {I}ndian Buffet Process},\n\tauthor       = {Tom Griffiths and Zoubin Ghahramani},\n\tyear         = 2006,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{griffiths2004finding,\n\ttitle        = {Finding scientific topics},\n\tauthor       = {Griffiths, Thomas L and Steyvers, Mark},\n\tyear         = 2004,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 101,\n\tnumber       = {suppl 1},\n\tpages        = {5228--5235}\n}\n@article{griffithsFinding,\n\ttitle        = {Finding scientific topics},\n\tauthor       = {T.~L. Griffiths and M. Steyvers},\n\tyear         = 2004,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tvolume       = 101,\n\tpages        = {5228--5235}\n}\n@article{grigalis2014unsupervised,\n\ttitle        = {Unsupervised structured data extraction from template-generated web pages},\n\tauthor       = {Tomas Grigalis and Antanas Cenys},\n\tyear         = 2014,\n\tjournal      = {Journal of Universal Computer Science},\n\tvolume       = 20,\n\tpages        = {169--192}\n}\n@article{Grigoriadis1995,\n\ttitle        = {{A sublinear-time randomized approximation algorithm for matrix games}},\n\tauthor       = {Grigoriadis, Michael D. and Khachiyan, Leonid G.},\n\tyear         = 1995,\n\tjournal      = {Operations Research Letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 18,\n\tnumber       = 2,\n\tpages        = {53--58},\n\tdoi          = {10.1016/0167-6377(95)00032-0},\n\tissn         = {01676377},\n\tabstract     = {This paper presents a parallel randomizedalgorithm which computes a pair of $\\epsilon$-optimal strategies for a given (m,n)-matrixgameA = [aij] ? 
[?1, 1] in O($\\epsilon$?2log2(n+m)) expected time on an (n+m)/log(n+m)-processor EREW PRAM. For any fixed accuracy ? > 0, the expected sequential running time of the suggested algorithm is O((n + m)log(n + m)), which is sublinear in mn, the number of input elements of A. On the other hand, simple arguments are given to show that for , any deterministic algorithm for computing a pair of $\\epsilon$-optimal strategies of an (m, n)-matrixgameA with ± 1 elements examines $\\Omega$(mn) of its elements. In particular, for m = n the randomizedalgorithm achieves an almost quadratic expected speedup relative to any deterministic method.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Grigoriadis, Khachiyan - 1995 - A sublinear-time randomized approximation algorithm for matrix games.pdf:pdf},\n\tkeywords     = {approximation algorithms,complexity,linear programming,matrix games,parallel algorithms,randomized},\n\tmendeley-groups = {Optimization/Multiplicative Weight/LP}\n}\n@article{grill2020bootstrap,\n\ttitle        = {Bootstrap your own latent: A new approach to self-supervised learning},\n\tauthor       = {Grill, Jean-Bastien and Strub, Florian and Altch{\\'e}, Florent and Tallec, Corentin and Richemond, Pierre H and Buchatskaya, Elena and Doersch, Carl and Pires, Bernardo Avila and Guo, Zhaohan Daniel and Azar, Mohammad Gheshlaghi and others},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.07733},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {21271--21284}\n}\n@article{gritzmann1995,\n\ttitle        = {Largest j-simplices in n-polytopes},\n\tauthor       = {Gritzmann, Peter and Klee, Victor and Larman, David},\n\tyear         = 1995,\n\tjournal      = {Discrete \\& Computational Geometry},\n\tpublisher    = {Springer},\n\tvolume       = 13,\n\tnumber       = 1,\n\tpages        = 
{477--515}\n}\n@article{gromov1983topological,\n\ttitle        = {A topological application of the isoperimetric inequality},\n\tauthor       = {Mikhail Gromov and Vitali D. Milman},\n\tyear         = 1983,\n\tjournal      = {American Journal of Mathematics},\n\tvolume       = 105,\n\tnumber       = 4,\n\tpages        = {843--854}\n}\n@article{Gross11,\n\ttitle        = {Recovering Low-Rank Matrices From Few Coefficients in Any Basis},\n\tauthor       = {Gross, D.},\n\tyear         = 2011,\n\tmonth        = mar,\n\tjournal      = {IEEE Trans. Inf. Theor.},\n\tpublisher    = {IEEE Press},\n\taddress      = {Piscataway, NJ, USA},\n\tvolume       = 57,\n\tnumber       = 3,\n\tpages        = {1548--1566},\n\tdoi          = {10.1109/TIT.2011.2104999},\n\tissn         = {0018-9448},\n\turl          = {http://dx.doi.org/10.1109/TIT.2011.2104999},\n\tissue_date   = {March 2011},\n\tnumpages     = 19,\n\tacmid        = 2273790,\n\tkeywords     = {Compressed sensing, matrix completion, matrix recovery, operator large-deviation bound, quantum-state tomography}\n}\n@article{grosse2017statistical,\n\ttitle        = {On the (statistical) detection of adversarial examples},\n\tauthor       = {Kathrin Grosse and Praveen Manoharan and Nicolas Papernot and Michael Backes and Patrick McDaniel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.06280}\n}\n@article{grosz1986attention,\n\ttitle        = {Attention, intentions, and the structure of discourse},\n\tauthor       = {Barbara J Grosz and Candace L Sidner},\n\tyear         = 1986,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 12,\n\tnumber       = 3,\n\tpages        = {175--204}\n}\n@techreport{grother2011,\n\ttitle        = {Report on the Evaluation of 2D Still-Image Face Recognition Algorithms},\n\tauthor       = {Patrick J. Grother and George W. Quinn and P. 
Jonathon Phillips},\n\tyear         = 2011,\n\tinstitution  = {NIST}\n}\n@inproceedings{grover2017unfolding,\n\ttitle        = {Sentence Alignment using Unfolding Recursive Autoencoders},\n\tauthor       = {Jeenu Grover and Pabitra Mitra},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {16--20}\n}\n@inproceedings{gruber2007hidden,\n\ttitle        = {Hidden topic markov models},\n\tauthor       = {Gruber, Amit and Weiss, Yair and Rosen-Zvi, Michal},\n\tyear         = 2007,\n\tbooktitle    = {Artificial intelligence and statistics},\n\tpages        = {163--170},\n\torganization = {PMLR}\n}\n@inproceedings{grunwald2000policies,\n\ttitle        = {Policies for dynamic clock scheduling},\n\tauthor       = {\n\t\tGrunwald, Dirk and Morrey,III, Charles B. and Levis, Philip and Neufeld,\n\n\t\tMichael and Farkas, Keith I.\n\t},\n\tyear         = 2000,\n\tbooktitle    = {\n\t\tProceedings of the 4th conference on Symposium on Operating System\n\n\t\tDesign \\& Implementation - Volume 4\n\t},\n\tlocation     = {San Diego, California},\n\tpublisher    = {USENIX Association},\n\taddress      = {Berkeley, CA, USA},\n\tseries       = {OSDI'00},\n\tpages        = {6--6},\n\tacmid        = 1251235,\n\tnumpages     = 1\n}\n@inproceedings{grzes2017reward,\n\ttitle        = {Reward Shaping in Episodic Reinforcement Learning},\n\tauthor       = {Marek Grzes},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Autonomous Agents and Multiagent Systems (AAMAS)}\n}\n@article{gs05,\n\ttitle        = {Framewise phoneme classification with bidirectional {LSTM} and other neural network architectures},\n\tauthor       = {Graves, Alex and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 2005,\n\tjournal      = {Neural Networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 18,\n\tnumber       = {5-6},\n\tpages        = {602--610}\n}\n@inproceedings{GS12,\n\ttitle        = {Vertex neighborhoods, low conductance 
cuts, and good seeds for local community methods},\n\tauthor       = {David F. Gleich and C. Seshadhri},\n\tyear         = 2012,\n\tbooktitle    = {KDD '2012}\n}\n@article{gss02,\n\ttitle        = {Learning precise timing with {LSTM} recurrent networks},\n\tauthor       = {Gers, Felix A and Schraudolph, Nicol N and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 2002,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 3,\n\tnumber       = {Aug},\n\tpages        = {115--143}\n}\n@article{gtex2020gtex,\n\ttitle        = {The {GTEx Consortium} atlas of genetic regulatory effects across human tissues},\n\tauthor       = {GTEx Consortium and others},\n\tyear         = 2020,\n\tjournal      = {Science},\n\tvolume       = 369,\n\tnumber       = 6509,\n\tpages        = {1318--1330}\n}\n@article{gu1996efficient,\n\ttitle        = {Efficient algorithms for computing a strong rank-revealing QR factorization},\n\tauthor       = {Gu, Ming and Eisenstat, Stanley C},\n\tyear         = 1996,\n\tjournal      = {SIAM Journal on Scientific Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 17,\n\tnumber       = 4,\n\tpages        = {848--869}\n}\n@inproceedings{gu2009recognition,\n\ttitle        = {Recognition using regions},\n\tauthor       = {Chunhui Gu and Joseph J Lim and Pablo Arbeláez and Jitendra Malik},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {1030--1037}\n}\n@inproceedings{gu2010collaborative,\n\ttitle        = {Collaborative Filtering: Weighted Nonnegative Matrix Factorization Incorporating User and Item Graphs.},\n\tauthor       = {Gu, Quanquan and Zhou, Jie and Ding, Chris HQ},\n\tyear         = 2010,\n\tbooktitle    = {SDM},\n\tpages        = {199--210},\n\torganization = {SIAM}\n}\n@book{gu2013smoothing,\n\ttitle        = {Smoothing spline ANOVA models},\n\tauthor       = {Gu, Chong},\n\tyear         = 2013,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume 
      = 297\n}\n@article{gu2014subspace,\n\ttitle        = {Subspace Iteration Randomization and Singular Value Problems},\n\tauthor       = {Gu, Ming},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1408.2208}\n}\n@inproceedings{gu2015robustdenoise,\n\ttitle        = {Towards deep neural network architectures robust to adversarial examples},\n\tauthor       = {Shixiang Gu and Luca Rigazio},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@inproceedings{gu2016copying,\n\ttitle        = {Incorporating Copying Mechanism in Sequence-to-Sequence Learning},\n\tauthor       = {Jiatao Gu and Zhengdong Lu and Hang Li and Victor O.K. Li},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{gu2017badnets,\n\ttitle        = {Badnets: Identifying vulnerabilities in the machine learning model supply chain},\n\tauthor       = {Tianyu Gu and Brendan Dolan-Gavitt and Siddharth Garg},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.06733}\n}\n@article{gu2017search,\n\ttitle        = {Search Engine Guided Non-Parametric Neural Machine Translation},\n\tauthor       = {Jiatao Gu and Yong Wang and Kyunghyun Cho and Victor OK Li},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07267}\n}\n@article{gu2019insertion,\n\ttitle        = {Insertion-based Decoding with automatically Inferred Generation Order},\n\tauthor       = {Jiatao Gu and Qi Liu and Kyunghyun Cho},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.01370}\n}\n@article{gu2019levenshtein,\n\ttitle        = {Levenshtein Transformer},\n\tauthor       = {Jiatao Gu and Changhan Wang and Jake Zhao},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.11006}\n}\n@article{gu2020characterize,\n\ttitle        = {How to Characterize The Landscape of Overparameterized Convolutional Neural Networks},\n\tauthor       = {Gu, 
Yihong and Zhang, Weizhong and Fang, Cong and Lee, Jason D and Zhang, Tong},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@article{gu2020domain,\n\ttitle        = {Domain-specific language model pretraining for biomedical natural language processing},\n\tauthor       = {Yu Gu and Robert Tinn and Hao Cheng and Michael Lucas and Naoto Usuyama and Xiaodong Liu and Tristan Naumann and Jianfeng Gao and Hoifung Poon},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.15779}\n}\n@inproceedings{gu2021beyond,\n\ttitle        = {Beyond I.I.D.: Three Levels of Generalization for Question Answering on Knowledge Bases},\n\tauthor       = {Yu Gu and Sue Kase and Michelle T. Vanni and Brian M. Sadler and Percy Liang and Xifeng Yan and Yu Su},\n\tyear         = 2021,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@inproceedings{guadarrama2013grounding,\n\ttitle        = {Grounding spatial relations for human-robot interaction},\n\tauthor       = {S. Guadarrama and L. Riano and D. Golland and D. Gouhring and Y. Jia and D. Klein and P. Abbeel and T. 
Darrell},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{guadarrama2014open,\n\ttitle        = {Open-vocabulary object retrieval},\n\tauthor       = {S Guadarrama and E Rodner and K Saenko and N Zhang and R Farrell and J Donahue and T Darrell},\n\tyear         = 2014,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{guedon2007lp,\n\ttitle        = {Lp-moments of random vectors via majorizing measures},\n\tauthor       = {Gu{\\'e}don, Olivier and Rudelson, Mark},\n\tyear         = 2007,\n\tjournal      = {Advances in Mathematics},\n\tpublisher    = {Elsevier},\n\tvolume       = 208,\n\tnumber       = 2,\n\tpages        = {798--823}\n}\n@article{guedon2014community,\n\ttitle        = {Community detection in sparse networks via {G}rothendieck's inequality},\n\tauthor       = {Olivier Gu{\\'e}don and Roman Vershynin},\n\tyear         = 2014,\n\tjournal      = {arXiv}\n}\n@article{guest2001morse,\n\ttitle        = {Morse theory in the 1990's},\n\tauthor       = {Guest, Martin},\n\tyear         = 2001,\n\tjournal      = {arXiv preprint math/0104155}\n}\n@article{guidotti2018survey,\n\ttitle        = {A Survey Of Methods For Explaining Black Box Models},\n\tauthor       = {Riccardo Guidotti and Anna Monreale and Franco Turini and Dino Pedreschi and Fosca Giannotti},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.01933}\n}\n@inproceedings{guillaume2017fader,\n\ttitle        = {Fader Networks: Manipulating Images by Sliding Attributes},\n\tauthor       = {Guillaume Lample and Neil Zeghidour and Nicolas Usunier and Antoine Bordes and Ludovic Denoyer and Marc'Aurelio Ranzato},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{gulcehre2016pointing,\n\ttitle        = {Pointing the Unknown Words},\n\tauthor       = {Caglar Gulcehre and Sungjin Ahn and Ramesh Nallapati and Bowen Zhou 
and Yoshua Bengio},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{guler1992ppa,\n\ttitle        = {New Proximal Point Algorithms for Convex Minimization},\n\tauthor       = {Osman Guler},\n\tyear         = 1992,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {649--664}\n}\n@inproceedings{gulrajani2021in,\n\ttitle        = {In Search of Lost Domain Generalization},\n\tauthor       = {Ishaan Gulrajani and David Lopez-Paz},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2007.01434},\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=lQdXeXDoWtI}\n}\n@article{gulwani2007program,\n\ttitle        = {Program verification as probabilistic inference},\n\tauthor       = {Sumit Gulwani and Nebojsa Jojic},\n\tyear         = 2007,\n\tjournal      = {ACM SIGPLAN Notices},\n\tvolume       = 42,\n\tnumber       = 1,\n\tpages        = {277--289}\n}\n@article{gulwani2011automating,\n\ttitle        = {Automating string processing in spreadsheets using input-output examples},\n\tauthor       = {Sumit Gulwani},\n\tyear         = 2011,\n\tjournal      = {ACM SIGPLAN Notices},\n\tvolume       = 46,\n\tnumber       = 1,\n\tpages        = {317--330}\n}\n@inproceedings{gulwani2014nlyze,\n\ttitle        = {N{L}yze: interactive programming by natural language for spreadsheet data analysis and manipulation},\n\tauthor       = {Sumit Gulwani and Mark Marron},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Management of Data, SIGMOD},\n\tpages        = {803--814}\n}\n@inproceedings{gunasekar2017implicit,\n\ttitle        = {Implicit regularization in matrix factorization},\n\tauthor       = {Gunasekar, Suriya and Woodworth, Blake E and Bhojanapalli, Srinadh and Neyshabur, Behnam and Srebro, Nati},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural 
Information Processing Systems},\n\tpages        = {6151--6159}\n}\n@article{gunasekar2018characterizing,\n\ttitle        = {Characterizing implicit bias in terms of optimization geometry},\n\tauthor       = {Gunasekar, Suriya and Lee, Jason and Soudry, Daniel and Srebro, Nathan},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.08246}\n}\n@inproceedings{gunasekar2018implicit,\n\ttitle        = {Implicit bias of gradient descent on linear convolutional networks},\n\tauthor       = {Gunasekar, Suriya and Lee, Jason D and Soudry, Daniel and Srebro, Nati},\n\tyear         = 2018,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {9461--9471}\n}\n@inproceedings{gunopulos2001time,\n\ttitle        = {Time Series Similarity Measures and Time Series Indexing},\n\tauthor       = {Dimitrios Gunopulos and Gautam Das},\n\tyear         = 2001,\n\tbooktitle    = {SIGMOD Conference},\n\taddress      = {Santa Barbara, CA},\n\tnote         = {Tutorial},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@book{guo2009continuous,\n\ttitle        = {Continuous-time {M}arkov decision processes},\n\tauthor       = {Xianping Guo and On{\\'e}simo Hern{\\'a}ndez-Lerma},\n\tyear         = 2009,\n\tpublisher    = {Springer}\n}\n@inproceedings{guo2010approximate,\n\ttitle        = {Approximate joint diagonalization by nonorthogonal nonparametric jacobi transformations},\n\tauthor       = {Xijing Guo and Shihua Zhu and Sebastian Miron and David Brie},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tpages        = {3774--3777}\n}\n@article{guo2012mean,\n\ttitle        = {A mean--variance optimization problem for discounted Markov decision processes},\n\tauthor       = {Guo, Xianping and Ye, Liuer and Yin, George},\n\tyear         = 2012,\n\tjournal      = {European Journal of 
Operational Research},\n\tpublisher    = {Elsevier},\n\tvolume       = 220,\n\tnumber       = 2,\n\tpages        = {423--429}\n}\n@inproceedings{guo2014deep,\n\ttitle        = {Deep learning for real-time Atari game play using offline Monte-Carlo tree search planning},\n\tauthor       = {Xiaoxiao Guo and Satinder Singh and Honglak Lee and Richard L Lewis and Xiaoshi Wang},\n\tyear         = 2014,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3338--3346}\n}\n@inproceedings{guo2017calibration,\n\ttitle        = {On calibration of modern neural networks},\n\tauthor       = {Guo, Chuan and Pleiss, Geoff and Sun, Yu and Weinberger, Kilian Q},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1321--1330},\n\torganization = {PMLR}\n}\n@article{guo2018numerical,\n\ttitle        = {Numerical analysis near singularities in RBF networks},\n\tauthor       = {Guo, Weili and Wei, Haikun and Ong, Yew-Soon and Hervas, Jaime Rubio and Zhao, Junsheng and Wang, Hai and Zhang, Kanjian},\n\tyear         = 2018,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 19,\n\tnumber       = 1,\n\tpages        = {1--39}\n}\n@article{gupta2008extracting,\n\ttitle        = {Extracting dynamics from static cancer expression data},\n\tauthor       = {Anupam Gupta and Ziv Bar-Joseph},\n\tyear         = 2008,\n\tjournal      = {IEEE/ACM Transactions on Computational Biology and Bioinformatics (TCBB)},\n\tvolume       = 5,\n\tnumber       = 2,\n\tpages        = {172--182}\n}\n@inproceedings{gupta2009answering,\n\ttitle        = {Answering table augmentation queries from unstructured lists on the web},\n\tauthor       = {Rahul Gupta and Sunita Sarawagi},\n\tyear         = 2009,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tnumber       = 1,\n\tpages        = {289--300}\n}\n@article{gupta2016monotonic,\n\ttitle        = {Monotonic calibrated interpolated look-up tables},\n\tauthor       = {Maya Gupta and Andrew Cotter and Jan Pfeifer and Konstantin Voevodski and Kevin Canini and Alexander Mangylov and Wojciech Moczydlowski and Alexander Van Esbroeck},\n\tyear         = 2016,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {3790--3836}\n}\n@inproceedings{gupta2017deepfix,\n\ttitle        = {Deepfix: Fixing Common {C} Language Errors by Deep Learning},\n\tauthor       = {Rahul Gupta and Soham Pal and Aditya Kanade and Shirish K. 
Shevade},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{gupta2018meta,\n\ttitle        = {Meta-reinforcement learning of structured exploration strategies},\n\tauthor       = {Abhishek Gupta and Russell Mendonca and YuXuan Liu and Pieter Abbeel and Sergey Levine},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {5302--5311}\n}\n@inproceedings{gupta2019deep,\n\ttitle        = {Deep reinforcement learning for programming language correction},\n\tauthor       = {Rahul Gupta and Aditya Kanade and Shirish Shevade},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{gupta2020distribution,\n\ttitle        = {Distribution-free binary classification: prediction sets, confidence intervals and calibration},\n\tauthor       = {Gupta, Chirag and Podkopaev, Aleksandr and Ramdas, Aaditya},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.10564}\n}\n@article{gupta2020gmat,\n\ttitle        = {{GMAT}: Global Memory Augmentation for Transformers},\n\tauthor       = {Ankit Gupta and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.03274}\n}\n@article{gupta2021reset,\n\ttitle        = {Reset-Free Reinforcement Learning via Multi-Task Learning: Learning Dexterous Manipulation Behaviors without Human Intervention},\n\tauthor       = {Abhishek Gupta and Justin Yu and Tony Zhao and Vikash Kumar and Aaron Rovinsky and Kelvin Xu and Thomas Devlin and Sergey Levine},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.11203}\n}\n@article{gurcan2009histopathological,\n\ttitle        = {Histopathological image analysis: A review},\n\tauthor       = {Metin N Gurcan and Laura E Boucheron and Ali Can and Anant Madabhushi and Nasir M Rajpoot and Bulent Yener},\n\tyear         = 2009,\n\tjournal      
= {IEEE reviews in biomedical engineering},\n\tvolume       = 2,\n\tpages        = {147--171}\n}\n@misc{gurobi2016,\n\ttitle        = {Gurobi Optimizer Reference Manual},\n\tauthor       = {{{Gurobi {Optimization}, Inc.}}},\n\tyear         = 2016\n}\n@article{gurumurthy2019mame,\n\ttitle        = {MAME: Model-Agnostic Meta-Exploration},\n\tauthor       = {Swaminathan Gurumurthy and Sumit Kumar and Katia Sycara},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.04024}\n}\n@inproceedings{gururangan2018annotation,\n\ttitle        = {Annotation Artifacts in Natural Language Inference Data},\n\tauthor       = {Suchin Gururangan and Swabha Swayamdipta and Omer Levy and Roy Schwartz and Samuel Bowman and Noah A Smith},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {107--112}\n}\n@article{gururangan2020don,\n\ttitle        = {Don't stop pretraining: adapt language models to domains and tasks},\n\tauthor       = {Suchin Gururangan and Ana Marasovi{\\'c} and Swabha Swayamdipta and Kyle Lo and Iz Beltagy and Doug Downey and Noah A Smith},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.10964}\n}\n@inproceedings{Guruswami2001,\n\ttitle        = {Expander-Based Constructions of Efficiently Decodable Codes},\n\tauthor       = {Guruswami, V. 
and Indyk, P.},\n\tyear         = 2001,\n\tbooktitle    = {Proceedings of the 42nd IEEE symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {FOCS '01},\n\tpages        = {658--},\n\tisbn         = {0-7695-1390-5},\n\turl          = {http://dl.acm.org/citation.cfm?id=874063.875548},\n\tacmid        = 875548\n}\n@article{guruswami2009hardness,\n\ttitle        = {Hardness of learning halfspaces with noise},\n\tauthor       = {Venkatesan Guruswami and Prasad Raghavendra},\n\tyear         = 2009,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 39,\n\tnumber       = 2,\n\tpages        = {742--765}\n}\n@inproceedings{guruswami2012optimal,\n\ttitle        = {Optimal column-based low-rank matrix reconstruction},\n\tauthor       = {Guruswami, Venkatesan and Sinop, Ali Kemal},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the Twenty-Third Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {1207--1214},\n\torganization = {SIAM}\n}\n@article{gutierrez2013guaranteed,\n\ttitle        = {Guaranteed Model Order Estimation and Learnability Bounds for LDA},\n\tauthor       = {Guti{\\'e}rrez, ED},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.2646}\n}\n@inproceedings{guu2015traversing,\n\ttitle        = {Traversing Knowledge Graphs in Vector Space},\n\tauthor       = {Kelvin Guu and John Miller and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{guu2017bridging,\n\ttitle        = {From Language to Programs: Bridging Reinforcement Learning and Maximum Marginal Likelihood},\n\tauthor       = {Kelvin Guu and Panupong Pasupat and Evan Zheran Liu and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{guu2018edit,\n\ttitle        = {Generating Sentences by Editing 
Prototypes},\n\tauthor       = {Kelvin Guu and Tatsunori B. Hashimoto and Yonatan Oren and Percy Liang},\n\tyear         = 2018,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = {0}\n}\n@article{guzman2019flores,\n\ttitle        = {Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-{English} and Sinhala-{English}},\n\tauthor       = {Francisco Guzmán and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{GV,\n\ttitle        = {Matrix Computations},\n\tauthor       = {G. Golub and C. van Loan},\n\tyear         = 1996,\n\tbooktitle    = {The Johns Hopkins University Press}\n}\n@inproceedings{GVX,\n\ttitle        = {Fourier PCA and robust tensor decomposition},\n\tauthor       = {N. Goyal and S. Vempala and Y. Xiao.},\n\tyear         = 2014,\n\tbooktitle    = {STOC},\n\tpages        = {584--593}\n}\n@article{gwd14,\n\ttitle        = {Neural Turing Machines},\n\tauthor       = {Alex Graves and Greg Wayne and Ivo Danihelka},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1410.5401},\n\turl          = {http://arxiv.org/abs/1410.5401}\n}\n@inproceedings{GWW,\n\ttitle        = {On the local correctness of $\\ell_1$-minimization for dictionary learning},\n\tauthor       = {Q. Geng and H. Wang and J. 
Wright.},\n\tyear         = 2013,\n\tbooktitle    = {arXiv:1101.5672}\n}\n@misc{gym,\n\ttitle        = {OpenAI Gym},\n\tauthor       = {Greg Brockman and Vicki Cheung and Ludwig Pettersson and Jonas Schneider and John Schulman and Jie Tang and Wojciech Zaremba},\n\tyear         = 2016,\n\teprint       = {arXiv:1606.01540}\n}\n@inproceedings{gyongyi2004combating,\n\ttitle        = {Combating web spam with trustrank},\n\tauthor       = {Zolt{\\'a}n Gy{\\\"o}ngyi and Hector Garcia-Molina and Jan Pedersen},\n\tyear         = 2004,\n\tbooktitle    = {Very Large Data Bases (VLDB)}\n}\n@book{gyorfi2006distribution,\n\ttitle        = {A distribution-free theory of nonparametric regression},\n\tauthor       = {Gy{\\\"o}rfi, L{\\'a}szl{\\'o} and Kohler, Michael and Krzyzak, Adam and Walk, Harro},\n\tyear         = 2006,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{gyorgy2007line,\n\ttitle        = {The On-Line Shortest Path Problem Under Partial Monitoring},\n\tauthor       = {Gy{\\\"o}rgy, Andr{\\'a}s and Linder, Tam{\\'a}s and Lugosi, G{\\'a}bor and Ottucs{\\'a}k, Gy{\\\"o}rgy},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 8,\n\tpages        = {2369--2403}\n}\n@inproceedings{H,\n\ttitle        = {On the provable convergence of alternating minimization for matrix completion},\n\tauthor       = {M. 
Hardt},\n\tyear         = 2013,\n\tbooktitle    = {arxiv:1312.0925}\n}\n@article{h91,\n\ttitle        = {Untersuchungen zu dynamischen neuronalen Netzen},\n\tauthor       = {Hochreiter, Sepp},\n\tyear         = 1991,\n\tjournal      = {Diploma, Technische Universit{\\\"a}t M{\\\"u}nchen},\n\tvolume       = 91,\n\tnumber       = 1\n}\n@article{h98,\n\ttitle        = {On the piecewise analysis of networks of linear threshold neurons},\n\tauthor       = {Hahnloser, Richard LT},\n\tyear         = 1998,\n\tjournal      = {Neural Networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 11,\n\tnumber       = 4,\n\tpages        = {691--697}\n}\n@article{ha2018world,\n\ttitle        = {World Models},\n\tauthor       = {David Ha and J{\\\"u}rgen Schmidhuber},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.10122}\n}\n@article{haagerup1981best,\n\ttitle        = {The best constants in the Khintchine inequality},\n\tauthor       = {Uffe Haagerup},\n\tyear         = 1981,\n\tjournal      = {Studia Mathematica},\n\tvolume       = 70,\n\tnumber       = 3,\n\tpages        = {231--283}\n}\n@inproceedings{haarnoja2018soft,\n\ttitle        = {Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor},\n\tauthor       = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.05905},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1861--1870}\n}\n@article{haastad1990tensor,\n\ttitle        = {Tensor rank is NP-complete},\n\tauthor       = {H{\\aa}stad, Johan},\n\tyear         = 1990,\n\tjournal      = {Journal of Algorithms},\n\tpublisher    = {Elsevier},\n\tvolume       = 11,\n\tnumber       = 4,\n\tpages        = {644--654}\n}\n@inproceedings{hachey2005investigating,\n\ttitle        = {Investigating the effects of selective sampling on the annotation task},\n\tauthor       = {Ben Hachey and Beatrice 
Alex and Markus Becker},\n\tyear         = 2005,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {144--151}\n}\n@inproceedings{hadsell2006dimensionality,\n\ttitle        = {Dimensionality reduction by learning an invariant mapping},\n\tauthor       = {Hadsell, Raia and Chopra, Sumit and LeCun, Yann},\n\tyear         = 2006,\n\tbooktitle    = {2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06)},\n\tvolume       = 2,\n\tpages        = {1735--1742},\n\torganization = {IEEE}\n}\n@article{haeffele2015global,\n\ttitle        = {Global optimality in tensor factorization, deep learning, and beyond},\n\tauthor       = {Haeffele, Benjamin D and Vidal, Ren{\\'e}},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.07540}\n}\n@inproceedings{hafner2019dream,\n\ttitle        = {Dream to Control: Learning Behaviors by Latent Imagination},\n\tauthor       = {Hafner, Danijar and Lillicrap, Timothy and Ba, Jimmy and Norouzi, Mohammad},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@inproceedings{hafner2019latent,\n\ttitle        = {Learning Latent Dynamics for Planning from Pixels},\n\tauthor       = {Danijar Hafner and T. Lillicrap and Ian S. 
Fischer and Ruben Villegas and David R Ha and Honglak Lee and James Davidson},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hafner2019learning,\n\ttitle        = {Learning latent dynamics for planning from pixels},\n\tauthor       = {Hafner, Danijar and Lillicrap, Timothy and Fischer, Ian and Villegas, Ruben and Ha, David and Lee, Honglak and Davidson, James},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2555--2565},\n\torganization = {PMLR}\n}\n@inproceedings{haghighi05robust,\n\ttitle        = {Robust Textual Inference via Graph Matching},\n\tauthor       = {Aria Haghighi and Andrew Y. Ng and Christopher D. Manning},\n\tyear         = 2005,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{haghighi06induction,\n\ttitle        = {Prototype-based Grammar Induction},\n\tauthor       = {Aria Haghighi and Dan Klein},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{haghighi06prototype,\n\ttitle        = {Prototype-Driven Learning for Sequence Models},\n\tauthor       = {Aria Haghighi and Dan Klein},\n\tyear         = 2006,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {320--327}\n}\n@inproceedings{haghighi07coref,\n\ttitle        = {Unsupervised Coreference Resolution in a Nonparametric {B}ayesian Model},\n\tauthor       = {Aria Haghighi and Dan Klein},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{haghighi08lexicon,\n\ttitle        = {Learning Bilingual Lexicons from Monolingual Corpora},\n\tauthor       = {Aria Haghighi and Percy Liang and Taylor Berg-Kirkpatrick and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {Human Language 
Technology and Association for Computational Linguistics (HLT/ACL)}\n}\n@article{hahn2000challenges,\n\ttitle        = {The challenges of automatic summarization},\n\tauthor       = {Udo Hahn and Inderjeet Mani},\n\tyear         = 2000,\n\tjournal      = {Computer},\n\tvolume       = 33\n}\n@article{haixiang2017learning,\n\ttitle        = {Learning from class-imbalanced data: Review of methods and applications},\n\tauthor       = {Guo Haixiang and Li Yijing and Jennifer Shang and Gu Mingyun and Huang Yuanyue and Gong Bing},\n\tyear         = 2017,\n\tjournal      = {Expert Systems with Applications},\n\tvolume       = 73,\n\tpages        = {220--239}\n}\n@article{hajipour2019samplefix,\n\ttitle        = {SampleFix: Learning to Correct Programs by Sampling Diverse Fixes},\n\tauthor       = {Hossein Hajipour and Apratim Bhattacharya and Mario Fritz},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.10502}\n}\n@inproceedings{hakkanitur2016multidomain,\n\ttitle        = {Multi-Domain Joint Semantic Frame Parsing using Bi-directional {RNN}-{LSTM}},\n\tauthor       = {Dilek Hakkani-T{\\\"u}r and Gokhan Tur and Asli Celikyilmaz and Yun-Nung Chen and Jianfeng Gao and Li Deng and Ye-Yi Wang},\n\tyear         = 2016,\n\tbooktitle    = {InterSpeech}\n}\n@article{Hal62,\n\ttitle        = {The product of projection operators},\n\tauthor       = {Halperin, Israel},\n\tyear         = 1962,\n\tjournal      = {Acta Sci. Math.},\n\tvolume       = 23,\n\tnumber       = 1,\n\tpages        = {96--99}\n}\n@article{halikias07newbounds,\n\ttitle        = {New bounds on the unconstrained quadratic integer programming problem},\n\tauthor       = {G. D. Halikias and Imad M. Jaimoukha and U. Malik and S. K. 
Gungah},\n\tyear         = 2007,\n\tjournal      = {Journal of Global Optimization},\n\tvolume       = 39,\n\tnumber       = 4,\n\tpages        = {543--554}\n}\n@article{halko2011finding,\n\ttitle        = {Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions},\n\tauthor       = {Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A},\n\tyear         = 2011,\n\tjournal      = {SIAM review},\n\tpublisher    = {SIAM},\n\tvolume       = 53,\n\tnumber       = 2,\n\tpages        = {217--288}\n}\n@book{hall2005generalized,\n\ttitle        = {Generalized method of moments},\n\tauthor       = {Alastair R Hall},\n\tyear         = 2005,\n\tpublisher    = {Oxford University Press}\n}\n@article{halpern2011dealing,\n\ttitle        = {Dealing with logical omniscience: Expressiveness and pragmatics},\n\tauthor       = {Joseph Y. Halpern and Riccardo Pucella},\n\tyear         = 2011,\n\tjournal      = {Artificial intelligence},\n\tvolume       = 175,\n\tpages        = {220--235}\n}\n@inproceedings{halpern2013unsupervised,\n\ttitle        = {Unsupervised Learning of Noisy-Or {B}ayesian Networks},\n\tauthor       = {Yoni Halpern and David Sontag},\n\tyear         = 2013,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@inproceedings{halpern2014anchors,\n\ttitle        = {Using Anchors to Estimate Clinical State without Labeled Data},\n\tauthor       = {Yoni Halpern and Youngduck Choi and Steve Horng and David Sontag},\n\tyear         = 2014,\n\tbooktitle    = {American Medical Informatics Association Annual Symposium},\n\tpages        = {606--615}\n}\n@book{halpern2017reasoning,\n\ttitle        = {Reasoning about uncertainty},\n\tauthor       = {Halpern, Joseph Y},\n\tyear         = 2017,\n\tpublisher    = {MIT press}\n}\n@article{hambardzumyan2021warp,\n\ttitle        = {WARP: Word-level Adversarial ReProgramming},\n\tauthor       = {Hambardzumyan, Karen and Khachatrian, Hrant and May, 
Jonathan},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.00121}\n}\n@inproceedings{hamilton2017inductive,\n\ttitle        = {Inductive Representation Learning on Large Graphs},\n\tauthor       = {William L. Hamilton and Rex Ying and Jure Leskovec},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hamlet94random,\n\ttitle        = {Random testing},\n\tauthor       = {D. Hamlet},\n\tyear         = 1994,\n\tbooktitle    = {Encyclopedia of Software Engineering},\n\tpages        = {970--978}\n}\n@article{hammoudi2016why,\n\ttitle        = {Why do Record/Replay Tests of Web Applications Break?},\n\tauthor       = {Mouna Hammoudi and Gregg Rothermel and Paolo Tonella},\n\tyear         = 2016,\n\tjournal      = {IEEE International Conference on Software Testing, Verification and Validation}\n}\n@phdthesis{hampel1968thesis,\n\ttitle        = {Contributions to the theory of robust estimation},\n\tauthor       = {Frank R. Hampel},\n\tyear         = 1968,\n\tschool       = {University of California at Berkeley}\n}\n@article{hampel1974influence,\n\ttitle        = {The influence curve and its role in robust estimation},\n\tauthor       = {Frank R Hampel},\n\tyear         = 1974,\n\tjournal      = {Journal of the American Statistical Association},\n\tvolume       = 69,\n\tnumber       = 346,\n\tpages        = {383--393}\n}\n@book{hampel1986robust,\n\ttitle        = {Robust Statistics: The Approach Based on Influence Functions},\n\tauthor       = {Frank R. Hampel and Elvezio M. Ronchetti and Peter J. Rousseeuw and Werner A. Stahel},\n\tyear         = 1986,\n\tpublisher    = {Wiley}\n}\n@inproceedings{hamze04fields,\n\ttitle        = {From Fields to Trees},\n\tauthor       = {F. Hamze and N. 
de Freitas},\n\tyear         = 2004,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@book{han2000datamining,\n\ttitle        = {Data Mining: Concepts and Techniques},\n\tauthor       = {Jiawei Han and Micheline Kamber},\n\tyear         = 2000,\n\tpublisher    = {Morgan Kaufmann}\n}\n@article{han2015deep,\n\ttitle        = {Deep compression: Compressing deep neural network with pruning, trained quantization and huffman coding},\n\tauthor       = {Han, Song and Mao, Huizi and Dally, William J},\n\tyear         = 2015,\n\tjournal      = {CoRR, abs/1510.00149},\n\tvolume       = 2\n}\n@article{han2015exploiting,\n\ttitle        = {Exploiting knowledge base to generate responses for natural language dialog listening agents},\n\tauthor       = {Sangdo Han and Jeesoo Bang and Seonghan Ryu and Gary Geunbae Lee},\n\tyear         = 2015,\n\tjournal      = {16th Annual Meeting of the Special Interest Group on Discourse and Dialogue},\n\tpages        = {129--133}\n}\n@article{han2015minimax,\n\ttitle        = {Minimax estimation of discrete distributions under $\\ell_1$ loss},\n\tauthor       = {Han, Yanjun and Jiao, Jiantao and Weissman, Tsachy},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {IEEE},\n\tvolume       = 61,\n\tnumber       = 11,\n\tpages        = {6343--6354}\n}\n@article{han2020fortifying,\n\ttitle        = {Fortifying Toxic Speech Detectors Against Veiled Toxicity},\n\tauthor       = {Xiaochuang Han and Yulia Tsvetkov},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.03154}\n}\n@inproceedings{hancock2018babble,\n\ttitle        = {Training Classifiers with Natural Language Explanations},\n\tauthor       = {Braden Hancock and Paroma Varma and Stephanie Wang and Martin Bringmann and Percy Liang and Christopher R\\'e},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{hanczar2008gene,\n\ttitle        = 
{Classification with reject option in gene expression data},\n\tauthor       = {Blaise Hanczar and Edward R. Dougherty},\n\tyear         = 2008,\n\tjournal      = {Bioinformatics}\n}\n@article{hand2006classifier,\n\ttitle        = {Classifier technology and the illusion of progress},\n\tauthor       = {David J Hand},\n\tyear         = 2006,\n\tjournal      = {Statistical science},\n\tpages        = {1--14}\n}\n@book{hankin04lambda,\n\ttitle        = {An Introduction to Lambda Calculi for Computer Scientists},\n\tauthor       = {Chris Hankin},\n\tyear         = 2004,\n\tpublisher    = {Lightning Source}\n}\n@inproceedings{hanneke2007bound,\n\ttitle        = {A bound on the label complexity of agnostic active learning},\n\tauthor       = {Steve Hanneke},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {353--360}\n}\n@inproceedings{hanneke2019value,\n\ttitle        = {On the value of target data in transfer learning},\n\tauthor       = {Hanneke, Steve and Kpotufe, Samory},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {9871--9881}\n}\n@article{hannun2014deep,\n\ttitle        = {Deep speech: Scaling up end-to-end speech recognition},\n\tauthor       = {Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and others},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.5567}\n}\n@article{hans2008safe,\n\ttitle        = {Safe exploration for reinforcement learning},\n\tauthor       = {Alexander Hans and Daniel Schneega{\\ss} and Anton Maximilian Sch{\\\"a}fer and Steffen Udluft},\n\tyear         = 2008,\n\tjournal      = {ESANN},\n\tpages        = {143--148}\n}\n@article{hansen1982gmm,\n\ttitle        = {Large sample properties of generalized method of moments estimators},\n\tauthor       = {Lars Peter 
Hansen},\n\tyear         = 1982,\n\tjournal      = {Econometrica: Journal of the Econometric Society},\n\tvolume       = 50,\n\tpages        = {1029--1054}\n}\n@article{hansen2013forest,\n\ttitle        = {High-Resolution Global Maps of 21st-Century Forest Cover Change},\n\tauthor       = {M. C. Hansen and P. V. Potapov and R. Moore and M. Hancher and S. A. Turubanova and A. Tyukavina and D. Thau and S. V. Stehman and S. J. Goetz and T. R. Loveland and A. Kommareddy and A. Egorov and L. Chini and C. O. Justice and J. R. G. Townshend},\n\tyear         = 2013,\n\tjournal      = {Science},\n\tvolume       = 342\n}\n@article{hansen2013strategy,\n\ttitle        = {Strategy iteration is strongly polynomial for 2-player turn-based stochastic games with a constant discount factor},\n\tauthor       = {Hansen, Thomas Dueholm and Miltersen, Peter Bro and Zwick, Uri},\n\tyear         = 2013,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 60,\n\tnumber       = 1,\n\tpages        = 1\n}\n@article{hansen2014uncertainty,\n\ttitle        = {Uncertainty Outside and Inside Economic Models},\n\tauthor       = {Lars Peter Hansen},\n\tyear         = 2014,\n\tjournal      = {Journal of Political Economy},\n\tvolume       = 122,\n\tnumber       = 5,\n\tpages        = {945--987}\n}\n@inproceedings{hao2020adaptive,\n\ttitle        = {Adaptive exploration in linear contextual bandit},\n\tauthor       = {Hao, Botao and Lattimore, Tor and Szepesvari, Csaba},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {3536--3545},\n\torganization = {PMLR}\n}\n@article{hao2020high,\n\ttitle        = {High-Dimensional Sparse Linear Bandits},\n\tauthor       = {Hao, Botao and Lattimore, Tor and Wang, Mengdi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.04020}\n}\n@inproceedings{hao2021online,\n\ttitle        = {Online Sparse Reinforcement Learning},\n\tauthor   
    = {Hao, Botao and Lattimore, Tor and Szepesv{\\'a}ri, Csaba and Wang, Mengdi},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {316--324},\n\torganization = {PMLR}\n}\n@article{haochen2020shape,\n\ttitle        = {Shape matters: Understanding the implicit bias of the noise covariance},\n\tauthor       = {HaoChen, Jeff Z and Wei, Colin and Lee, Jason D and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.08680}\n}\n@misc{haochen2021provable,\n\ttitle        = {Provable Guarantees for Self-Supervised Deep Learning with Spectral Contrastive Loss},\n\tauthor       = {Jeff Z. HaoChen and Colin Wei and Adrien Gaidon and Tengyu Ma},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.04156},\n\teprint       = {2106.04156},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{har2007maximum,\n\ttitle        = {Maximum margin coresets for active and noise tolerant learning},\n\tauthor       = {{Har-Peled}, Sariel and Roth, Dan and Zimak, Dav},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 20th international joint conference on Artifical intelligence},\n\tpages        = {836--841},\n\torganization = {Morgan Kaufmann Publishers Inc.}\n}\n@inproceedings{harabagiu06methodsfor,\n\ttitle        = {Methods for Using Textual Entailment in Open-Domain Question Answering},\n\tauthor       = {Sanda Harabagiu and Andrew Hickl},\n\tyear         = 2006,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{hardt2013algorithms,\n\ttitle        = {Algorithms and Hardness for Robust Subspace Recovery},\n\tauthor       = {Moritz Hardt and Ankur Moitra},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{hardt2013provable,\n\ttitle        = {On the Provable Convergence of Alternating Minimization for Matrix Completion},\n\tauthor       = {Hardt, 
Moritz},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.0925}\n}\n@inproceedings{hardt2014fast,\n\ttitle        = {Fast Matrix Completion Without the Condition Number},\n\tauthor       = {Hardt, Moritz and Wootters, Mary},\n\tyear         = 2014,\n\tbooktitle    = {COLT 2014},\n\tpages        = {638--678}\n}\n@inproceedings{hardt2014noisy,\n\ttitle        = {The noisy power method: A meta algorithm with applications},\n\tauthor       = {Hardt, Moritz and Price, Eric},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2861--2869}\n}\n@inproceedings{hardt2014understanding,\n\ttitle        = {Understanding alternating minimization for matrix completion},\n\tauthor       = {Hardt, Moritz},\n\tyear         = 2014,\n\tbooktitle    = {FOCS 2014},\n\torganization = {IEEE}\n}\n@article{hardt2015train,\n\ttitle        = {Train faster, generalize better: Stability of stochastic gradient descent},\n\tauthor       = {Hardt, Moritz and Recht, Benjamin and Singer, Yoram},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1509.01240}\n}\n@inproceedings{hardt2016,\n\ttitle        = {Equality of Opportunity in Supervised Learning},\n\tauthor       = {Moritz Hardt and Eric Price and Nathan Srebro},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3315--3323}\n}\n@inproceedings{hardt2016equality,\n\ttitle        = {Equality of opportunity in supervised learning},\n\tauthor       = {Hardt, Moritz and Price, Eric and Srebro, Nati and others},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3315--3323}\n}\n@article{hardt2016identity,\n\ttitle        = {Identity matters in deep learning},\n\tauthor       = {Hardt, Moritz and Ma, Tengyu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 
arXiv:1611.04231}\n}\n@inproceedings{hardt2016strategic,\n\ttitle        = {Strategic classification},\n\tauthor       = {Moritz Hardt and Nimrod Megiddo and Christos Papadimitriou and Mary Wootters},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 2016 ACM conference on innovations in theoretical computer science},\n\tpages        = {111--122}\n}\n@article{hardt2018gradient,\n\ttitle        = {Gradient Descent Learns Linear Dynamical Systems},\n\tauthor       = {Hardt, Moritz and Ma, Tengyu and Recht, Benjamin},\n\tyear         = 2018,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 19,\n\tpages        = {1--44}\n}\n@misc{hardtma,\n\ttitle        = {Express your Identity with deep learning},\n\tauthor       = {Moritz Hardt and Tengyu Ma},\n\tyear         = 2016,\n\talteditor    = {editor},\n\tdate         = 2016,\n\toptsubtitle  = {subtitle},\n\topttitleaddon = {titleaddon},\n\toptlanguage  = {language},\n\topthowpublished = {howpublished},\n\topttype      = {type},\n\toptversion   = {version},\n\toptnote      = {note},\n\toptorganization = {organization},\n\toptlocation  = {location},\n\toptdate      = {date},\n\toptmonth     = {month},\n\toptaddendum  = {addendum},\n\toptpubstate  = {pubstate},\n\toptdoi       = {doi},\n\topteprint    = {eprint},\n\topteprintclass = {eprintclass},\n\topteprinttype = {eprinttype},\n\topturl       = {url},\n\topturldate   = {urldate}\n}\n@inproceedings{harman1992overview,\n\ttitle        = {Overview of the first {TREC} Text Retrieval Conference},\n\tauthor       = {D. K. Harman},\n\tyear         = 1992,\n\tbooktitle    = {Text Retrieval Conference}\n}\n@article{harman1993trec,\n\ttitle        = {The first text retrieval conference (TREC-1) Rockville, MD, U.S.A., 4-6 November, 1992},\n\tauthor       = {Donna K. 
Harman},\n\tyear         = 1993,\n\tjournal      = {Information Processing and Management},\n\tvolume       = 29,\n\tpages        = {411--414}\n}\n@article{harmon2004amazon,\n\ttitle        = {Amazon Glitch Unmasks War Of Reviewers},\n\tauthor       = {Amy Harmon},\n\tyear         = 2004,\n\tjournal      = {New York Times}\n}\n@article{harrell1996prognostic,\n\ttitle        = {Multivariable prognostic models: issues in developing models, evaluating assumptions and adequacy, and measuring and reducing errors},\n\tauthor       = {Frank E. Harrell and Kerry Lamont Lee and Daniel B. Mark},\n\tyear         = 1996,\n\tjournal      = {Statistics in medicine},\n\tvolume       = 15,\n\tnumber       = 4,\n\tpages        = {361--387}\n}\n@inproceedings{harris2011sparql,\n\ttitle        = {{SPARQL} 1.1 Query Language},\n\tauthor       = {S. Harris and A. Seaborne},\n\tyear         = 2011,\n\tbooktitle    = {W3C Working Draft, 12 May}\n}\n@inproceedings{harris2012measuring,\n\ttitle        = {Measuring Mental Entrenchment of Phrases with Perceptual Identification, Familiarity Ratings, and Corpus Frequency Statistics},\n\tauthor       = {Catherine Caldwell-Harris and Jonathan Berant and Shimon Edelman},\n\tyear         = 2012,\n\tbooktitle    = {Frequency Effects in Cognitive Linguistics (Vol. 1): Statistical Effects in Learnability, Processing and Change},\n\tpages        = {165--194}\n}\n@article{harsanyi1975,\n\ttitle        = {Can the Maximin Principle Serve as a Basis for Morality? A Critique of John Rawls's Theory},\n\tauthor       = {John C. 
Harsanyi},\n\tyear         = 1975,\n\tjournal      = {The American Political Science Review},\n\tvolume       = 69,\n\tpages        = {594--606}\n}\n@article{harsanyi2004games,\n\ttitle        = {Games with incomplete information played by \"Bayesian\" players},\n\tauthor       = {John C Harsanyi},\n\tyear         = 2004,\n\tjournal      = {Management science},\n\tvolume       = 50,\n\tpages        = {1804--1817}\n}\n@techreport{Harshman,\n\ttitle        = {Foundations of the {PARAFAC} procedure: model and conditions for an `explanatory' multi-mode factor analysis},\n\tauthor       = {R. Harshman},\n\tyear         = 1970,\n\tinstitution  = {UCLA Working Papers in Phonetics}\n}\n@article{harshman1970foundations,\n\ttitle        = {Foundations of the PARAFAC procedure: models and conditions for an\" explanatory\" multimodal factor analysis},\n\tauthor       = {Harshman, Richard A},\n\tyear         = 1970,\n\tpublisher    = {University of California at Los Angeles Los Angeles}\n}\n@article{harshman1994parafac,\n\ttitle        = {PARAFAC: Parallel factor analysis},\n\tauthor       = {Harshman, Richard A and Lundy, Margaret E},\n\tyear         = 1994,\n\tjournal      = {Computational Statistics \\& Data Analysis},\n\tpublisher    = {Elsevier},\n\tvolume       = 18,\n\tnumber       = 1,\n\tpages        = {39--72}\n}\n@inproceedings{hartmann2010what,\n\ttitle        = {What would other programmers do: suggesting solutions to error messages},\n\tauthor       = {Bj{\\\"o}rn Hartmann and Daniel MacDougall and Joel Brandt and Scott R. 
Klemmer},\n\tyear         = 2010,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@book{harvey1990forecasting,\n\ttitle        = {Forecasting, Structural Time Series Models and the Kalman Filter},\n\tauthor       = {Harvey, Andrew C.},\n\tyear         = 1990,\n\tmonth        = mar,\n\tday          = 30,\n\tpublisher    = {Cambridge University Press},\n\tisbn         = {0521321964},\n\tabstract     = {\n\t\tThis book provides a synthesis of concepts and materials that ordinarily\n\n\t\tappear separately in time series and econometrics literature, presenting\n\n\t\ta comprehensive review of both theoretical and applied concepts.\n\n\t\tPerhaps the most novel feature of the book is its use of Kalman filtering\n\n\t\ttogether with econometric and time series methodology. From a technical\n\n\t\tpoint of view, state space models and the Kalman filter play a key\n\n\t\trole in the statistical treatment of structural time series models.\n\n\t\tThis technique was originally developed in control engineering but\n\n\t\tis becoming increasingly important in economics and operations research.\n\n\t\tThe book is primarily concerned with modeling economic and social\n\n\t\ttime series and with addressing the special problems that the treatment\n\n\t\tof such series pose.\n\t},\n\thowpublished = {Hardcover},\n\tkeywords     = {forecasting, kalman-filter, state-space-models, statistics, textbook},\n\tmyurl        = {http://www.worldcat.org/isbn/0521321964},\n\tsubjects     = {Time-series analysis.; Kalman filtering.}\n}\n@inproceedings{harvey2017nearly,\n\ttitle        = {Nearly-tight VC-dimension bounds for piecewise linear neural networks},\n\tauthor       = {Harvey, Nick and Liaw, Christopher and Mehrabian, Abbas},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1064--1068},\n\torganization = {PMLR}\n}\n@article{hasanbeig2018logically,\n\ttitle        = {Logically-constrained reinforcement 
learning},\n\tauthor       = {Hasanbeig, Mohammadhosein and Abate, Alessandro and Kroening, Daniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.08099}\n}\n@inproceedings{hasenjager2002active,\n\ttitle        = {Active learning in neural networks},\n\tauthor       = {M Hasenjäger and H Ritter},\n\tyear         = 2002,\n\tbooktitle    = {New learning paradigms in soft computing},\n\tpages        = {137--169}\n}\n@inproceedings{hashimoto2016learning,\n\ttitle        = {Learning Population-Level Diffusions with Generative {RNNs}},\n\tauthor       = {Tatsunori Hashimoto and David Gifford and Tommi Jaakkola},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2417--2426}\n}\n@article{hashimoto2016word,\n\ttitle        = {Word Embeddings as Metric Recovery in Semantic Spaces},\n\tauthor       = {Hashimoto, Tatsunori B. and Alvarez-Melis, David and Jaakkola, Tommi S.},\n\tyear         = 2016,\n\tjournal      = {Transactions of the Association for Computational Linguistics},\n\tvolume       = 4,\n\tpages        = {273--286}\n}\n@inproceedings{hashimoto2017joint,\n\ttitle        = {A joint many-task model: Growing a neural network for multiple {NLP} tasks},\n\tauthor       = {Kazuma Hashimoto and Caiming Xiong and Yoshimasa Tsuruoka and Richard Socher},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{hashimoto2017transformation,\n\ttitle        = {Unsupervised Transformation Learning via Convex Relaxations},\n\tauthor       = {Tatsunori B. 
Hashimoto and John Duchi and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hashimoto2018detecting,\n\ttitle        = {Detecting absurd conversations from intelligent assistant logs by exploiting user feedback utterances},\n\tauthor       = {Chikara Hashimoto and Manabu Sassano},\n\tyear         = 2018,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {147--156}\n}\n@inproceedings{hashimoto2018edit,\n\ttitle        = {A Retrieve-and-Edit Framework for Predicting Structured Outputs},\n\tauthor       = {Tatsunori Hashimoto and Kelvin Guu and Yonatan Oren and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hashimoto2018repeated,\n\ttitle        = {Fairness Without Demographics in Repeated Loss Minimization},\n\tauthor       = {Tatsunori B. Hashimoto and Megha Srivastava and Hongseok Namkoong and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hashimoto2019huse,\n\ttitle        = {Unifying Human and Statistical Evaluation for Natural Language Generation},\n\tauthor       = {Tatsu Hashimoto and Hugh Zhang and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{haslett1996updating,\n\ttitle        = {Updating linear models with dependent errors to include additional data and/or parameters},\n\tauthor       = {Stephen Haslett},\n\tyear         = 1996,\n\tjournal      = {Linear algebra and its applications},\n\tvolume       = 237,\n\tpages        = {329--349}\n}\n@article{hastad1990tensor,\n\ttitle        = {Tensor rank is {NP}-complete},\n\tauthor       = {J. 
Ho{a}stad},\n\tyear         = 1990,\n\tjournal      = {Journal of Algorithms},\n\tvolume       = 11,\n\tnumber       = 4\n}\n@book{hastie2003elements,\n\ttitle        = {The Elements of Statistical Learning},\n\tauthor       = {Hastie, T. and Tibshirani, R. and Friedman, J. H.},\n\tyear         = 2003,\n\tmonth        = jul,\n\tpublisher    = {Springer},\n\taddress      = {New York, NY, USA},\n\tseries       = {Springer Series in Statistics},\n\tisbn         = {0387952845},\n\tedition      = {Corrected},\n\tabstract     = {\n\t\tDuring the past decade there has been an explosion in computation\n\n\t\tand information technology. With it has come vast amounts of data\n\n\t\tin a variety of fields such as medicine, biology, finance, and marketing.\n\n\t\tThe challenge of understanding these data has led to the development\n\n\t\tof new tools in the field of statistics, and spawned new areas such\n\n\t\tas data mining, machine learning, and bioinformatics.\n\n\t\tMany of these tools have common underpinnings but are often expressed\n\n\t\twith different terminology. This book describes the important ideas\n\n\t\tin these areas in a common conceptual framework. While the approach\n\n\t\tis statistical, the emphasis is on concepts rather than mathematics.\n\n\t\tMany examples are given, with a liberal use of color graphics. It\n\n\t\tshould be a valuable resource for statisticians and anyone interested\n\n\t\tin data mining in science or industry.\n\n\t\tThe book's coverage is broad, from supervised learning (prediction)\n\n\t\tto unsupervised learning. The many topics include neural networks,\n\n\t\tsupport vector machines, classification trees and boosting--the first\n\n\t\tcomprehensive treatment of this topic in any book.\n\n\t\tTrevor Hastie, Robert Tibshirani, and Jerome Friedman are professors\n\n\t\tof statistics at Stanford University. 
They are prominent researchers\n\n\t\tin this area: Hastie and Tibshirani developed generalized additive\n\n\t\tmodels and wrote a popular book of that title. Hastie wrote much\n\n\t\tof the statistical modeling software in S-PLUS and invented principal\n\n\t\tcurves and surfaces. Tibshirani proposed the Lasso and is co-author\n\n\t\tof the very successful An Introduction to the Bootstrap. Friedman\n\n\t\tis the co-inventor of many data-mining tools including CART, MARS,\n\n\t\tand projection pursuit.\n\n\t\tFROM THE REVIEWS:\n\n\t\tTECHNOMETRICS \"[This] is a vast and complex book. Generally, it concentrates\n\n\t\ton explaining why and how the methods work, rather than how to use\n\n\t\tthem. Examples and especially the visualizations are principle features...As\n\n\t\ta source for the methods of statistical learning...it will probably\n\n\t\tbe a long time before there is a competitor to this book.\"\n\t},\n\thowpublished = {Hardcover},\n\tkeywords     = {machine-learning, statistic},\n\towner        = {leili},\n\tposted-at    = {2007-02-13 15:09:19},\n\tpriority     = 2,\n\ttimestamp    = {2011.07.28}\n}\n@article{hastie2014matrix,\n\ttitle        = {Matrix completion and low-rank SVD via fast alternating least squares},\n\tauthor       = {Hastie, Trevor and Mazumder, Rahul and Lee, Jason D and Zadeh, Reza},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research}\n}\n@article{hastie2019surprises,\n\ttitle        = {Surprises in high-dimensional ridgeless least squares interpolation},\n\tauthor       = {Hastie, Trevor and Montanari, Andrea and Rosset, Saharon and Tibshirani, Ryan J},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.08560}\n}\n@article{hastings1970monte,\n\ttitle        = {{M}onte {C}arlo sampling methods using {M}arkov chains and their applications},\n\tauthor       = {Keith W. 
Hastings},\n\tyear         = 1970,\n\tjournal      = {Biometrika},\n\tvolume       = 57,\n\tnumber       = 1,\n\tpages        = {97--109}\n}\n@inproceedings{haug2018neural,\n\ttitle        = {Neural Multi-Step Reasoning for Question Answering on Semi-Structured Tables},\n\tauthor       = {Till Haug and Octavian-Eugen Ganea and Paulina Grnarova},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Information Retrieval}\n}\n@article{haupt2006signal,\n\ttitle        = {Signal reconstruction from noisy random projections},\n\tauthor       = {Haupt, Jarvis and Nowak, Robert},\n\tyear         = 2006,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 52,\n\tnumber       = 9,\n\tpages        = {4036--4048}\n}\n@article{hauser2008using,\n\ttitle        = {Using motion primitives in probabilistic sample-based planning for humanoid robots},\n\tauthor       = {K. Hauser and T. Bretl and K. Harada and J. Latombe},\n\tyear         = 2008,\n\tjournal      = {Algorithmic foundation of robotics},\n\tvolume       = 7,\n\tpages        = {507--522}\n}\n@article{hausknecht2015deeprq,\n\ttitle        = {Deep Recurrent {Q}-Learning for Partially Observable MDPs},\n\tauthor       = {M. Hausknecht and P. Stone},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1507.06527}\n}\n@inproceedings{haussler94rigorous,\n\ttitle        = {Rigorous Learning Curve Bounds from Statistical Mechanics},\n\tauthor       = {David Haussler and Michael Kearns and H. Sebastian Seung and Naftali Tishby},\n\tyear         = 1994,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {76--87}\n}\n@inproceedings{Haveliwala02,\n\ttitle        = {Topic-sensitive PageRank},\n\tauthor       = {Taher H. 
Haveliwala},\n\tyear         = 2002,\n\tbooktitle    = {WWW '02},\n\tpages        = {517--526}\n}\n@article{hawkins2000estimating,\n\ttitle        = {Estimating transition probabilities from aggregate samples plus partial transition data},\n\tauthor       = {DL Hawkins and Chien-Pai Han},\n\tyear         = 2000,\n\tjournal      = {Biometrics},\n\tvolume       = 56,\n\tnumber       = 3,\n\tpages        = {848--854}\n}\n@article{hawkins2015conducting,\n\ttitle        = {Conducting real-time multiplayer experiments on the web},\n\tauthor       = {Robert XD Hawkins},\n\tyear         = 2015,\n\tjournal      = {Behavior Research Methods},\n\tvolume       = 47,\n\tnumber       = 4,\n\tpages        = {966--976}\n}\n@inproceedings{hawkins2015you,\n\ttitle        = {Why do you ask{? G}ood questions provoke informative answers},\n\tauthor       = {Robert X. D. Hawkins and Andreas Stuhlm\\\"uller and Judith Degen and Noah D. Goodman},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Thirty-Seventh Annual Conference of the {C}ognitive {S}cience {S}ociety}\n}\n@inproceedings{hawkins2020continual,\n\ttitle        = {Continual adaptation for efficient machine communication},\n\tauthor       = {Robert D. Hawkins and Minae Kwon and Dorsa Sadigh and Noah D. 
Goodman},\n\tyear         = 2020,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@book{hayakawa1994ctrw,\n\ttitle        = {Choose the Right Word: A Contemporary Guide to Selecting the Precise Word for Every Situation},\n\tauthor       = {Samuel Ichiye Hayakawa},\n\tyear         = 1994,\n\tpublisher    = {Collins Reference}\n}\n@inproceedings{hayati2018retrieval,\n\ttitle        = {Retrieval-based neural code generation},\n\tauthor       = {Shirley Anugrah Hayati and Raphael Olivier and Pravalika Avvaru and Pengcheng Yin and Anthony Tomasic and Graham Neubig},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{hayes1986writing,\n\ttitle        = {Writing research and the writer},\n\tauthor       = {John R Hayes and Linda S Flower},\n\tyear         = 1986,\n\tjournal      = {American psychologist},\n\tvolume       = 41,\n\tnumber       = 10,\n\tpages        = {1106--1113}\n}\n@inproceedings{hayes2018contamination,\n\ttitle        = {Contamination Attacks and Mitigation in Multi-Party Machine Learning},\n\tauthor       = {Jamie Hayes and Olga Ohrimenko},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {6604--6615}\n}\n@inproceedings{hazan11beyond,\n\ttitle        = {Beyond the regret minimization barrier: an optimal algorithm for stochastic strongly-convex optimization},\n\tauthor       = {Elad Hazan and Satyen Kale},\n\tyear         = 2011,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{hazan2011hard,\n\ttitle        = {How hard is it to approximate the best Nash equilibrium?},\n\tauthor       = {Hazan, Elad and Krauthgamer, Robert},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 40,\n\tnumber       = 1,\n\tpages        = {79--91}\n}\n@incollection{Hazan2012-survey,\n\ttitle        = {The Convex Optimization Approach 
to Regret Minimization},\n\tauthor       = {Hazan, Elad},\n\tyear         = 2012,\n\tbooktitle    = {Optimization for machine learning},\n\tpublisher    = {MIT press},\n\tpages        = {287--304},\n\teditors      = {Suvrit Sra, Sebastian Nowozin and Stephen J. Wright},\n\tchapter      = 10\n}\n@article{hazan2014beyond,\n\ttitle        = {Beyond the regret minimization barrier: optimal algorithms for stochastic strongly-convex optimization.},\n\tauthor       = {Hazan, Elad and Kale, Satyen},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR.org},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {2489--2512}\n}\n@inproceedings{hazan2015beyond,\n\ttitle        = {Beyond convexity: Stochastic quasi-convex optimization},\n\tauthor       = {Hazan, Elad and Levy, Kfir and Shalev-Shwartz, Shai},\n\tyear         = 2015,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1594--1602},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2015arXiv150702030H},\n\tarchiveprefix = {arXiv},\n\teprint       = {1507.02030},\n\tkeywords     = {Computer Science - Learning, Mathematics - Optimization and Control},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{hazan2016anon,\n\ttitle        = {A Non-generative Framework and Convex Relaxations for Unsupervised Learning.},\n\tauthor       = {Elad Hazan and Tengyu Ma},\n\tyear         = 2016,\n\tbooktitle    = {Neural Information Processing Systems (NIPS), 2016},\n\turl          = {http://arxiv.org/abs/1610.01132},\n\ttimestamp    = {Wed, 02 Nov 2016 09:51:26 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/HazanM16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{hazan2018provably,\n\ttitle        = {Provably efficient maximum entropy 
exploration},\n\tauthor       = {Hazan, Elad and Kakade, Sham M and Singh, Karan and Van Soest, Abby},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@article{HazanBook,\n\ttitle        = {{DRAFT}: Introduction to Online Convex Optimimization},\n\tauthor       = {Elad Hazan},\n\tyear         = 2015,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = {XX},\n\tnumber       = {XX},\n\tpages        = {1--168}\n}\n@article{HazanKoren2015trustregion,\n\ttitle        = {A linear-time algorithm for trust region problems},\n\tauthor       = {Hazan, Elad and Koren, Tomer},\n\tyear         = 2015,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tpages        = {1--19}\n}\n@inproceedings{HazanKS2012,\n\ttitle        = {{Near-optimal algorithms for online matrix prediction}},\n\tauthor       = {Hazan, Elad and Kale, Satyen and {Shalev-Shwartz}, Shai},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 25th Annual Conference on Learning Theory - COLT '12},\n\tpages        = {38.1----38.13},\n\tissn         = 15337928,\n\turl          = {http://arxiv.org/abs/1204.0136},\n\tabstract     = {In several online prediction problems of recent interest the comparison class is composed of matrices with bounded entries. For example, in the online max-cut problem, the comparison class is matrices which represent cuts of a given graph and in online gambling the comparison class is matrices which represent permutations over n teams. Another important example is online collaborative filtering in which a widely used comparison class is the set of matrices with a small trace norm. In this paper we isolate a property of matrices, which we call (beta,tau)-decomposability, and derive an efficient online learning algorithm, that enjoys a regret bound of O*(sqrt(beta tau T)) for all problems in which the comparison class is composed of (beta,tau)-decomposable matrices. 
By analyzing the decomposability of cut matrices, triangular matrices, and low trace-norm matrices, we derive near optimal regret bounds for online max-cut, online gambling, and online collaborative filtering. In particular, this resolves (in the affirmative) an open problem posed by Abernethy (2010); Kleinberg et al (2010). Finally, we derive lower bounds for the three problems and show that our upper bounds are optimal up to logarithmic factors. In particular, our lower bound for the online collaborative filtering problem resolves another open problem posed by Shamir and Srebro (2011).},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:1204.0136v1},\n\teprint       = {arXiv:1204.0136v1},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Hazan, Kale, Shalev-Shwartz - 2012 - Near-optimal algorithms for online matrix prediction.pdf:pdf},\n\tmendeley-groups = {Optimization/Mirror Descent/Mirror Descent for NP-hard Problems}\n}\n@inproceedings{he15deepresidual,\n\ttitle        = {Deep Residual Learning for Image Recognition},\n\tauthor       = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},\n\tyear         = 2015,\n\tbooktitle    = {arXiv prepring arXiv:1506.01497}\n}\n@article{he2006spoken,\n\ttitle        = {Spoken language understanding using the hidden vector state model},\n\tauthor       = {Yulan He and Steve Young},\n\tyear         = 2006,\n\tjournal      = {Speech Communication},\n\tvolume       = 48,\n\tpages        = {262--275}\n}\n@inproceedings{he2012cost,\n\ttitle        = {Cost-sensitive dynamic feature selection},\n\tauthor       = {He He and Hal {Daum{\\'e} III} and Jason Eisner},\n\tyear         = 2012,\n\tbooktitle    = {ICML Inferning Workshop}\n}\n@inproceedings{he2013dynamic,\n\ttitle        = {Dynamic Feature Selection for Dependency Parsing},\n\tauthor       = {He He and Hal {Daum{\\'e} III} and Jason Eisner},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in 
Natural Language Processing (EMNLP)},\n\tpages        = {1455--1464}\n}\n@inproceedings{he2015delving,\n\ttitle        = {Delving deep into rectifiers: Surpassing human-level performance on imagenet classification},\n\tauthor       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1502.01852},\n\tbooktitle    = {Proceedings of the IEEE international conference on computer vision},\n\tpages        = {1026--1034}\n}\n@inproceedings{he2015multi,\n\ttitle        = {Multi-Perspective Sentence Similarity Modeling with Convolutional Neural Networks},\n\tauthor       = {Hua He and Kevin Gimpel and Jimmy Lin},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{he2015question,\n\ttitle        = {Question-Answer Driven Semantic Role Labeling: Using Natural Language to Annotate Natural Language},\n\tauthor       = {Luheng He and Mike Lewis and Luke Zettlemoyer},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{he2015syntax,\n\ttitle        = {Syntax-based Rewriting for Simultaneous Machine Translation},\n\tauthor       = {He He and Alvin {Grissom II} and Jordan Boyd-Graber and Hal {Daum{\\'e} III}},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {55--64}\n}\n@inproceedings{he2016amazonreview,\n\ttitle        = {Ups and Downs: Modeling the Visual Evolution of Fashion Trends with One-class Collaborative Filtering},\n\tauthor       = {Ruining He and Julian McAuley},\n\tyear         = 2016,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@inproceedings{he2016deep,\n\ttitle        = {Deep residual learning for image recognition},\n\tauthor       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the IEEE conference on 
computer vision and pattern recognition},\n\tpages        = {770--778}\n}\n@inproceedings{he2016human,\n\ttitle        = {Human-in-the-Loop Parsing},\n\tauthor       = {Luheng He and Julian Michael and Mike Lewis and Luke Zettlemoyer},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{he2016identity,\n\ttitle        = {Identity mappings in deep residual networks},\n\tauthor       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},\n\tyear         = 2016,\n\tbooktitle    = {European Conference on Computer Vision},\n\tpages        = {630--645},\n\torganization = {Springer}\n}\n@inproceedings{he2016opponent,\n\ttitle        = {Opponent Modeling in Deep Reinforcement Learning},\n\tauthor       = {He He and Jordan L. Boyd-Graber},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{he2017symmetric,\n\ttitle        = {Learning Symmetric Collaborative Dialogue Agents with Dynamic Knowledge Graph Embeddings},\n\tauthor       = {He He and Anusha Balakrishnan and Mihail Eric and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1766--1776}\n}\n@inproceedings{he2018negotiation,\n\ttitle        = {Decoupling Strategy and Generation in Negotiation Dialogues},\n\tauthor       = {He He and Derek Chen and Anusha Balakrishnan and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{he2019unlearn,\n\ttitle        = {Unlearn Dataset Bias in Natural Language Inference by Fitting the Residual},\n\tauthor       = {He He and Sheng Zha and Haohan Wang},\n\tyear         = 2019,\n\tbooktitle    = {Workshop on Deep Learning for Low-Resource Natural Language Processing (DeepLo)}\n}\n@article{he2020logarithmic,\n\ttitle        = {Logarithmic Regret for Reinforcement Learning with Linear 
Function Approximation},\n\tauthor       = {He, Jiafan and Zhou, Dongruo and Gu, Quanquan},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.11566}\n}\n@inproceedings{he2020moco,\n\ttitle        = {Momentum Contrast for Unsupervised Visual Representation Learning},\n\tauthor       = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick},\n\tyear         = 2020,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{he2020momentum,\n\ttitle        = {Momentum contrast for unsupervised visual representation learning},\n\tauthor       = {He, Kaiming and Fan, Haoqi and Wu, Yuxin and Xie, Saining and Girshick, Ross},\n\tyear         = 2020,\n\tmonth        = {June},\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n\tpages        = {9729--9738}\n}\n@article{he2020towards,\n\ttitle        = {Towards Non-{IID} Image Classification: A Dataset and Baselines},\n\tauthor       = {Yue He and Zheyan Shen and Peng Cui},\n\tyear         = 2020,\n\tjournal      = {Pattern Recognition},\n\tvolume       = 110\n}\n@inproceedings{heafield2013scalable,\n\ttitle        = {Scalable Modified {K}neser-{N}ey Language Model Estimation},\n\tauthor       = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. 
Clark and Philipp Koehn},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {690--696}\n}\n@inproceedings{hearst1992automatic,\n\ttitle        = {Automatic acquisition of hyponyms from large text corpora},\n\tauthor       = {Marti A Hearst},\n\tyear         = 1992,\n\tbooktitle    = {Interational Conference on Computational linguistics},\n\tpages        = {539--545}\n}\n@article{hearst1998automated,\n\ttitle        = {Automated discovery of WordNet relations},\n\tauthor       = {Marti A Hearst},\n\tyear         = 1998,\n\tjournal      = {WordNet: an electronic lexical database}\n}\n@inproceedings{heath2006mercury,\n\ttitle        = {\n\t\tMercury and freon: temperature emulation and management for server\n\n\t\tsystems\n\t},\n\tauthor       = {\n\t\tHeath, Taliver and Centeno, Ana Paula and George, Pradeep and Ramos,\n\n\t\tLuiz and Jaluria, Yogesh and Bianchini, Ricardo\n\t},\n\tyear         = 2006,\n\tbooktitle    = {\n\t\tProceedings of the 12th international conference on Architectural\n\n\t\tsupport for programming languages and operating systems\n\t},\n\tlocation     = {San Jose, California, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {ASPLOS-XII},\n\tpages        = {106--116},\n\tdoi          = {http://doi.acm.org/10.1145/1168857.1168872},\n\tisbn         = {1-59593-451-0},\n\tacmid        = 1168872,\n\tkeywords     = {\n\t\tenergy conservation, server clusters, temperature modeling, thermal\n\n\t\tmanagement\n\t},\n\tnumpages     = 11\n}\n@book{Hebb1949,\n\ttitle        = {The Organization of Behavior: A Neuropsychological Theory},\n\tauthor       = {Hebb, Donald O.},\n\tyear         = 1949,\n\tmonth        = jun,\n\tday          = 15,\n\tpublisher    = {Wiley},\n\taddress      = {New York},\n\tisbn         = {0805843000},\n\turl          = {http://www.worldcat.org/isbn/0805843000},\n\tedition      = {New edition},\n\thowpublished = 
{Hardcover},\n\tkeywords     = {viva},\n\tposted-at    = {2008-10-07 15:32:39},\n\tpriority     = 2,\n\ttimestamp    = {2013.08.27}\n}\n@article{hebert2017calibration,\n\ttitle        = {Calibration for the (computationally-identifiable) masses},\n\tauthor       = {H{\\'e}bert-Johnson, Ursula and Kim, Michael P and Reingold, Omer and Rothblum, Guy N},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.08513}\n}\n@inproceedings{hebert2018multicalibration,\n\ttitle        = {Multicalibration: Calibration for the (computationally-identifiable) masses},\n\tauthor       = {H{\\'e}bert-Johnson, Ursula and Kim, Michael and Reingold, Omer and Rothblum, Guy},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1939--1948},\n\torganization = {PMLR}\n}\n@article{hebertjohnson2017,\n\ttitle        = {Calibration for the (Computationally-Identifiable) Masses},\n\tauthor       = {{\\'U}rsula H{\\'e}bert-Johnson and Michael P. Kim and Omer Reingold and Guy N. Rothblum},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.08513}\n}\n@article{heckerman92towardsnormative,\n\ttitle        = {Towards normative expert systems: Probability-based representations for efficient knowledge acquisition and inference},\n\tauthor       = {David E. Heckerman and Bharat N. 
Nathwani},\n\tyear         = 1992,\n\tjournal      = {Methods Archive},\n\tvolume       = 31,\n\tnumber       = 2,\n\tpages        = {106--116}\n}\n@article{heider1944experimental,\n\ttitle        = {An experimental study of apparent behavior},\n\tauthor       = {Fritz Heider and Marianne Simmel},\n\tyear         = 1944,\n\tjournal      = {American Journal of Psychology},\n\tvolume       = 57,\n\tnumber       = 2,\n\tpages        = {243--259}\n}\n@book{Heij07,\n\ttitle        = {Introduction to mathematical systems theory : linear systems, identification and control},\n\tauthor       = {Heij, Christiaan and Ran, Andr{\\'e} and Schagen, Freek van},\n\tyear         = 2007,\n\tpublisher    = {Birkh{\\\"a}user},\n\taddress      = {Basel, Boston, Berlin},\n\tisbn         = {3-7643-7548-5},\n\turl          = {http://opac.inria.fr/record=b1130636},\n\tbdsk-url-1   = {http://opac.inria.fr/record=b1130636}\n}\n@inproceedings{heilman2010tree,\n\ttitle        = {Tree edit models for recognizing textual entailments, paraphrases, and answers to questions},\n\tauthor       = {Michael Heilman and Noah A Smith},\n\tyear         = 2010,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {1011--1019}\n}\n@book{heim98semantics,\n\ttitle        = {Semantics in Generative Grammar},\n\tauthor       = {Irene Heim and Angelika Kratzer},\n\tyear         = 1998,\n\tpublisher    = {Wiley-Blackwell}\n}\n@inproceedings{hein2017formal,\n\ttitle        = {Formal guarantees on the robustness of a classifier against adversarial manipulation},\n\tauthor       = {Matthias Hein and Maksym Andriushchenko},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2263--2273}\n}\n@article{heinze2017conditional,\n\ttitle        = {Conditional variance penalties and domain shift robustness},\n\tauthor       = {Christina Heinze-Deml and Nicolai 
Meinshausen},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.11469}\n}\n@article{heinzedeml2018invariant,\n\ttitle        = {{Invariant Causal Prediction for Nonlinear Models}},\n\tauthor       = {Heinze-Deml, Christina and Meinshausen, Nicolai and Peters, Jonas},\n\tyear         = 2018,\n\tmonth        = {September},\n\tjournal      = {Journal of Causal Inference},\n\tvolume       = 6,\n\tnumber       = 2,\n\tpages        = {1--35}\n}\n@inproceedings{hellendoorn2019code,\n\ttitle        = {When code completion fails: A case study on real-world completions},\n\tauthor       = {Vincent J Hellendoorn and Sebastian Proksch and Harald C Gall and Alberto Bacchelli},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)}\n}\n@inproceedings{heller05hierarchical,\n\ttitle        = {{B}ayesian Hierarchical Clustering},\n\tauthor       = {K. A. Heller and Z. Ghahramani},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{heller2010elastictree,\n\ttitle        = {ElasticTree: saving energy in data center networks},\n\tauthor       = {\n\t\tHeller, Brandon and Seetharaman, Srini and Mahadevan, Priya and Yiakoumis,\n\n\t\tYiannis and Sharma, Puneet and Banerjee, Sujata and McKeown, Nick\n\t},\n\tyear         = 2010,\n\tbooktitle    = {\n\t\tProceedings of the 7th USENIX conference on Networked systems design\n\n\t\tand implementation\n\t},\n\tlocation     = {San Jose, California},\n\tpublisher    = {USENIX Association},\n\taddress      = {Berkeley, CA, USA},\n\tseries       = {NSDI'10},\n\tpages        = {17--17},\n\tacmid        = 1855728,\n\tnumpages     = 1\n}\n@article{hellman1970nearest,\n\ttitle        = {The nearest neighbor classification rule with a reject option},\n\tauthor       = {Martin E Hellman},\n\tyear         = 1970,\n\tjournal      = {IEEE Transactions on Systems Science and Cybernetics},\n\tvolume       = 6,\n\tnumber       = 
3,\n\tpages        = {179--185}\n}\n@article{hellman1970probability,\n\ttitle        = {Probability of error, equivocation, and the Chernoff bound},\n\tauthor       = {Martin Hellman and Josef Raviv},\n\tyear         = 1970,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 16,\n\tnumber       = 4,\n\tpages        = {368--372}\n}\n@article{helmberg2007spectral,\n\ttitle        = {Smoothing technique and its applications in semidefinite optimization},\n\tauthor       = {Yurii Nesterov},\n\tyear         = 2007,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 110,\n\tnumber       = 2,\n\tpages        = {245--259}\n}\n@inproceedings{helmbold1997some,\n\ttitle        = {Some label efficient learning results},\n\tauthor       = {David Helmbold and Sandra Panizza},\n\tyear         = 1997,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {218--230}\n}\n@article{helmbold2015inductive,\n\ttitle        = {On the Inductive Bias of Dropout},\n\tauthor       = {David P. Helmbold and Philip M. 
Long},\n\tyear         = 2015,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 16,\n\tpages        = {3403--3454}\n}\n@inproceedings{henaff2017tracking,\n\ttitle        = {Tracking the World State with Recurrent Entity Networks},\n\tauthor       = {Mikael Henaff and Jason Weston and Arthur Szlam and Antoine Bordes and Yann LeCun},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{henaff2020data,\n\ttitle        = {Data-efficient image recognition with contrastive predictive coding},\n\tauthor       = {Henaff, Olivier},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {4182--4192},\n\torganization = {PMLR}\n}\n@article{henderson2012influence,\n\ttitle        = {The influence of race and ethnicity on the biology of cancer},\n\tauthor       = {Brian E Henderson and Norman H Lee and Victoria Seewaldt and Hongbing Shen},\n\tyear         = 2012,\n\tjournal      = {Nature Reviews Cancer},\n\tvolume       = 12,\n\tnumber       = 9,\n\tpages        = {648--653}\n}\n@article{henderson2017deep,\n\ttitle        = {Deep reinforcement learning that matters},\n\tauthor       = {P. Henderson and R. Islam and P. Bachman and J. Pineau and D. Precup and D. 
Meger},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.06560}\n}\n@inproceedings{hendrickx2010relations,\n\ttitle        = {SemEval-2010 Task 8: Multi-Way Classification of Semantic Relations Between Pairs of Nominals},\n\tauthor       = {Iris Hendrickx and Su Nam Kim and Zornitsa Kozareva and Preslav Nakov and Diarmuid OSeaghdha and Sebastian Pado and Marco Pennacchiotti and Lorenza Romano and Stan Szpakowicz},\n\tyear         = 2010,\n\tbooktitle    = {5th International Workshop on Semantic Evaluation}\n}\n@article{hendrix1978developing,\n\ttitle        = {Developing a natural language interface to complex data},\n\tauthor       = {Gary G Hendrix and Earl D Sacerdoti and Daniel Sagalowicz and Jonathan Slocum},\n\tyear         = 1978,\n\tjournal      = {ACM Transactions on Database Systems (TODS)},\n\tvolume       = 3,\n\tpages        = {105--147}\n}\n@inproceedings{hendrycks2017baseline,\n\ttitle        = {A Baseline for Detecting Misclassified and Out-of-Distribution Examples in Neural Networks},\n\tauthor       = {Dan Hendrycks and Kevin Gimpel},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{hendrycks2017early,\n\ttitle        = {Early Methods for Detecting Adversarial Images},\n\tauthor       = {Dan Hendrycks and Kevin Gimpel},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@inproceedings{hendrycks2019anomaly,\n\ttitle        = {Deep Anomaly Detection with Outlier Exposure},\n\tauthor       = {Dan Hendrycks and Mantas Mazeika and Thomas Dietterich},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{hendrycks2019augmix,\n\ttitle        = {Augmix: A simple data processing method to improve robustness and uncertainty},\n\tauthor       = {Dan Hendrycks and Norman Mu and Ekin D Cubuk and Barret Zoph and Justin Gilmer and Balaji 
Lakshminarayanan},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{hendrycks2019benchmarking,\n\ttitle        = {Benchmarking neural network robustness to common corruptions and perturbations},\n\tauthor       = {Hendrycks, Dan and Dietterich, Thomas},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.12261},\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{hendrycks2019natural,\n\ttitle        = {Natural adversarial examples},\n\tauthor       = {Dan Hendrycks and Kevin Zhao and Steven Basart and Jacob Steinhardt and Dawn Song},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.07174}\n}\n@inproceedings{hendrycks2019pretraining,\n\ttitle        = {Using Pre-Training Can Improve Model Robustness and Uncertainty},\n\tauthor       = {Dan Hendrycks and Kimin Lee and Mantas Mazeika},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hendrycks2019selfsupervised,\n\ttitle        = {Using Self-Supervised Learning Can Improve Model Robustness and Uncertainty},\n\tauthor       = {Dan Hendrycks and Mantas Mazeika and Saurav Kadavath and Dawn Song},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@misc{hendrycks2020gaussian,\n\ttitle        = {Gaussian Error Linear Units (GELUs)},\n\tauthor       = {Dan Hendrycks and Kevin Gimpel},\n\tyear         = 2020,\n\teprint       = {1606.08415},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{hendrycks2020many,\n\ttitle        = {The many faces of robustness: A critical analysis of out-of-distribution generalization},\n\tauthor       = {Dan Hendrycks and Steven Basart and Norman Mu and Saurav Kadavath and Frank Wang and Evan Dorundo and Rahul Desai and Tyler Zhu and Samyak Parajuli and Mike Guo and Dawn Song and Jacob Steinhardt and Justin 
Gilmer},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.16241}\n}\n@article{hendrycks2020pretrained,\n\ttitle        = {Pretrained transformers improve out-of-distribution robustness},\n\tauthor       = {Dan Hendrycks and Xiaoyuan Liu and Eric Wallace and Adam Dziedzic and Rishabh Krishnan and Dawn Song},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.06100}\n}\n@inproceedings{henrion1986propagating,\n\ttitle        = {\n\t\tPropagating uncertainty in {B}ayesian networks by probabilistic\n\n\t\tlogic sampling\n\t},\n\tauthor       = {Max Henrion},\n\tyear         = 1986,\n\tbooktitle    = {UAI},\n\tpages        = {149--164}\n}\n@inproceedings{henrion2005detecting,\n\ttitle        = {Detecting global optimality and extracting solutions in {G}lopti{P}oly},\n\tauthor       = {Didier Henrion and Jean-Bernard Lasserre},\n\tyear         = 2005,\n\tbooktitle    = {Positive polynomials in control},\n\tpages        = {293--310}\n}\n@inproceedings{henzinger02lazy,\n\ttitle        = {Lazy Abstraction},\n\tauthor       = {Thomas A. Henzinger and Ranjit Jhala and Rupak Majumdar and Grégoire Sutre},\n\tyear         = 2002,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@article{herbert2021scalable,\n\ttitle        = {Scalable Learning of Safety Guarantees for Autonomous Systems using {Hamilton-Jacobi} Reachability},\n\tauthor       = {Sylvia L. Herbert and Jason J. Choi and Suvansh Qazi and Marsalis Gibson and K. Sreenath and C. Tomlin},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.05916}\n}\n@inproceedings{herda2000skeleton,\n\ttitle        = {\n\t\tSkeleton-Based Motion Capture for Robust Reconstruction of Human\n\n\t\tMotion\n\t},\n\tauthor       = {\n\t\tHerda, L. and Fua, P. and Pl\\\"{a}nkers, R. and Boulic, R. 
and Thalmann,\n\n\t\tD.\n\t},\n\tyear         = 2000,\n\tbooktitle    = {Proceedings of the Computer Animation},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {CA '00},\n\tpages        = {77--86},\n\tacmid        = 872908,\n\tkeywords     = {Motion capture, skeleton-based tracking}\n}\n@inproceedings{herlant2016assistive,\n\ttitle        = {Assistive teleoperation of robot arms via automatic time-optimal mode switching},\n\tauthor       = {Laura V Herlant and Rachel M Holladay and Siddhartha S Srinivasa},\n\tyear         = 2016,\n\tbooktitle    = {ACM/IEEE International Conference on Human Robot Interaction (HRI)},\n\tpages        = {35--42}\n}\n@inproceedings{hermann2014semantic,\n\ttitle        = {Semantic frame identification with distributed word representations},\n\tauthor       = {Karl Moritz Hermann and Dipanjan Das and Jason Weston and Kuzman Ganchev},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{hermann2015read,\n\ttitle        = {Teaching Machines to Read and Comprehend},\n\tauthor       = {Karl Moritz Hermann and Tomáš Kočiský and Edward Grefenstette and Lasse Espeholt and Will Kay and Mustafa Suleyman and Phil Blunsom},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{hermann2017grounded,\n\ttitle        = {Grounded Language Learning in a Simulated 3D World},\n\tauthor       = {Karl Moritz Hermann and Felix Hill and Simon Green and Fumin Wang and Ryan Faulkner and Hubert Soyer and David Szepesvari and Wojciech Czarnecki and Max Jaderberg and Denis Teplyashin and Marcus Wainwright and Chris Apps and Demis Hassabis and Phil Blunsom},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.06551}\n}\n@article{hermans2012recurrent,\n\ttitle        = {Recurrent kernel machines: Computing with infinite echo state networks},\n\tauthor       = {Michiel Hermans 
and Benjamin Schrauwen},\n\tyear         = 2012,\n\tjournal      = {Neural Computation},\n\tvolume       = 24,\n\tnumber       = 1,\n\tpages        = {104--133}\n}\n@inproceedings{hermjakob01parsing,\n\ttitle        = {Parsing and Question Classification for Question Answering},\n\tauthor       = {Ulf Hermjakob},\n\tyear         = 2001,\n\tbooktitle    = {Workshop on Open-domain question answering, ACL},\n\tpages        = {1--6}\n}\n@inproceedings{herzig2017multi,\n\ttitle        = {Neural Semantic Parsing over Multiple Knowledge-bases},\n\tauthor       = {Jonathan Herzig and Jonathan Berant},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{herzig2018mapping,\n\ttitle        = {Mapping Images to Scene Graphs with Permutation-Invariant Structured Prediction},\n\tauthor       = {Roei Herzig and Moshiko Raboh and Gal Chechik and Jonathan Berant and Amir Globerson},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{herzig2018zeroshot,\n\ttitle        = {Decoupling Structure and Lexicon for Zero-Shot Semantic Parsing},\n\tauthor       = {Jonathan Herzig and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{herzig2019detect,\n\ttitle        = {Don't paraphrase, detect! 
Rapid and Effective Data Collection for Semantic Parsing},\n\tauthor       = {Jonathan Herzig and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@book{hespanha2009,\n\ttitle        = {Linear systems theory},\n\tauthor       = {Hespanha, Joao P},\n\tyear         = 2009,\n\tpublisher    = {Princeton university press}\n}\n@article{hessel2017rainbow,\n\ttitle        = {Rainbow: Combining Improvements in Deep Reinforcement Learning},\n\tauthor       = {Matteo Hessel and Joseph Modayil and Hado Van Hasselt and Tom Schaul and Georg Ostrovski and Will Dabney and Dan Horgan and Bilal Piot and Mohammad Azar and David Silver},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.02298}\n}\n@article{HessianPearlmutter,\n\ttitle        = {Fast exact multiplication by the Hessian},\n\tauthor       = {Pearlmutter, Barak A},\n\tyear         = 1994,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 6,\n\tnumber       = 1,\n\tpages        = {147--160}\n}\n@article{hester2017learning,\n\ttitle        = {Learning from Demonstrations for Real World Reinforcement Learning},\n\tauthor       = {Todd Hester and Matej Vecerik and Olivier Pietquin and Marc Lanctot and Tom Schaul and Bilal Piot and Andrew Sendonaris and Gabriel Dulac-Arnold and Ian Osband and John Agapiou and others},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.03732}\n}\n@inproceedings{hester2018deep,\n\ttitle        = {Deep {Q}-learning from Demonstrations},\n\tauthor       = {Todd Hester and Matej Vecerik and Olivier Pietquin and Marc Lanctot and Tom Schaul and Bilal Piot and Andrew Sendonaris and Gabriel Dulac{-}Arnold and Ian Osband and John Agapiou and Joel Z. 
Leibo and Audrunas Gruslys},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{hewitt2019control,\n\ttitle        = {Designing and Interpreting Probes with Control Tasks},\n\tauthor       = {John Hewitt and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{hewitt2019structural,\n\ttitle        = {A structural probe for finding syntax in word representations},\n\tauthor       = {Hewitt, John and Manning, Christopher D},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},\n\tpages        = {4129--4138}\n}\n@inproceedings{hewitt2020rnn,\n\ttitle        = {{RNN}s can generate bounded hierarchical languages with optimal memory},\n\tauthor       = {John Hewitt and Michael Hahn and Surya Ganguli and Percy Liang and Christopher D. Manning},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{hewitt2020rnns,\n\ttitle        = {RNNs can generate bounded hierarchical languages with optimal memory},\n\tauthor       = {Hewitt, John and Hahn, Michael and Ganguli, Surya and Liang, Percy and Manning, Christopher D},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.07515}\n}\n@inproceedings{hewitt2021conditional,\n\ttitle        = {Conditional probing: measuring usable information beyond a baseline},\n\tauthor       = {John Hewitt and Kawin Ethayarajh and Percy Liang and Christopher D. 
Manning},\n\tyear         = 2021,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{hewlett2016wikireading,\n\ttitle        = {Wikireading: A novel large-scale language understanding task over {W}ikipedia},\n\tauthor       = {Daniel Hewlett and Alexandre Lacoste and Llion Jones and Illia Polosukhin and Andrew Fandrianto and Jay Han and Matthew Kelcey and David Berthelot},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{hewlett2017accurate,\n\ttitle        = {Accurate Supervised and Semi-Supervised Machine Reading for Long Documents},\n\tauthor       = {Daniel Hewlett and Llion Jones and Alexandre Lacoste and others},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2011--2020}\n}\n@inproceedings{HHR03,\n\ttitle        = {A polynomial-time tree decomposition to minimize congestion},\n\tauthor       = {Harrelson, Chris and Hildrum, Kirsten and Rao, Satish},\n\tyear         = 2003,\n\tseries       = {SPAA '03},\n\tpages        = {34--43},\n\tisbn         = {1-58113-661-7},\n\tnumpages     = 10\n}\n@article{hicks2017missing,\n\ttitle        = {Missing data and technical variability in single-cell {RNA}-sequencing experiments},\n\tauthor       = {Stephanie C Hicks and F William Townes and Mingxiang Teng and Rafael A Irizarry},\n\tyear         = 2017,\n\tjournal      = {Biostatistics},\n\tvolume       = 19,\n\tnumber       = 4,\n\tpages        = {562--578}\n}\n@inproceedings{higgins2017beta,\n\ttitle        = {beta-vae: Learning basic visual concepts with a constrained variational framework},\n\tauthor       = {Irina Higgins and Loic Matthey and Arka Pal and Christopher Burgess and Xavier Glorot and Matthew Botvinick and Shakir Mohamed and Alexander Lerchner},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{Higham2008,\n\ttitle  
      = {Functions of Matrices},\n\tauthor       = {Higham, N.},\n\tyear         = 2008,\n\tpublisher    = {Society for Industrial and Applied Mathematics},\n\taddress      = {},\n\tdoi          = {10.1137/1.9780898717778},\n\turl          = {http://epubs.siam.org/doi/abs/10.1137/1.9780898717778},\n\tedition      = {},\n\teprint       = {http://epubs.siam.org/doi/pdf/10.1137/1.9780898717778}\n}\n@inproceedings{hill2015goldilocks,\n\ttitle        = {The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations},\n\tauthor       = {Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{hill2020human,\n\ttitle        = {Human Instruction-Following with Deep Reinforcement Learning via Transfer-Learning from Text},\n\tauthor       = {Felix Hill and Sona Mokra and Nathaniel Wong and Tim Harley},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.09382}\n}\n@article{hill2020wrongfully,\n\ttitle        = {Wrongfully Accused by an Algorithm},\n\tauthor       = {Kashmir Hill},\n\tyear         = 2020,\n\tjournal      = {The New York Times},\n\turl          = {https://www.nytimes.com/2020/06/24/technology/facial-recognition-arrest.html}\n}\n@article{hillar2009most,\n\ttitle        = {Most tensor problems are {NP} hard},\n\tauthor       = {C. Hillar and L.-H. Lim},\n\tyear         = 2013,\n\tjournal      = {J. ACM}\n}\n@article{hillar2013most,\n\ttitle        = {Most tensor problems are NP-hard},\n\tauthor       = {Hillar, Christopher J and Lim, Lek-Heng},\n\tyear         = 2013,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 60,\n\tnumber       = 6,\n\tpages        = 45\n}\n@article{HillarL13,\n\ttitle        = {Most Tensor Problems Are NP-Hard},\n\tauthor       = {Christopher J. 
Hillar and Lek{-}Heng Lim},\n\tyear         = 2013,\n\tmonth        = nov,\n\tjournal      = {J. {ACM}},\n\tvolume       = 60,\n\tnumber       = 6,\n\tpages        = 45,\n\tdoi          = {10.1145/2512329},\n\turl          = {http://doi.acm.org/10.1145/2512329},\n\ttimestamp    = {Fri, 06 Dec 2013 15:28:53 +0100},\n\tbiburl       = {http://dblp2.uni-trier.de/rec/bib/journals/jacm/HillarL13},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tarticle      = 45\n}\n@inproceedings{hills2013rapid,\n\ttitle        = {Rapid Exploration of Processing and Design Guidelines to Overcome Carbon Nanotube Variations},\n\tauthor       = {Gage Hills and Jie Zhang and Charles Mackin and Max Shulaker and Hai Wei and Hon Sun Philip Wong and Subhasish Mitra},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 50th Annual Design Automation Conference}\n}\n@inproceedings{hindman2011mesos,\n\ttitle        = {Mesos: A Platform for Fine-Grained Resource Sharing in the Data Center.},\n\tauthor       = {Hindman, Benjamin and Konwinski, Andy and Zaharia, Matei and Ghodsi, Ali and Joseph, Anthony D and Katz, Randy H and Shenker, Scott and Stoica, Ion},\n\tyear         = 2011,\n\tbooktitle    = {NSDI},\n\tvolume       = 11,\n\tpages        = {22--22}\n}\n@article{hintikka1975impossible,\n\ttitle        = {Impossible Possible Worlds Vindicated},\n\tauthor       = {Jaakko Hintikka},\n\tyear         = 1975,\n\tjournal      = {Journal of Philosophical Logic},\n\tvolume       = 4,\n\tnumber       = 4\n}\n@book{hinton1984distributed,\n\ttitle        = {Parallel Distributed Processing: Explorations in the Microstructure of Cognition},\n\tyear         = 1986,\n\teditor       = {Rumelhart, David E. and Hinton, Geoffrey E. 
and McClelland, James L.}\n}\n@article{Hinton2002,\n\ttitle        = {Training products of experts by minimizing contrastive divergence},\n\tauthor       = {Hinton, Geoffrey E.},\n\tyear         = 2002,\n\tmonth        = aug,\n\tjournal      = {Neural Comput.},\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA, USA},\n\tvolume       = 14,\n\tnumber       = 8,\n\tpages        = {1771--1800},\n\tdoi          = {10.1162/089976602760128018},\n\tissn         = {0899-7667},\n\turl          = {http://dx.doi.org/10.1162/089976602760128018},\n\tacmid        = 639730,\n\tissue_date   = {August 2002},\n\tnumpages     = 30,\n\towner        = {gewor_000},\n\ttimestamp    = {2013.09.15}\n}\n@article{Hinton2006,\n\ttitle        = {A fast learning algorithm for deep belief nets},\n\tauthor       = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee-Whye},\n\tyear         = 2006,\n\tmonth        = jul,\n\tjournal      = {Neural Comput.},\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA, USA},\n\tvolume       = 18,\n\tnumber       = 7,\n\tpages        = {1527--1554},\n\tdoi          = {10.1162/neco.2006.18.7.1527},\n\tissn         = {0899-7667},\n\turl          = {http://dx.doi.org/10.1162/neco.2006.18.7.1527},\n\tacmid        = 1161605,\n\tissue_date   = {July 2006},\n\tnumpages     = 28\n}\n@article{hinton2006fast,\n\ttitle        = {A fast learning algorithm for deep belief nets},\n\tauthor       = {Geoffrey E Hinton and Simon Osindero and Yee-Whye Teh},\n\tyear         = 2006,\n\tjournal      = {Neural computation},\n\tvolume       = 18,\n\tnumber       = 7,\n\tpages        = {1527--1554}\n}\n@techreport{hinton2010practical,\n\ttitle        = {A practical guide to training restricted {B}oltzmann machines},\n\tauthor       = {G. 
Hinton},\n\tyear         = 2010,\n\tinstitution  = {University of Toronto}\n}\n@article{hinton2012improving,\n\ttitle        = {Improving neural networks by preventing co-adaptation of feature detectors},\n\tauthor       = {Geoffrey E Hinton and Nitish Srivastava and Alex Krizhevsky and Ilya Sutskever and Ruslan R Salakhutdinov},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1207.0580}\n}\n@article{hinton2012speech,\n\ttitle        = {Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},\n\tauthor       = {Geoffrey Hinton and Li Deng and Dong Yu and George E Dahl and Abdel-rahman Mohamed and Navdeep Jaitly and Andrew Senior and Vincent Vanhoucke and Patrick Nguyen and Tara N Sainath and others},\n\tyear         = 2012,\n\tjournal      = {Signal Processing Magazine, IEEE},\n\tvolume       = 29,\n\tnumber       = 6,\n\tpages        = {82--97}\n}\n@inproceedings{hinton2015distilling,\n\ttitle        = {Distilling the Knowledge in a Neural Network},\n\tauthor       = {Geoffrey Hinton and Oriol Vinyals and Jeffrey Dean},\n\tyear         = 2015,\n\tbooktitle    = {NIPS Deep Learning and Representation Learning Workshop}\n}\n@inproceedings{hinton99poe,\n\ttitle        = {Products of Experts},\n\tauthor       = {Geoffrey Hinton},\n\tyear         = 1999,\n\tbooktitle    = {International Conference on Artificial Neural Networks (ICANN)}\n}\n@inproceedings{hintz2016language,\n\ttitle        = {Language transfer learning for supervised lexical substitution},\n\tauthor       = {Hintz, Gerold and Biemann, Chris},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{hiraoka2014framing,\n\ttitle        = {Reinforcement Learning of Cooperative Persuasive Dialogue Policies using Framing},\n\tauthor       = {Hiraoka, Takuya and Neubig, Graham and Sakti, Sakriani and Toda, Tomoki and Nakamura, Satoshi},\n\tyear         = 2014,\n\tbooktitle    = 
{International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{hiraoka2015trading,\n\ttitle        = {Reinforcement Learning in Multi-Party Trading Dialog},\n\tauthor       = {Takuya Hiraoka and Kallirroi Georgila and Elnaz Nouri and David Traum},\n\tyear         = 2015,\n\tbooktitle    = {Special Interest Group on Discourse and Dialogue (SIGDIAL)}\n}\n@article{hiraoka2020meta,\n\ttitle        = {Meta-Model-Based Meta-Policy Optimization},\n\tauthor       = {Takuya Hiraoka and Takahisa Imagawa and Voot Tangkaratt and Takayuki Osa and Takashi Onishi and Yoshimasa Tsuruoka},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.02608}\n}\n@inproceedings{hirschman1999deep,\n\ttitle        = {Deep read: A reading comprehension system},\n\tauthor       = {Lynette Hirschman and Marc Light and Eric Breck and John D Burger},\n\tyear         = 1999,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {325--332}\n}\n@article{hirschman2001natural,\n\ttitle        = {Natural language question answering: the view from here},\n\tauthor       = {Lynette Hirschman and Robert Gaizauskas},\n\tyear         = 2001,\n\tjournal      = {Natural Language Engineering},\n\tvolume       = 7\n}\n@inproceedings{hixon2015dialog,\n\ttitle        = {Learning knowledge graphs for question answering through conversational dialog},\n\tauthor       = {Ben Hixon and Peter Clark and Hannaneh Hajishirzi},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{HJ,\n\ttitle        = {Matrix Analysis},\n\tauthor       = {R. Horn and C. 
Johnson},\n\tyear         = 1990,\n\tbooktitle    = {Cambridge University Press}\n}\n@inproceedings{hjelm2018learning,\n\ttitle        = {Learning deep representations by mutual information estimation and maximization},\n\tauthor       = {Hjelm, R Devon and Fedorov, Alex and Lavoie-Marchildon, Samuel and Grewal, Karan and Bachman, Phil and Trischler, Adam and Bengio, Yoshua},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{hjorungnes2007complex,\n\ttitle        = {Complex-Valued Matrix Differentiation: Techniques and Key Results},\n\tauthor       = {Hjorungnes, A. and Gesbert, D.},\n\tyear         = 2007,\n\tmonth        = jun,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 55,\n\tnumber       = 6,\n\tpages        = {2740--2746},\n\tabstract     = {\n\t\tA systematic theory is introduced for finding the derivatives of complex-valued\n\n\t\tmatrix functions with respect to a complex-valued matrix variable\n\n\t\tand the complex conjugate of this variable. In the framework introduced,\n\n\t\tthe differential of the complex-valued matrix function is used to\n\n\t\tidentify the derivatives of this function. Matrix differentiation\n\n\t\tresults are derived and summarized in tables which can be exploited\n\n\t\tin a wide range of signal processing related situations\n\t},\n\tkeywords     = {\n\t\tcomplex conjugate;complex-valued matrix differentiation;complex-valued\n\n\t\tmatrix function;signal processing;matrix algebra;signal processing;\n\t}\n}\n@misc{HK12,\n\ttitle        = {Learning mixtures of spherical {G}aussians: moment methods and spectral decompositions},\n\tauthor       = {Daniel Hsu and Sham M. 
Kakade},\n\tyear         = 2012,\n\tbooktitle    = {Fourth Innovations in Theoretical Computer Science},\n\turl          = {http://arxiv.org/abs/1206.5766},\n\tnote         = {arXiv:1206.5766 (to appear in ITCS, 2013)},\n\teprint       = {arXiv:1206.5766}\n}\n@inproceedings{HKZ09,\n\ttitle        = {A spectral algorithm for learning hidden {M}arkov models},\n\tauthor       = {D. Hsu and S. M. Kakade and T. Zhang},\n\tyear         = 2009,\n\tbooktitle    = {COLT}\n}\n@article{HKZ12,\n\ttitle        = {A spectral algorithm for learning hidden {M}arkov models},\n\tauthor       = {Daniel Hsu and Sham M. Kakade and Tong Zhang},\n\tyear         = 2012,\n\tjournal      = {Journal of Computer and System Sciences},\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tvolume       = 78,\n\tnumber       = 5,\n\tpages        = {1460--1480}\n}\n@incollection{HLA,\n\ttitle        = {Tensors and hypermatrices},\n\tauthor       = {L.-H. Lim},\n\tyear         = 2013,\n\tbooktitle    = {Handbook of Linear Algebra},\n\tpublisher    = {CRC Press},\n\teditor       = {L. Hogben},\n\tedition      = {2nd}\n}\n@inproceedings{HLM2015,\n\ttitle        = {Variance Reduced Stochastic Gradient Descent with Neighbors},\n\tauthor       = {Hofmann, Thomas and Lucchi, Aurelien and Lacoste-Julien, Simon and McWilliams, Brian},\n\tyear         = 2015,\n\tbooktitle    = {NIPS 2015},\n\tpages        = {2296--2304}\n}\n@inproceedings{hlszz18,\n\ttitle        = {Spectral Filtering for General Linear Dynamical Systems},\n\tauthor       = {Hazan, Elad and Lee, Holden and Singh, Karan and Zhang, Cyril and Zhang, Yi},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{HO00,\n\ttitle        = {Independent component analysis: algorithms and applications},\n\tauthor       = {A. Hyv{\\\"a}rinen and E. 
Oja},\n\tyear         = 2000,\n\tjournal      = {Neural Networks},\n\tvolume       = 13,\n\tnumber       = {4--5},\n\tpages        = {411--430}\n}\n@inproceedings{ho2013more,\n\ttitle        = {More effective distributed ml via a stale synchronous parallel parameter server},\n\tauthor       = {Ho, Qirong and Cipar, James and Cui, Henggang and Lee, Seunghak and Kim, Jin Kyu and Gibbons, Phillip B and Gibson, Garth A and Ganger, Greg and Xing, Eric},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1223--1231}\n}\n@article{ho2014comparative,\n\ttitle        = {Comparative analysis of metazoan chromatin organization},\n\tauthor       = {Joshua WK Ho and Youngsook L Jung and Tao Liu and Burak H Alver and Soohyun Lee and Kohta Ikegami and Kyung-Ah Sohn and Aki Minoda and Michael Y Tolstorukov and Alex Appert and others},\n\tyear         = 2014,\n\tjournal      = {Nature},\n\tvolume       = 512,\n\tnumber       = 7515,\n\tpages        = {449--452}\n}\n@inproceedings{ho2016generative,\n\ttitle        = {Generative adversarial imitation learning},\n\tauthor       = {J. Ho and S. 
Ermon},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {4565--4573}\n}\n@article{hochreiter1997flat,\n\ttitle        = {Flat minima},\n\tauthor       = {Hochreiter, Sepp and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 1997,\n\tjournal      = {Neural Computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {1--42}\n}\n@inproceedings{hochreiter2001learning,\n\ttitle        = {Learning to learn using gradient descent},\n\tauthor       = {Sepp Hochreiter and A Steven Younger and Peter R Conwell},\n\tyear         = 2001,\n\tbooktitle    = {International Conference on Artificial Neural Networks (ICANN)},\n\tpages        = {87--94}\n}\n@phdthesis{hockenmaier03ccg,\n\ttitle        = {Data and Models for Statistical Parsing with Combinatory Categorial Grammar},\n\tauthor       = {Julia Hockenmaier},\n\tyear         = 2003,\n\tschool       = {University of Edinburgh}\n}\n@article{hodge2004survey,\n\ttitle        = {A survey of outlier detection methodologies},\n\tauthor       = {Victoria Hodge and Jim Austin},\n\tyear         = 2004,\n\tjournal      = {Artificial intelligence review},\n\tvolume       = 22,\n\tnumber       = 2,\n\tpages        = {85--126}\n}\n@article{hodosh2013framing,\n\ttitle        = {Framing image description as a ranking task: Data, models and evaluation metrics},\n\tauthor       = {Micah Hodosh and Peter Young and Julia Hockenmaier},\n\tyear         = 2013,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 47,\n\tpages        = {853--899}\n}\n@book{hoeck2015counterfactual,\n\ttitle        = {Cognitive neuroscience of human counterfactual reasoning},\n\tauthor       = {Nicole Van Hoeck and Patrick D. Watson and Aron K. 
Barbey},\n\tyear         = 2015,\n\tpublisher    = {Frontiers in Human Neuroscience}\n}\n@article{hoeffding,\n\ttitle        = {Probability Inequalities for Sums of Bounded Random Variables},\n\tauthor       = {Wassily Hoeffding},\n\tyear         = 1963,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {[American Statistical Association, Taylor \\& Francis, Ltd.]},\n\tvolume       = 58,\n\tnumber       = 301,\n\tpages        = {13--30},\n\tissn         = {01621459},\n\turl          = {http://www.jstor.org/stable/2282952},\n\tabstract     = {Upper bounds are derived for the probability that the sum S of n independent random variables exceeds its mean ES by a positive number nt. It is assumed that the range of each summand of S is bounded or bounded above. The bounds for <tex-math>$\\Pr \\{ S - ES \\geq nt \\}$</tex-math> depend only on the endpoints of the ranges of the summands and the mean, or the mean and the variance of S. These results are then used to obtain analogous inequalities for certain sums of dependent random variables such as U statistics and the sum of a random sample without replacement from a finite population.}\n}\n@article{hoeffding1963,\n\ttitle        = {Probability Inequalities for Sums of Bounded Random Variables},\n\tauthor       = {Hoeffding, Wassily},\n\tyear         = 1963,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {American Statistical Association},\n\tvolume       = 58,\n\tnumber       = 301,\n\tpages        = {pp. 13--30},\n\tissn         = {01621459},\n\turl          = {http://www.jstor.org/stable/2282952},\n\tcopyright    = {Copyright © 1963 American Statistical Association},\n\tabstract     = {Upper bounds are derived for the probability that the sum S of n independent random variables exceeds its mean ES by a positive number nt. It is assumed that the range of each summand of S is bounded or bounded above. 
The bounds for <tex-math>$\\Pr \\{ S - ES \\geq nt \\}$</tex-math> depend only on the endpoints of the ranges of the summands and the mean, or the mean and the variance of S. These results are then used to obtain analogous inequalities for certain sums of dependent random variables such as U statistics and the sum of a random sample without replacement from a finite population.},\n\tjstor_articletype = {research-article},\n\tjstor_formatteddate = {Mar., 1963},\n\tlanguage     = {English}\n}\n@inproceedings{Hof,\n\ttitle        = {Probabilistic latent semantic analysis},\n\tauthor       = {T. Hofmann},\n\tyear         = 1999,\n\tbooktitle    = {UAI},\n\tpages        = {289--296}\n}\n@inproceedings{hofer2014extracting,\n\ttitle        = {Extracting Kinematic Background Knowledge from Interactions Using Task-Sensitive Relational Learning},\n\tauthor       = {S. H{\\\"o}fer and T. Lang and O. Brock},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{hoffer2017train,\n\ttitle        = {Train longer, generalize better: closing the generalization gap in large batch training of neural networks},\n\tauthor       = {Hoffer, Elad and Hubara, Itay and Soudry, Daniel},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1731--1741}\n}\n@article{hoffer2018fix,\n\ttitle        = {Fix your classifier: the marginal value of training the last weight layer},\n\tauthor       = {Hoffer, Elad and Hubara, Itay and Soudry, Daniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.04540}\n}\n@inproceedings{hoffer2018norm,\n\ttitle        = {Norm matters: efficient and accurate normalization schemes in deep networks},\n\tauthor       = {Hoffer, Elad and Banner, Ron and Golan, Itay and Soudry, Daniel},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = 
{2160--2170}\n}\n@article{hoffman1966on,\n\ttitle        = {On nonterminating stochastic games},\n\tauthor       = {Hoffman, Alan J and Karp, Richard M},\n\tyear         = 1966,\n\tjournal      = {Management Sci.},\n\tvolume       = 12,\n\tpages        = {359--370},\n\tdoi          = {10.1287/mnsc.12.5.359},\n\tissn         = {0025-1909},\n\turl          = {https://doi.org/10.1287/mnsc.12.5.359},\n\tfjournal     = {Management Science. Journal of the Institute of Management Science. Application and Theory Series},\n\tmrclass      = {90.72},\n\tmrnumber     = {0189842}\n}\n@inproceedings{hoffman2010online,\n\ttitle        = {Online learning for latent {D}irichlet allocation},\n\tauthor       = {M. D. Hoffman and D. M. Blei and F. Bach},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@inproceedings{hoffman2012discovering,\n\ttitle        = {Discovering Latent Domains for Multisource Domain Adaptation},\n\tauthor       = {Judy Hoffman and Brian Kulis and Trevor Darrell and Kate Saenko},\n\tyear         = 2012,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {702--715}\n}\n@inproceedings{hoffman2014continuous,\n\ttitle        = {Continuous Manifold Based Adaptation for Evolving Visual Domains},\n\tauthor       = {Judy Hoffman and Trevor Darrell and Kate Saenko},\n\tyear         = 2014,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{hoffman2018cycada,\n\ttitle        = {CyCADA: Cycle Consistent Adversarial Domain Adaptation},\n\tauthor       = {Judy Hoffman and Eric Tzeng and Taesung Park and Jun-Yan Zhu and Phillip Isola and Kate Saenko and Alexei A. 
Efros and Trevor Darrell},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hoffman2018msda,\n\ttitle        = {Algorithms and Theory for Multiple-source Adaptation},\n\tauthor       = {Judy Hoffman and Mehryar Mohri and Ningshan Zhang},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {8256--8266}\n}\n@inproceedings{hoffmann2011knowledge,\n\ttitle        = {Knowledge-Based Weak Supervision for Information Extraction of Overlapping Relations},\n\tauthor       = {Raphael Hoffmann and Congle Zhang and Xiao Ling and Luke S Zettlemoyer and Daniel S Weld},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {541--550}\n}\n@article{hoffmann2015effect,\n\ttitle        = {Effect of age and gender on reference intervals of red blood cell distribution width ({RDW}) and mean red cell volume ({MCV})},\n\tauthor       = {Johannes JML Hoffmann and Karin CAM Nabbe and Nicole MA van den Broek},\n\tyear         = 2015,\n\tjournal      = {Clinical Chemistry and Laboratory Medicine (CCLM)},\n\tvolume       = 53,\n\tnumber       = 12\n}\n@inproceedings{hofmann1999plsa,\n\ttitle        = {Probabilistic latent semantic analysis},\n\tauthor       = {Thomas Hofmann},\n\tyear         = 1999,\n\tbooktitle    = {UAI}\n}\n@inproceedings{hofmann1999probabilistic,\n\ttitle        = {Probabilistic latent semantic analysis},\n\tauthor       = {Hofmann, Thomas},\n\tyear         = 1999,\n\tbooktitle    = {Proceedings of the Fifteenth Conference on Uncertainty in Artificial Intelligence}\n}\n@inproceedings{hoi2006batch,\n\ttitle        = {Batch mode active learning and its application to medical image classification},\n\tauthor       = {Steven CH Hoi and Rong Jin and Jianke Zhu and Michael R Lyu},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 23rd international conference on Machine 
learning},\n\tpages        = {417--424}\n}\n@inproceedings{hoiem05geometric,\n\ttitle        = {Geometric Context from a Single Image},\n\tauthor       = {Derek Hoiem and Alexei A. Efros and Martial Herbert},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{hoiem07occlusion,\n\ttitle        = {Recovering Occlusion Boundaries from a Single Image},\n\tauthor       = {Derek Hoiem and Andrew N. Stein and Alexei A. Efros and Martial Herbert},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@article{hoke2006intemon,\n\ttitle        = {InteMon: continuous mining of sensor data in large-scale self-infrastructures},\n\tauthor       = {\n\t\tHoke, Evan and Sun, Jimeng and Strunk, John D. and Ganger, Gregory\n\n\t\tR. and Faloutsos, Christos\n\t},\n\tyear         = 2006,\n\tjournal      = {SIGOPS Oper. Syst. Rev.},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 40,\n\tnumber       = 3,\n\tpages        = {38--44},\n\tdoi          = {http://doi.acm.org/10.1145/1151374.1151384},\n\tissn         = {0163-5980}\n}\n@article{holland1983stochastic,\n\ttitle        = {Stochastic blockmodels: Some first steps},\n\tauthor       = {Paul W. Holland and Kathryn B. 
Laskey and Samuel Leinhardt},\n\tyear         = 1983,\n\tjournal      = {Social Networks},\n\tvolume       = 5,\n\tpages        = {109--137}\n}\n@article{holland1986statistics,\n\ttitle        = {Statistics and causal inference},\n\tauthor       = {Paul W Holland},\n\tyear         = 1986,\n\tjournal      = {Journal of the American statistical Association},\n\tvolume       = 81,\n\tnumber       = 396,\n\tpages        = {945--960}\n}\n@article{holland2003causation,\n\ttitle        = {Causation and race},\n\tauthor       = {Paul W Holland},\n\tyear         = 2003,\n\tjournal      = {ETS Research Report Series},\n\tvolume       = 2003,\n\tnumber       = 1\n}\n@inproceedings{holland99proofs,\n\ttitle        = {Verbalization of High-Level Formal Proofs},\n\tauthor       = {Amanda M. Holland-Minkley and Regina Barzilay and Robert L. Constable},\n\tyear         = 1999,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{holodnak2014conditioning,\n\ttitle        = {Conditioning of Leverage Scores and Computation by QR Decomposition},\n\tauthor       = {Holodnak, John T and Ipsen, Ilse CF and Wentworth, Thomas},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1402.0957}\n}\n@inproceedings{holtzman2020curious,\n\ttitle        = {The Curious Case of Neural Text Degeneration},\n\tauthor       = {Ari Holtzman and Jan Buys and Li Du and Maxwell Forbes and Yejin Choi},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{honein2001accutane,\n\ttitle        = {Continued occurrence of Accutane exposed pregnancies},\n\tauthor       = {M.A. Honein and L.J. Paulozzi and J.D. 
Erickson},\n\tyear         = 2001,\n\tbooktitle    = {Teratology}\n}\n@article{hong2004groups,\n\ttitle        = {Groups of diverse problem solvers can outperform groups of high-ability problem solvers},\n\tauthor       = {Lu Hong and Scott E Page},\n\tyear         = 2004,\n\tjournal      = {Science},\n\tvolume       = 101,\n\tnumber       = 46\n}\n@article{hong2018gradient,\n\ttitle        = {Gradient Primal-Dual Algorithm Converges to Second-Order Stationary Solutions for Nonconvex Distributed Optimization},\n\tauthor       = {Hong, Mingyi and Lee, Jason D and Razaviyayn, Meisam},\n\tyear         = 2018,\n\tjournal      = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hong2021disentangling,\n\ttitle        = {Disentangling Label Distribution for Long-tailed Visual Recognition},\n\tauthor       = {Hong, Youngkyu and Han, Seungju and Choi, Kwanghee and Seo, Seokjun and Kim, Beomsu and Chang, Buru},\n\tyear         = 2021,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n\tpages        = {6626--6636}\n}\n@inproceedings{honnibal2015nmdp,\n\ttitle        = {An Improved Non-monotonic Transition System for Dependency Parsing},\n\tauthor       = {Matthew  Honnibal and Mark  Johnson},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1373--1378}\n}\n@inproceedings{hopkins2018mixture,\n\ttitle        = {Mixture Models, Robustness, and Sum of Squares Proofs},\n\tauthor       = {Samuel B. Hopkins and Jerry Li},\n\tyear         = 2018,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@inproceedings{hopm_1995,\n\ttitle        = {Higher-order power method—Application in independent component analysis},\n\tauthor       = {L. De Lathauwer and P. Comon and B. De Moor and J. 
Vandewalle},\n\tyear         = 1995,\n\tbooktitle    = {International Symposium on Nonlinear Theory and Its Applications},\n\tpages        = {91--96}\n}\n@article{horn1984toward,\n\ttitle        = {Toward a new taxonomy for pragmatic inference: {Q}-based and {R}-based implicature},\n\tauthor       = {Laurence Horn},\n\tyear         = 1984,\n\tjournal      = {Meaning, form, and use in context: Linguistic applications},\n\tpages        = {11--42}\n}\n@article{horn1990analog,\n\ttitle        = {An analog of the {C}auchy-{S}chwarz inequality for {H}adamard products and unitarily invariant norms},\n\tauthor       = {Roger A. Horn and Roy Mathias},\n\tyear         = 1990,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 11,\n\tnumber       = 4,\n\tpages        = {481--498}\n}\n@book{horn2012matrix,\n\ttitle        = {Matrix analysis},\n\tauthor       = {Horn, Roger A. and Johnson, Charles R.},\n\tyear         = 2012,\n\tpublisher    = {Cambridge university press}\n}\n@article{hornik1989multilayer,\n\ttitle        = {Multilayer feedforward networks are universal approximators},\n\tauthor       = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},\n\tyear         = 1989,\n\tjournal      = {Neural networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 2,\n\tnumber       = 5,\n\tpages        = {359--366}\n}\n@article{horvath2013dna,\n\ttitle        = {{DNA} methylation age of human tissues and cell types},\n\tauthor       = {Steve Horvath},\n\tyear         = 2013,\n\tjournal      = {Genome Biology},\n\tvolume       = 14,\n\tnumber       = 10\n}\n@article{hosmer1980goodness,\n\ttitle        = {Goodness of fit tests for the multiple logistic regression model},\n\tauthor       = {David W. 
Hosmer and Stanley Lemeshow},\n\tyear         = 1980,\n\tjournal      = {Communications in Statistics - Theory and Methods},\n\tvolume       = 9,\n\tpages        = {1043--1069}\n}\n@inproceedings{hosseini2014learning,\n\ttitle        = {Learning to Solve Arithmetic Word Problems with Verb Categorization},\n\tauthor       = {Mohammad Javad Hosseini and Hannaneh Hajishirzi and Oren Etzioni and Nate Kushman},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {523--533}\n}\n@article{hosseini2017deceiving,\n\ttitle        = {Deceiving {G}oogle's {P}erspective {API} Built for Detecting Toxic Comments},\n\tauthor       = {Hossein Hosseini and Sreeram Kannan and Baosen Zhang and Radha Poovendran},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.08138}\n}\n@inproceedings{hosu2016playing,\n\ttitle        = {Playing {A}tari Games with Deep Reinforcement Learning and Human Checkpoint Replay},\n\tauthor       = {Ionel-Alexandru Hosu and Traian Rebedea},\n\tyear         = 2016,\n\tbooktitle    = {Evaluating General Purpose AI}\n}\n@article{Hotelling35,\n\ttitle        = {The most predictable criterion},\n\tauthor       = {H. 
Hotelling},\n\tyear         = 1935,\n\tjournal      = {Journal of Educational Psychology},\n\tvolume       = 26,\n\tnumber       = 2,\n\tpages        = {139--142}\n}\n@article{houlsby2011bayesian,\n\ttitle        = {{B}ayesian active learning for classification and preference learning},\n\tauthor       = {Neil Houlsby and Ferenc Husz{\\'a}r and Zoubin Ghahramani and M{\\'a}t{\\'e} Lengyel},\n\tyear         = 2011,\n\tjournal      = {arXiv preprint arXiv:1112.5745}\n}\n@article{houlsby2019parameter,\n\ttitle        = {Parameter-Efficient Transfer Learning for {NLP}},\n\tauthor       = {Neil Houlsby and Andrei Giurgiu and Stanislaw Jastrzebski and Bruna Morrone and Quentin de Laroussilhe and Andrea Gesmundo and Mona Attariyan and Sylvain Gelly},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{houthooft2016vime,\n\ttitle        = {Vime: Variational information maximizing exploration},\n\tauthor       = {Rein Houthooft and Xi Chen and Yan Duan and John Schulman and Filip De Turck and Pieter Abbeel},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1109--1117}\n}\n@inproceedings{houthooft2018evolved,\n\ttitle        = {Evolved policy gradients},\n\tauthor       = {Rein Houthooft and Yuhua Chen and Phillip Isola and Bradly Stadie and Filip Wolski and OpenAI Jonathan Ho and Pieter Abbeel},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {5400--5409}\n}\n@inproceedings{hovy2015,\n\ttitle        = {Tagging Performance Correlates with Age},\n\tauthor       = {Dirk Hovy and Anders Søgaard},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {483--488}\n}\n@inproceedings{hovy2016social,\n\ttitle        = {The social impact of natural language processing},\n\tauthor       = {Dirk Hovy and Shannon L Spruit},\n\tyear         = 2016,\n\tbooktitle    = 
{Association for Computational Linguistics (ACL)},\n\tpages        = {591--598}\n}\n@book{howard1960dynamic,\n\ttitle        = {Dynamic programming and {M}arkov processes},\n\tauthor       = {Howard, Ronald A.},\n\tyear         = 1960,\n\tpublisher    = {The MIT press, Cambridge, MA},\n\tpages        = {viii+136},\n\tmrclass      = {90.00},\n\tmrnumber     = {0118514},\n\tmrreviewer   = {R. E. Kalaba}\n}\n@inproceedings{howard2018universal,\n\ttitle        = {Universal language model fine-tuning for text classification},\n\tauthor       = {Jeremy Howard and Sebastian Ruder},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{hoyer2002non,\n\ttitle        = {Non-negative sparse coding},\n\tauthor       = {Hoyer, Patrik O},\n\tyear         = 2002,\n\tbooktitle    = {Neural Networks for Signal Processing, 2002. Proceedings of the 2002 12th IEEE Workshop on},\n\tpages        = {557--565},\n\torganization = {IEEE},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{hron2020infinite,\n\ttitle        = {Infinite attention: NNGP and NTK for deep attention networks},\n\tauthor       = {Hron, Jiri and Bahri, Yasaman and Sohl-Dickstein, Jascha and Novak, Roman},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {4376--4386},\n\torganization = {PMLR}\n}\n@article{hs97,\n\ttitle        = {Long short-term memory},\n\tauthor       = {Hochreiter, Sepp and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 1997,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 9,\n\tnumber       = 8,\n\tpages        = {1735--1780}\n}\n@inproceedings{hsiao2010contact,\n\ttitle        = {Contact-reactive grasping of objects with partial shape information},\n\tauthor       = {K. Hsiao and S. Chitta and M. Ciocarlie and E. 
Jones},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{hsu12identifiability,\n\ttitle        = {Identifiability and Unmixing of Latent Parse Trees},\n\tauthor       = {Daniel Hsu and Sham M. Kakade and Percy Liang},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hsu2004example,\n\ttitle        = {Example-based control of human motion},\n\tauthor       = {Eugene Hsu and Sommer Gentry and Jovan Popovi\\'{c}},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tSCA '04: Proceedings of the 2004 ACM SIGGRAPH/Eurographics symposium\n\n\t\ton Computer animation\n\t},\n\tlocation     = {Grenoble, France},\n\tpublisher    = {Eurographics Association},\n\taddress      = {Aire-la-Ville, Switzerland, Switzerland},\n\tpages        = {69--77},\n\tdoi          = {http://doi.acm.org/10.1145/1028523.1028534},\n\tisbn         = {3-905673-14-2},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{hsu2012random,\n\ttitle        = {Random Design Analysis of Ridge Regression},\n\tauthor       = {Daniel Hsu and Sham M. 
Kakade and Tong Zhang},\n\tyear         = 2012,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{hsu2012spectral,\n\ttitle        = {A spectral algorithm for learning hidden Markov models},\n\tauthor       = {Hsu, Daniel and Kakade, Sham M and Zhang, Tong},\n\tyear         = 2012,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = {Elsevier},\n\tvolume       = 78,\n\tnumber       = 5,\n\tpages        = {1460--1480}\n}\n@article{hsu2012tail,\n\ttitle        = {A tail inequality for quadratic forms of subgaussian random vectors},\n\tauthor       = {Hsu, Daniel and Kakade, Sham and Zhang, Tong and others},\n\tyear         = 2012,\n\tjournal      = {Electronic Communications in Probability},\n\tpublisher    = {The Institute of Mathematical Statistics and the Bernoulli Society},\n\tvolume       = 17\n}\n@inproceedings{hsu2013learning,\n\ttitle        = {Learning mixtures of spherical gaussians: moment methods and spectral decompositions},\n\tauthor       = {Hsu, Daniel and Kakade, Sham M},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 4th conference on Innovations in Theoretical Computer Science},\n\tpages        = {11--20},\n\torganization = {ACM}\n}\n@inproceedings{hsz17,\n\ttitle        = {Learning linear dynamical systems via spectral filtering},\n\tauthor       = {Hazan, Elad and Singh, Karan and Zhang, Cyril},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpages        = {6702--6712}\n}\n@article{hu1996hmm,\n\ttitle        = {{HMM} based online handwriting recognition},\n\tauthor       = {Jianying Hu and Michael K Brown and William Turin},\n\tyear         = 1996,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 18,\n\tnumber       = 10,\n\tpages        = {1039--1045}\n}\n@inproceedings{hu1998multiagent,\n\ttitle        = {Multiagent reinforcement learning: theoretical framework and an 
algorithm.},\n\tauthor       = {Hu, Junling and Wellman, Michael P and others},\n\tyear         = 1998,\n\tbooktitle    = {ICML},\n\tvolume       = 98,\n\tpages        = {242--250},\n\torganization = {Citeseer}\n}\n@article{hu2003nash,\n\ttitle        = {Nash {Q}-learning for general-sum stochastic games},\n\tauthor       = {Hu, Junling and Wellman, Michael P},\n\tyear         = 2003,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 4,\n\tnumber       = {Nov},\n\tpages        = {1039--1069}\n}\n@inproceedings{hu2009accelerated,\n\ttitle        = {Accelerated gradient methods for stochastic optimization and online learning},\n\tauthor       = {Hu, Chonghai and Pan, Weike and Kwok, James T},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {781--789}\n}\n@article{hu2013fast,\n\ttitle        = {Fast and accurate matrix completion via truncated nuclear norm regularization},\n\tauthor       = {Hu, Yao and Zhang, Debing and Ye, Jieping and Li, Xuelong and He, Xiaofei},\n\tyear         = 2013,\n\tjournal      = {Pattern Analysis and Machine Intelligence, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 35,\n\tnumber       = 9,\n\tpages        = {2117--2130}\n}\n@inproceedings{hu2014convolutional,\n\ttitle        = {Convolutional neural network architectures for matching natural language sentences},\n\tauthor       = {Baotian Hu and Zhengdong Lu and Hang Li and Qingcai Chen},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hu2014polylingual,\n\ttitle        = {Polylingual Tree-Based Topic Models for Translation Domain Adaptation},\n\tauthor       = {Yuening Hu and Ke Zhai and Vladimir Eidelman and Jordan Boyd-Graber},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1166--1176}\n}\n@article{hu2016harnessing,\n\ttitle        = 
{Harnessing deep neural networks with logic rules},\n\tauthor       = {Zhiting Hu and Xuezhe Ma and Zhengzhong Liu and Eduard Hovy and Eric Xing},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1603.06318}\n}\n@article{hu2017diffusion,\n\ttitle        = {On the diffusion approximation of nonconvex stochastic gradient descent},\n\tauthor       = {Hu, Wenqing and Li, Chris Junchi and Li, Lei and Liu, Jian-Guo},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07562}\n}\n@inproceedings{hu2017learning,\n\ttitle        = {Learning to reason: End-to-end module networks for visual question answering},\n\tauthor       = {Ronghang Hu and Jacob Andreas and Marcus Rohrbach and Trevor Darrell and Kate Saenko},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{hu2017toward,\n\ttitle        = {Toward Controlled Generation of Text},\n\tauthor       = {Zhiting Hu and Zichao Yang and Xiaodan Liang and Ruslan Salakhutdinov and Eric P Xing},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hu2018does,\n\ttitle        = {Does Distributionally Robust Supervised Learning Give Robust Classifiers?},\n\tauthor       = {Weihua Hu and Gang Niu and Issei Sato and Masashi Sugiyama},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{hu2018mnemonic,\n\ttitle        = {Reinforced Mnemonic Reader for Machine Reading Comprehension},\n\tauthor       = {Minghao Hu and Yuxing Peng and Xipeng Qiu},\n\tyear         = 2018,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{hu2019parabank,\n\ttitle        = {ParaBank: Monolingual Bitext Generation and Sentential Paraphrasing via Lexically-constrained Neural Machine Translation},\n\tauthor       = {J. 
Edward Hu and Rachel Rudinger and Matt Post and Benjamin Van Durme},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{hu2020open,\n\ttitle        = {{Open Graph Benchmark}: Datasets for machine learning on graphs},\n\tauthor       = {Weihua Hu and Matthias Fey and Marinka Zitnik and Yuxiao Dong and Hongyu Ren and Bowen Liu and Michele Catasta and Jure Leskovec},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{hu2020pretraining,\n\ttitle        = {Strategies for Pre-training Graph Neural Networks},\n\tauthor       = {Weihua Hu and Bowen Liu and Joseph Gomes and Marinka Zitnik and Percy Liang and Vijay Pande and Jure Leskovec},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{hu2020xtreme,\n\ttitle        = {Xtreme: A massively multilingual multi-task benchmark for evaluating cross-lingual generalization},\n\tauthor       = {Junjie Hu and Sebastian Ruder and Aditya Siddhant and Graham Neubig and Orhan Firat and Melvin Johnson},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.11080}\n}\n@inproceedings{huang2005better,\n\ttitle        = {Better k-best parsing},\n\tauthor       = {Liang Huang and David Chiang},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the Ninth International Workshop on Parsing Technology},\n\tpages        = {53--64}\n}\n@inproceedings{huang2006correcting,\n\ttitle        = {Correcting sample selection bias by unlabeled data},\n\tauthor       = {Huang, Jiayuan and Gretton, Arthur and Borgwardt, Karsten M and Sch{\\\"o}lkopf, Bernhard and Smola, Alex J},\n\tyear         = 2006,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {601--608}\n}\n@inproceedings{huang2007forest,\n\ttitle        = {Forest Rescoring: Faster Decoding with Integrated Language 
Models},\n\tauthor       = {Liang Huang and David Chiang},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{huang2008forest,\n\ttitle        = {Forest reranking: Discriminative parsing with non-local features},\n\tauthor       = {Liang Huang},\n\tyear         = 2008,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{huang2011adversarial,\n\ttitle        = {Adversarial machine learning},\n\tauthor       = {Ling Huang and Anthony D Joseph and Blaine Nelson and Benjamin IP Rubinstein and JD Tygar},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the 4th ACM workshop on Security and artificial intelligence},\n\tpages        = {43--58}\n}\n@inproceedings{huang2012structured,\n\ttitle        = {Structured {P}erceptron with inexact search},\n\tauthor       = {Liang Huang and Suphan Fayong and Yang Guo},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {142--151}\n}\n@inproceedings{huang2012wordrep,\n\ttitle        = {Improving Word Representations via Global Context and Multiple Word Prototypes},\n\tauthor       = {Eric H. Huang and Richard Socher and Christopher D. Manning and Andrew Y. 
Ng},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{huang2013fast,\n\ttitle        = {Fast Detection of Overlapping Communities via Online Tensor Methods},\n\tauthor       = {Furong Huang and U N Niranjan and Mohammad Umar Hakeem and Animashree Anandkumar},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@article{huang2015bidirectional,\n\ttitle        = {Bidirectional {LSTM-CRF} Models for Sequence Tagging},\n\tauthor       = {Zhiheng Huang and Wei Xu and Kai Yu},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@inproceedings{huang2016part,\n\ttitle        = {Part-stacked {CNN} for fine-grained visual categorization},\n\tauthor       = {Shaoli Huang and Zhe Xu and Dacheng Tao and Ya Zhang},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {1173--1182}\n}\n@article{huang2017adversarial,\n\ttitle        = {Adversarial attacks on neural network policies},\n\tauthor       = {Sandy Huang and Nicolas Papernot and Ian Goodfellow and Yan Duan and Pieter Abbeel},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{huang2017densely,\n\ttitle        = {Densely connected convolutional networks},\n\tauthor       = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {4700--4708}\n}\n@inproceedings{huang2017safety,\n\ttitle        = {Safety verification of deep neural networks},\n\tauthor       = {Xiaowei Huang and Marta Kwiatkowska and Sen Wang and Min Wu},\n\tyear         = 2017,\n\tbooktitle    = {Computer Aided Verification (CAV)},\n\tpages        = {3--29}\n}\n@inproceedings{huang2018fusion,\n\ttitle        = {FusionNet: Fusing via Fully-Aware Attention with Application to Machine Comprehension},\n\tauthor       = {Hsin-Yuan Huang and Chenguang Zhu and Yelong 
Shen and Weizhu Chen},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{huang2019achieving,\n\ttitle        = {Achieving Verified Robustness to Symbol Substitutions via Interval Bound Propagation},\n\tauthor       = {Po-Sen Huang and Robert Stanforth and Johannes Welbl and Chris Dyer and Dani Yogatama and Sven Gowal and Krishnamurthy Dvijotham and Pushmeet Kohli},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@book{hubbard1998vector,\n\ttitle        = {Vector Calculus, Linear Algebra, And Differential Forms},\n\tauthor       = {John H. Hubbard and Barbara B. Hubbard},\n\tyear         = 1998,\n\tpublisher    = {Prentice Hall}\n}\n@article{huber1964robust,\n\ttitle        = {Robust estimation of a location parameter},\n\tauthor       = {Peter J. Huber},\n\tyear         = 1964,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tvolume       = 35,\n\tnumber       = 1,\n\tpages        = {73--101}\n}\n@book{huber2009robust,\n\ttitle        = {Robust Statistics},\n\tauthor       = {Peter J. Huber and Elvezio M. Ronchetti},\n\tyear         = 2009,\n\tpublisher    = {Wiley}\n}\n@article{hubmap2019human,\n\ttitle        = {The human body at cellular resolution: the {NIH} Human Biomolecular Atlas Program},\n\tauthor       = {HuBMAP Consortium and others},\n\tyear         = 2019,\n\tjournal      = {Nature},\n\tvolume       = 574,\n\tnumber       = 7777\n}\n@inproceedings{hudson2018mac,\n\ttitle        = {Compositional Attention Networks for Machine Reasoning},\n\tauthor       = {Drew A. Hudson and Christopher D. Manning},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{hudson2019gqa,\n\ttitle        = {{GQA}: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},\n\tauthor       = {Drew A. Hudson and Christopher D. 
Manning},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{hudson2019nsm,\n\ttitle        = {Learning by Abstraction: The Neural State Machine},\n\tauthor       = {Drew A. Hudson and Christopher D. Manning},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{huesel2017gans,\n\ttitle        = {GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium},\n\tauthor       = {Martin Heusel and Hubert Ramsauer and Thomas Unterthiner and Bernhard Nessler and Sepp Hochreiter},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{hull1994database,\n\ttitle        = {A database for handwritten text recognition research},\n\tauthor       = {Jonathan J. Hull},\n\tyear         = 1994,\n\tjournal      = {IEEE Transactions on pattern analysis and machine intelligence},\n\tvolume       = 16,\n\tnumber       = 5,\n\tpages        = {550--554}\n}\n@article{humplik2019meta,\n\ttitle        = {Meta reinforcement learning as task inference},\n\tauthor       = {Jan Humplik and Alexandre Galashov and Leonard Hasenclever and Pedro A Ortega and Yee Whye Teh and Nicolas Heess},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.06424}\n}\n@article{hunt2020verifiably,\n\ttitle        = {Verifiably safe exploration for end-to-end reinforcement learning},\n\tauthor       = {Hunt, Nathan and Fulton, Nathan and Magliacane, Sara and Hoang, Nghia and Das, Subhro and Solar-Lezama, Armando},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.01223}\n}\n@article{huseby2004system,\n\ttitle        = {System reliability evaluation using conditional {M}onte {C}arlo methods},\n\tauthor       = {Arne Bang Huseby and Morten Naustdal and Ingeborg Drengstig V{\\aa}rli},\n\tyear         = 2004,\n\tjournal      = {Statistical Research 
Report},\n\tvolume       = 2,\n\tpages        = {0806--3842}\n}\n@article{hussain2018autonomous,\n\ttitle        = {Autonomous cars: Research results, issues, and future challenges},\n\tauthor       = {Hussain, Rasheed and Zeadally, Sherali},\n\tyear         = 2018,\n\tjournal      = {IEEE Communications Surveys \\& Tutorials},\n\tpublisher    = {IEEE},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {1275--1313}\n}\n@inproceedings{hussein2017deep,\n\ttitle        = {Deep reward shaping from demonstrations},\n\tauthor       = {Ahmed Hussein and Eyad Elyan and Mohamed Medhat Gaber and Chrisina Jayne},\n\tyear         = 2017,\n\tbooktitle    = {International Joint Conference on Neural Networks}\n}\n@article{huttegger2010evolutionary,\n\ttitle        = {Evolutionary dynamics of {Lewis} signaling games: signaling systems vs. partial pooling},\n\tauthor       = {Simon M Huttegger and Brian Skyrms and Rory Smead and Kevin JS Zollman},\n\tyear         = 2010,\n\tjournal      = {Synthese},\n\tvolume       = 172,\n\tnumber       = 1,\n\tpages        = {177--191}\n}\n@inproceedings{HW,\n\ttitle        = {A bound on tail probabilities for quadratic forms in independent random variables},\n\tauthor       = {D. Hanson and F. Wright},\n\tyear         = 1971,\n\tbooktitle    = {Annals of Math. Stat.},\n\tpages        = {1079--1083}\n}\n@book{hwang1979multiple,\n\ttitle        = {Multiple Objective Decision Making -- Methods and Applications},\n\tauthor       = {Ching-Lai Hwang and Abu Syed Md. 
Masud},\n\tyear         = 1979,\n\tpublisher    = {Springer}\n}\n@article{hyde2019applications,\n\ttitle        = {Applications of supervised machine learning in autism spectrum disorder research: a review},\n\tauthor       = {Hyde, Kayleigh K and Novack, Marlena N and LaHaye, Nicholas and Parlett-Pelleriti, Chelsea and Anden, Raymond and Dixon, Dennis R and Linstead, Erik},\n\tyear         = 2019,\n\tjournal      = {Review Journal of Autism and Developmental Disorders},\n\tpublisher    = {Springer},\n\tvolume       = 6,\n\tnumber       = 2,\n\tpages        = {128--146}\n}\n@article{hyvarinen06pseudolikelihood,\n\ttitle        = {Consistency of pseudolikelihood estimation of fully visible {B}oltzmann machines},\n\tauthor       = {Aapo Hyv\\\"arinen},\n\tyear         = 2006,\n\tjournal      = {Neural Computation},\n\tvolume       = 18,\n\tpages        = {2283--2292}\n}\n@article{hyvarinen2000independent,\n\ttitle        = {Independent component analysis: algorithms and applications},\n\tauthor       = {Aapo Hyv{\\\"a}rinen and Erkki Oja},\n\tyear         = 2000,\n\tjournal      = {Neural Networks},\n\tvolume       = 13,\n\tnumber       = {4-5},\n\tpages        = {411--430},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://dx.doi.org/10.1016/S0893-6080(00)00026-5},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@book{hyvarinen2001independent,\n\ttitle        = {Independent Component Analysis},\n\tauthor       = {Hyv\\\"{a}rinen, Aapo and Karhunen, Juha and Oja, Erkki},\n\tyear         = 2001,\n\tpublisher    = {Wiley-Interscience},\n\tisbn         = {047140540X},\n\tedition      = 1,\n\tabstract     = {\n\t\t{A comprehensive introduction to ICA for students and practitioners<br>\n\n\t\tIndependent Component Analysis (ICA) is one of the most exciting\n\n\t\tnew topics in fields such as neural networks, advanced statistics,\n\n\t\tand signal processing. 
This is the first book to provide a comprehensive\n\n\t\tintroduction to this new technique complete with the fundamental\n\n\t\tmathematical background needed to understand and utilize it. It offers\n\n\t\ta general overview of the basics of ICA, important solutions and\n\n\t\talgorithms, and in-depth coverage of new applications in image processing,\n\n\t\ttelecommunications, audio signal processing, and more.<br> Independent\n\n\t\tComponent Analysis is divided into four sections that cover:<br>\n\n\t\t* General mathematical concepts utilized in the book<br> * The basic\n\n\t\tICA model and its solution<br> * Various extensions of the basic\n\n\t\tICA model<br> * Real-world applications for ICA models<br> Authors\n\n\t\tHyvarinen, Karhunen, and Oja are well known for their contributions\n\n\t\tto the development of ICA and here cover all the relevant theory,\n\n\t\tnew algorithms, and applications in various fields. Researchers,\n\n\t\tstudents, and practitioners from a variety of disciplines will find\n\n\t\tthis accessible volume both helpful and informative.}\n\t},\n\thowpublished = {Hardcover},\n\tkeywords     = {analysis, component, independent}\n}\n@inproceedings{ias2012execution,\n\ttitle        = {RoboEarth Action Recipe Execution},\n\tauthor       = {D Marco and M Tenorth and K H{\\\"a}ussermann and O Zweigle and P Levi},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Intelligent Autonomous Systems (IAS)}\n}\n@article{ICA:BelkinEtal12,\n\ttitle        = {{Blind Signal Separation in the Presence of Gaussian Noise}},\n\tauthor       = {Mikhail Belkin and Luis Rademacher and James Voss},\n\tyear         = 2012,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1211.1716}\n}\n@book{ICAbook,\n\ttitle        = {Independent Component Analysis},\n\tauthor       = {Aapo Hyv{\\\"a}rinen and J. Karhunen and E. 
Oja.},\n\tyear         = 2001,\n\tpublisher    = {Wiley Interscience}\n}\n@inproceedings{icarte2018using,\n\ttitle        = {Using reward machines for high-level task specification and decomposition in reinforcement learning},\n\tauthor       = {Icarte, Rodrigo Toro and Klassen, Toryn and Valenzano, Richard and McIlraith, Sheila},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2107--2116}\n}\n@inproceedings{icarte2019learning,\n\ttitle        = {Learning Reward Machines for Partially Observable Reinforcement Learning},\n\tauthor       = {Icarte, Rodrigo Toro and Waldie, Ethan and Klassen, Toryn and Valenzano, Rick and Castro, Margarita and McIlraith, Sheila},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {15497--15508}\n}\n@inproceedings{ide2008masc,\n\ttitle        = {{MASC}: the Manually Annotated Sub-Corpus of {A}merican {E}nglish},\n\tauthor       = {Nancy Ide and Collin Baker and Christiane Fellbaum and Charles Fillmore and Rebecca Passonneau},\n\tyear         = 2008,\n\tbooktitle    = {Language Resources and Evaluation (LREC)}\n}\n@inproceedings{ide2010masc,\n\ttitle        = {The Manually Annotated Sub-Corpus: A Community Resource for and by the People},\n\tauthor       = {Nancy Ide and Collin Baker and Christiane Fellbaum and Rebecca Passonneau},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{igor2017attribute,\n\ttitle        = {Improved Neural Text Attribute Transfer with Non-parallel Data},\n\tauthor       = {Igor Melnyk and Cicero Nogueira dos Santos and Kahini Wadhawan and Inkit Padhi and Abhishek Kumar},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.09395}\n}\n@article{ilyas2019adversarial,\n\ttitle        = {Adversarial examples are not bugs, they are features},\n\tauthor       = {Andrew Ilyas and Shibani Santurkar and Dimitris Tsipras and 
Logan Engstrom and Brandon Tran and Aleksander Madry},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.02175}\n}\n@incollection{ImageNet,\n\ttitle        = {ImageNet Classification with Deep Convolutional Neural Networks},\n\tauthor       = {Alex Krizhevsky and Ilya Sutskever and Geoff Hinton},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems 25},\n\tpages        = {1106--1114},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.26}\n}\n@inproceedings{imagenet_cvpr09,\n\ttitle        = {{ImageNet: A Large-Scale Hierarchical Image Database}},\n\tauthor       = {Deng, J. and Dong, W. and Socher, R. and Li, L.-J. and Li, K. and Fei-Fei, L.},\n\tyear         = 2009,\n\tbooktitle    = {CVPR09},\n\tbibsource    = {http://www.image-net.org/papers/imagenet_cvpr09.bib}\n}\n@inproceedings{ImprovedCheeger2013,\n\ttitle        = {Improved Cheeger's Inequality: Analysis of Spectral Partitioning Algorithms through Higher Order Spectral Gap},\n\tauthor       = {Kwok, Tsz Chiu and Lau, Lap Chi and Lee, Yin Tat and {Oveis Gharan}, Shayan and Trevisan, Luca},\n\tyear         = 2013,\n\tmonth        = jan,\n\tbooktitle    = {STOC '13}\n}\n@inproceedings{indyk2004approximate,\n\ttitle        = {Approximate nearest neighbor under edit distance via product metrics},\n\tauthor       = {Piotr Indyk},\n\tyear         = 2004,\n\tbooktitle    = {Symposium on Discrete Algorithms (SODA)},\n\tpages        = {646--650}\n}\n@inproceedings{IndykStrauss,\n\ttitle        = {Combining Geometry and Combinatorics: a Unified Approach to Sparse Signal Recovery},\n\tauthor       = {Berinde, R. and Gilbert, A.C. and Indyk, P. and Karloff, H. 
and Strauss, M.J.},\n\tyear         = 2008,\n\tbooktitle    = {46th Annual Allerton Conference on Communication, Control, and Computing},\n\tpages        = {798--805}\n}\n@article{inoue2003line,\n\ttitle        = {On-line learning theory of soft committee machines with correlated hidden units--steepest gradient descent and natural gradient descent--},\n\tauthor       = {Inoue, Masato and Park, Hyeyoung and Okada, Masato},\n\tyear         = 2003,\n\tjournal      = {Journal of the Physical Society of Japan},\n\tpublisher    = {The Physical Society of Japan},\n\tvolume       = 72,\n\tnumber       = 4,\n\tpages        = {805--810}\n}\n@inproceedings{inoue2018cross,\n\ttitle        = {Cross-domain weakly-supervised object detection through progressive domain adaptation},\n\tauthor       = {Naoto Inoue and Ryosuke Furuta and Toshihiko Yamasaki and Kiyoharu Aizawa},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {5001--5009}\n}\n@misc{intel2007intel,\n\ttitle        = {Intel Research Advances 'Era Of Tera': www.intel.com/pressroom/archive/releases/20070204comp.htm},\n\tauthor       = {Intel},\n\tyear         = 2007,\n\turl          = {http://www.intel.com/pressroom/archive/releases/20070204comp.htm}\n}\n@misc{interpolation,\n\ttitle        = {\"Lagrange Interpolating Polynomial.\" From MathWorld--A Wolfram Web Resource.},\n\tauthor       = {Archer, Branden and Weisstein, Eric W},\n\turl          = {http://mathworld.wolfram.com/LagrangeInterpolatingPolynomial.html},\n\tbdsk-url-1   = {http://mathworld.wolfram.com/LagrangeInterpolatingPolynomial.html}\n}\n@article{ioffe2015batch,\n\ttitle        = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},\n\tauthor       = {Ioffe, Sergey and Szegedy, Christian},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1502.03167},\n\tbooktitle    = {International Conference on Machine 
Learning (ICML)},\n\tpages        = {448--456},\n\turl          = {http://jmlr.org/proceedings/papers/v37/ioffe15.html},\n\tcrossref     = {DBLP:conf/icml/2015},\n\ttimestamp    = {Tue, 12 Jul 2016 21:51:15 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/IoffeS15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{IP,\n\ttitle        = {On the complexity of K-SAT},\n\tauthor       = {Impagliazzo, Russel and Paturi, Ramamohan},\n\tyear         = 2001,\n\tmonth        = mar,\n\tjournal      = {J. Comput. Syst. Sci.},\n\tpublisher    = {Academic Press, Inc.},\n\taddress      = {Orlando, FL, USA},\n\tvolume       = 62,\n\tnumber       = 2,\n\tpages        = {367--375},\n\tdoi          = {10.1006/jcss.2000.1727},\n\tissn         = {0022-0000},\n\turl          = {http://dx.doi.org/10.1006/jcss.2000.1727},\n\tissue_date   = {March 2001},\n\tnumpages     = 9,\n\tacmid        = 374991\n}\n@inproceedings{IPDPSW2011,\n\ttitle        = {Efficiently Computing Tensor Eigenvalues on a {GPU}},\n\tauthor       = {G. Ballard and T. G. Kolda and T. 
Plantenga},\n\tyear         = 2011,\n\tmonth        = may,\n\tbooktitle    = {IPDPSW'11: Proceedings of the 2011 IEEE International Symposium on Parallel and Distributed Processing Workshops and PhD Forum},\n\tpublisher    = {IEEE Computer Society},\n\tpages        = {1340--1348}\n}\n@inproceedings{ippolito2019unsupervised,\n\ttitle        = {Unsupervised Hierarchical Story Infilling},\n\tauthor       = {Daphne Ippolito and David Grangier and Chris Callison-Burch and Douglas Eck},\n\tyear         = 2019,\n\tbooktitle    = {NAACL Workshop on Narrative Understanding},\n\tpages        = {37--43}\n}\n@article{ipsen1998relative,\n\ttitle        = {Relative perturbation results for matrix eigenvalues and singular values},\n\tauthor       = {Ipsen, Ilse CF},\n\tyear         = 1998,\n\tjournal      = {Acta numerica},\n\tpublisher    = {Cambridge Univ Press},\n\tvolume       = 7,\n\tpages        = {151--201}\n}\n@inproceedings{iqbal2019actor,\n\ttitle        = {Actor-attention-critic for multi-agent reinforcement learning},\n\tauthor       = {Iqbal, Shariq and Sha, Fei},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2961--2970},\n\torganization = {PMLR}\n}\n@article{ireland2011language,\n\ttitle        = {Language style matching predicts relationship initiation and stability},\n\tauthor       = {Molly E Ireland and Richard B Slatcher and Paul W Eastwick and Lauren E Scissors and Eli J Finkel and James W Pennebaker},\n\tyear         = 2011,\n\tjournal      = {Psychological Science},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {39--44}\n}\n@inproceedings{irvin2019chexpert,\n\ttitle        = {Chexpert: A large chest radiograph dataset with uncertainty labels and expert comparison},\n\tauthor       = {Jeremy Irvin and Pranav Rajpurkar and Michael Ko and Yifan Yu and Silviana Ciurea-Ilcus and Chris Chute and Henrik Marklund and Behzad Haghgoo and Robyn Ball and Katie Shpanskaya and others},\n\tyear   
      = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 33,\n\tpages        = {590--597}\n}\n@article{ishiguro97bootstrapping,\n\ttitle        = {Bootstrapping Log Likelihood and {EIC}, an Extension of {AIC}},\n\tauthor       = {M. Ishiguro and Y. Sakamoto and G. Kitagawa},\n\tyear         = 1997,\n\tjournal      = {Annals of the Institute of Statistical Mathematics},\n\tvolume       = 49,\n\tpages        = {411--434}\n}\n@article{ishwaran01gibbs,\n\ttitle        = {{G}ibbs Sampling Methods for Stick-Breaking Priors},\n\tauthor       = {H. Ishwaran and L. F. James},\n\tyear         = 2001,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 96,\n\tpages        = {161--173}\n}\n@article{ishwaran02exact,\n\ttitle        = {Exact and approximate sum-representations for the {D}irichlet process},\n\tauthor       = {H. Ishwaran and M. Zarepour},\n\tyear         = 2002,\n\tjournal      = {Canadian Journal of Statististics},\n\tvolume       = 30,\n\tpages        = {269--284}\n}\n@article{islam2017reproducibility,\n\ttitle        = {Reproducibility of benchmarked deep reinforcement learning tasks for continuous control},\n\tauthor       = {R. Islam and P. Henderson and M. Gomrokchi and D. 
Precup},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.04133}\n}\n@inproceedings{ivanovic2005dialogue,\n\ttitle        = {Dialogue Act Tagging for Instant Messaging Chat Sessions},\n\tauthor       = {Edward Ivanovic},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{iyer2017neural,\n\ttitle        = {Learning a Neural Semantic Parser from User Feedback},\n\tauthor       = {Srinivasan Iyer and Ioannis Konstas and Alvin Cheung and Jayant Krishnamurthy and Luke Zettlemoyer},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@misc{iyer2017qqp,\n\ttitle        = {First Quora Dataset Release: Question Pairs},\n\tauthor       = {Shankar Iyer and Nikhil Dandekar and Korn{\\'e}l Csernai},\n\tyear         = 2017,\n\thowpublished = {\\url{https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs}}\n}\n@inproceedings{iyer2018mapping,\n\ttitle        = {Mapping Language to Code in Programmatic Context},\n\tauthor       = {Srinivasan Iyer and Ioannis Konstas and Alvin Cheung and Luke S. 
Zettlemoyer},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{iyer2019learning,\n\ttitle        = {Learning Programmatic Idioms for Scalable Semantic Parsing},\n\tauthor       = {Srinivasan Iyer and Alvin Cheung and Luke Zettlemoyer},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.09086}\n}\n@inproceedings{iyyer2014factoid,\n\ttitle        = {A Neural Network for Factoid Question Answering over Paragraphs},\n\tauthor       = {Mohit Iyyer and Jordan Boyd-Graber and Leonardo Claudino and Hal Daumé III},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{iyyer2016answering,\n\ttitle        = {Answering Complicated Question Intents Expressed in Decomposed Question Sequences},\n\tauthor       = {Mohit Iyyer and Wen{-}tau Yih and Ming{-}Wei Chang},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tvolume       = {0}\n}\n@inproceedings{iyyer2017search,\n\ttitle        = {Search-based neural structured learning for sequential question answering},\n\tauthor       = {Mohit Iyyer and Wen-tau Yih and Ming-Wei Chang},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{iyyer2018adversarial,\n\ttitle        = {Adversarial Example Generation with Syntactically Controlled Paraphrase Networks},\n\tauthor       = {Mohit Iyyer and John Wieting and Kevin Gimpel and Luke Zettlemoyer},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{jaakkola1999exploiting,\n\ttitle        = {Exploiting generative models in discriminative classifiers},\n\tauthor       = {Tommi S Jaakkola and David Haussler and others},\n\tyear         = 1999,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {487--493}\n}\n@article{jaakkola1999variational,\n\ttitle        = 
{Variational Probabilistic Inference and the {QMR-DT} Network},\n\tauthor       = {T. S Jaakkola and M. I Jordan},\n\tyear         = 1999,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 10,\n\tpages        = {291--322}\n}\n@inproceedings{jaakkola97logistic,\n\ttitle        = {A variational approach to {B}ayesian logistic regression models and their extensions},\n\tauthor       = {Tommi Jaakkola and Michael I. Jordan},\n\tyear         = 1997,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{jabbari2017,\n\ttitle        = {Fairness in Reinforcement Learning},\n\tauthor       = {Shahin Jabbari and Matthew Joseph and Michael Kearns and Jamie Morgenstern and Aaron Roth},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1617--1626}\n}\n@inproceedings{jackson2002learnability,\n\ttitle        = {Learnability beyond $AC^0$},\n\tauthor       = {Jackson, Jeffrey C and Klivans, Adam R and Servedio, Rocco A},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the thiry-fourth annual ACM symposium on Theory of computing},\n\tpages        = {776--784},\n\torganization = {ACM},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.10.01}\n}\n@article{jackson97dnf,\n\ttitle        = {An efficient membership-query algorithm for learning {DNF} with respect to the uniform distribution},\n\tauthor       = {J. 
Jackson},\n\tyear         = 1997,\n\tjournal      = {Journal of Computer and System Sciences},\n\tvolume       = 55,\n\tnumber       = 3,\n\tpages        = {414--440}\n}\n@inproceedings{jacob09cluster,\n\ttitle        = {Clustered Multi-Task Learning: A Convex Formulation},\n\tauthor       = {Laurent Jacob and Francis Bach and Jean-Philippe Vert},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {745--752}\n}\n@article{jacobs91experts,\n\ttitle        = {Adaptive mixtures of local experts},\n\tauthor       = {R. A. Jacobs and M. I. Jordan and S. J. Nowlan and G. E. Hinton},\n\tyear         = 1991,\n\tjournal      = {Neural Computation},\n\tvolume       = 3,\n\tpages        = {79--87}\n}\n@inproceedings{jacot2018neural,\n\ttitle        = {Neural tangent kernel: Convergence and generalization in neural networks},\n\tauthor       = {Jacot, Arthur and Gabriel, Franck and Hongler, Cl{\\'e}ment},\n\tyear         = 2018,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {8571--8580}\n}\n@inproceedings{jacovi2019blackbox,\n\ttitle        = {Neural network gradient-based learning of black-box function interfaces},\n\tauthor       = {Alon Jacovi and Guy Hadash and Einat Kermany and Boaz Carmeli and Ofer Lavi and George Kour and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{jacovi2021formalizing,\n\ttitle        = {Formalizing trust in artificial intelligence: Prerequisites, causes and goals of human trust in ai},\n\tauthor       = {Jacovi, Alon and Marasovi{\\'c}, Ana and Miller, Tim and Goldberg, Yoav},\n\tyear         = 2021,\n\tbooktitle    = {Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency},\n\tpages        = {624--635}\n}\n@article{jaderberg2016reinforcement,\n\ttitle        = {Reinforcement learning with unsupervised auxiliary 
tasks},\n\tauthor       = {M. Jaderberg and V. Mnih and W. M. Czarnecki and T. Schaul and J. Z. Leibo and D. Silver and K. Kavukcuoglu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.05397}\n}\n@article{jaeckel1972infinitesimal,\n\ttitle        = {The infinitesimal jackknife},\n\tauthor       = {Louis A Jaeckel},\n\tyear         = 1972,\n\tjournal      = {Unpublished memorandum, Bell Telephone Laboratories, Murray Hill, NJ}\n}\n@article{jaeger,\n\ttitle        = {Observable Operator Models for Discrete Stochastic Time Series},\n\tauthor       = {H. Jaeger},\n\tyear         = 2000,\n\tjournal      = {Neural Comput.},\n\tvolume       = 12,\n\tnumber       = 6\n}\n@article{jaeger08applications,\n\ttitle        = {Applications of Game Theory in Linguistics},\n\tauthor       = {Gerhard Jäger},\n\tyear         = 2008,\n\tjournal      = {Language and Linguistics Compass},\n\tvolume       = 2,\n\tpages        = {406--421}\n}\n@techreport{jaeger08game,\n\ttitle        = {Game Theory in Semantics and Pragmatics},\n\tauthor       = {Gerhard Jäger},\n\tyear         = 2008,\n\tinstitution  = {University of Tübingen}\n}\n@article{jaeger2000observable,\n\ttitle        = {Observable operator models for discrete stochastic time series},\n\tauthor       = {Herbert Jaeger},\n\tyear         = 2000,\n\tjournal      = {Neural Computation},\n\tvolume       = 12,\n\tnumber       = 6,\n\tpages        = {1371--1398}\n}\n@inproceedings{jaegle2021perceiver,\n\ttitle        = {Perceiver: General Perception with Iterative Attention},\n\tauthor       = {Andrew Jaegle and Felix Gimeno and Andrew Brock and Andrew Zisserman and Oriol Vinyals and Jo{\\~a}o Carreira},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{jaffe2015estimating,\n\ttitle        = {Estimating the accuracies of multiple classifiers without labeled data},\n\tauthor       = {A. Jaffe and B. Nadler and Y. 
Kluger},\n\tyear         = 2015,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {407--415}\n}\n@article{jaganathan2019predicting,\n\ttitle        = {Predicting splicing from primary sequence with deep learning},\n\tauthor       = {Kishore Jaganathan and Sofia Kyriazopoulou Panagiotopoulou and Jeremy F McRae and Siavash Fazel Darbandi and David Knowles and Yang I Li and Jack A Kosmicki and Juan Arbelaez and Wenwu Cui and Grace B Schwartz and others},\n\tyear         = 2019,\n\tjournal      = {Cell},\n\tvolume       = 176,\n\tnumber       = 3,\n\tpages        = {535--548}\n}\n@inproceedings{jagannatha2016structured,\n\ttitle        = {Structured prediction models for {RNN} based sequence labeling in clinical text},\n\tauthor       = {Abhyuday Jagannatha and Hong Yu},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{jahangiri2005shift,\n\ttitle        = {\n\t\tSHIFT-SPLIT: I/O efficient maintenance of wavelet-transformed multidimensional\n\n\t\tdata\n\t},\n\tauthor       = {Jahangiri, Mehrdad and Sacharidis, Dimitris and Shahabi, Cyrus},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the 2005 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Baltimore, Maryland},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '05},\n\tpages        = {275--286},\n\tdoi          = {http://doi.acm.org/10.1145/1066157.1066189},\n\tisbn         = {1-59593-060-4},\n\tacmid        = 1066189,\n\tnumpages     = 12\n}\n@techreport{jaillet2011online,\n\ttitle        = {Online Resource Allocation Problems},\n\tauthor       = {Patrick Jaillet and Xin Lu},\n\tyear         = 2011,\n\tinstitution  = {Massachusetts Institute of Technology}\n}\n@techreport{jain00splitmerge,\n\ttitle        = {A Split-Merge {M}arkov Chain {M}onte {C}arlo Procedure for the {D}irichlet Process Mixture 
Model},\n\tauthor       = {S. Jain and R. Neal},\n\tyear         = 2000,\n\tinstitution  = {Department of Statistics, University of Toronto (U. Toronto)}\n}\n@techreport{jain05splitmerge,\n\ttitle        = {Splitting and merging components of a nonconjugate {D}irichlet process mixture model},\n\tauthor       = {S. Jain and R. Neal},\n\tyear         = 2005,\n\tinstitution  = {Department of Statistics, University of Toronto (U. Toronto)}\n}\n@inproceedings{jain2004adaptive,\n\ttitle        = {Adaptive stream resource management using Kalman Filters},\n\tauthor       = {Jain, Ankur and Chang, Edward Y. and Wang, Yuan-Fang},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tProceedings of the 2004 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Paris, France},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '04},\n\tpages        = {11--22},\n\tdoi          = {http://doi.acm.org/10.1145/1007568.1007573},\n\tisbn         = {1-58113-859-8},\n\tacmid        = 1007573,\n\tnumpages     = 12\n}\n@inproceedings{jain2012low,\n\ttitle        = {Low-rank Matrix Completion using Alternating Minimization},\n\tauthor       = {Jain, Prateek and Netrapalli, Praneeth and Sanghavi, Sujay},\n\tyear         = 2013,\n\tbooktitle    = {ACM STOC},\n\tpages        = {665--674},\n\torganization = {ACM}\n}\n@inproceedings{jain2015assistive,\n\ttitle        = {Assistive robotic manipulation through shared autonomy and a body-machine interface},\n\tauthor       = {Siddarth Jain and Ali Farshchiansadegh and Alexander Broad and Farnaz Abdollahi and Ferdinando Mussa-Ivaldi and Brenna Argall},\n\tyear         = 2015,\n\tbooktitle    = {IEEE International Conference on Rehabilitation Robotics (ICORR)},\n\tpages        = {526--531}\n}\n@inproceedings{jain2015fast,\n\ttitle        = {Fast Exact Matrix Completion with Finite Samples},\n\tauthor       = {Jain, Prateek and Netrapalli, Praneeth},\n\tyear         = 
2015,\n\tbooktitle    = {Proceedings of The 28th Conference on Learning Theory},\n\tpages        = {1007--1034}\n}\n@inproceedings{jain2016matching,\n\ttitle        = {Matching Matrix Bernstein with Little Memory: Near-Optimal Finite Sample Guarantees for Oja's Algorithm},\n\tauthor       = {Jain, Prateek and Jin, Chi and Kakade, Sham M and Netrapalli, Praneeth and Sidford, Aaron},\n\tyear         = 2016,\n\tjournal      = {COLT}\n}\n@inproceedings{jain2016streaming,\n\ttitle        = {Streaming PCA: Matching Matrix Bernstein and Near-Optimal Finite Sample Guarantees for Oja’s Algorithm},\n\tauthor       = {Jain, Prateek and Jin, Chi and Kakade, Sham M and Netrapalli, Praneeth and Sidford, Aaron},\n\tyear         = 2016,\n\tbooktitle    = {29th Annual Conference on Learning Theory},\n\tpages        = {1147--1164}\n}\n@inproceedings{jain2017global,\n\ttitle        = {Global Convergence of Non-Convex Gradient Descent for Computing Matrix Squareroot},\n\tauthor       = {Jain, Prateek and Jin, Chi and Kakade, Sham and Netrapalli, Praneeth},\n\tyear         = 2017,\n\tbooktitle    = {Artificial Intelligence and Statistics},\n\tpages        = {479--488}\n}\n@article{jain2019probabilistic,\n\ttitle        = {Probabilistic human intent recognition for shared autonomy in assistive robotics},\n\tauthor       = {Siddarth Jain and Brenna Argall},\n\tyear         = 2019,\n\tjournal      = {ACM Transactions on Human-Robot Interaction (THRI)},\n\tvolume       = 9,\n\tpages        = {1--23}\n}\n@article{JainJiUpadhyayWatrous2009,\n\ttitle        = {{QIP = PSPACE}},\n\tauthor       = {Jain, Rahul and Ji, Zhengfeng and Upadhyay, Sarvagya and Watrous, John},\n\tyear         = 2011,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 58,\n\tnumber       = 6,\n\tpages        = 30\n}\n@inproceedings{JainJKNS2016-online1SVD,\n\ttitle        = {{Streaming PCA: Matching Matrix Bernstein and Near-Optimal Finite Sample Guarantees for Oja's 
Algorithm}},\n\tauthor       = {Prateek Jain and Chi Jin and Sham M. Kakade and Praneeth Netrapalli and Aaron Sidford},\n\tyear         = 2016,\n\tbooktitle    = {COLT}\n}\n@article{JainYao2011,\n\ttitle        = {{A Parallel Approximation Algorithm for Positive Semidefinite Programming}},\n\tauthor       = {Jain, Rahul and Yao, Penghui},\n\tyear         = 2011,\n\tmonth        = oct,\n\tjournal      = {2011 IEEE 52nd Annual Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE},\n\tpages        = {463--471},\n\tdoi          = {10.1109/FOCS.2011.25},\n\tisbn         = {978-0-7695-4571-4},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Jain, Yao - 2011 - A Parallel Approximation Algorithm for Positive Semidefinite Programming.pdf:pdf},\n\tkeywords     = {-fast parallel algorithms,gramming,multiplicative weight update,positive semidefinite pro-},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/SDP}\n}\n@article{JainYao2012,\n\ttitle        = {{A parallel approximation algorithm for mixed packing and covering semidefinite programs}},\n\tauthor       = {Jain, Rahul and Yao, Penghui},\n\tyear         = 2012,\n\tmonth        = jan,\n\tjournal      = {arXiv preprint arXiv:1201.6090},\n\tpages        = 8,\n\tabstract     = {We present a parallel approximation algorithm for a class of mixed packing and covering semidefinite programs which generalize on the class of positive semidefinite programs as considered by Jain and Yao [2011]. As a corollary we get a faster approximation algorithm for positive semidefinite programs with better dependence of the parallel running time on the approximation factor, as compared to that of Jain and Yao [2011]. 
Our algorithm and analysis is on similar lines as that of Young [2001] who considered analogous linear programs.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1201.6090},\n\teprint       = {1201.6090},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop//Jain, Yao - 2012 - A parallel approximation algorithm for mixed packing and covering semidefinite programs.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/SDP}\n}\n@inproceedings{jaitly2013vocal,\n\ttitle        = {Vocal Tract Length Perturbation (VTLP) improves {s}peech recognition},\n\tauthor       = {Navdeep Jaitly and Geoffrey E. Hinton},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{jaksch2010near,\n\ttitle        = {Near-optimal regret bounds for reinforcement learning},\n\tauthor       = {Jaksch, Thomas and Ortner, Ronald and Auer, Peter},\n\tyear         = 2010,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 11,\n\tnumber       = {Apr},\n\tpages        = {1563--1600}\n}\n@inproceedings{jamal2020rethinking,\n\ttitle        = {Rethinking Class-Balanced Methods for Long-Tailed Visual Recognition From a Domain Adaptation Perspective},\n\tauthor       = {Jamal, Muhammad Abdullah and Brown, Matthew and Yang, Ming-Hsuan and Wang, Liqiang and Gong, Boqing},\n\tyear         = 2020,\n\tmonth        = {June},\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{jammalamadaka2007inclusion,\n\ttitle        = {Inclusion and exclusion of data or parameters in the general linear model},\n\tauthor       = {S Rao Jammalamadaka and D Sengupta},\n\tyear         = 2007,\n\tjournal      = {Statistics \\& probability letters},\n\tvolume       = 77,\n\tnumber       = 12,\n\tpages        = {1235--1247}\n}\n@inproceedings{jang2017categorical,\n\ttitle        = {Categorical Reparametrization with Gumbel-Softmax},\n\tauthor       = 
{Jang, Eric and Gu, Shixiang and Poole, Ben},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1611.01144},\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{janner2018representation,\n\ttitle        = {Representation Learning for Grounded Spatial Reasoning},\n\tauthor       = {Michael Janner and Karthik Narasimhan and Regina Barzilay},\n\tyear         = 2018,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 6\n}\n@article{janner2019trust,\n\ttitle        = {When to trust your model: Model-based policy optimization},\n\tauthor       = {Janner, Michael and Fu, Justin and Zhang, Marvin and Levine, Sergey},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.08253}\n}\n@article{janner2021reinforcement,\n\ttitle        = {Reinforcement Learning as One Big Sequence Modeling Problem},\n\tauthor       = {Michael Janner and Qiyang Li and Sergey Levine},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.02039}\n}\n@article{janowczyk2016deep,\n\ttitle        = {Deep learning for digital pathology image analysis: A comprehensive tutorial with selected use cases},\n\tauthor       = {Andrew Janowczyk and Anant Madabhushi},\n\tyear         = 2016,\n\tjournal      = {Journal of pathology informatics},\n\tvolume       = 7\n}\n@article{jansen2018safe,\n\ttitle        = {Safe Reinforcement Learning via Probabilistic Shields},\n\tauthor       = {Jansen, Nils and K{\\\"o}nighofer, Bettina and Junges, Sebastian and Serban, Alexandru C and Bloem, Roderick},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.06096}\n}\n@article{janson2004robust,\n\ttitle        = {Robust reconstruction on trees is determined by the second eigenvalue},\n\tauthor       = {Svante Janson and Elchanan Mossel},\n\tyear         = 2004,\n\tjournal      = {Annals of Probability},\n\tvolume       = 32,\n\tpages        = 
{2630--2649}\n}\n@article{janzamin2015beating,\n\ttitle        = {Beating the perils of non-convexity: Guaranteed training of neural networks using tensor methods},\n\tauthor       = {Janzamin, Majid and Sedghi, Hanie and Anandkumar, Anima},\n\tyear         = 2015,\n\tjournal      = {CoRR abs/1506.08473}\n}\n@inproceedings{janzamin2015score,\n\ttitle        = {Score Function Features for Discriminative Learning},\n\tauthor       = {Majid Janzamin and Hanie Sedghi and Anima Anandkumar},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{japkowicz2002class,\n\ttitle        = {The class imbalance problem: A systematic study},\n\tauthor       = {Nathalie Japkowicz and Shaju Stephen},\n\tyear         = 2002,\n\tjournal      = {Intelligent Data Analysis},\n\tvolume       = 6,\n\tnumber       = 5,\n\tpages        = {429--449}\n}\n@inproceedings{jarvik1991nicotine,\n\ttitle        = {Beneficial effects of nicotine},\n\tauthor       = {Murray E. 
Jarvik},\n\tyear         = 1991,\n\tbooktitle    = {British Journal of Addiction}\n}\n@article{jastrzkebski2017three,\n\ttitle        = {Three factors influencing minima in sgd},\n\tauthor       = {Jastrz{\\k{e}}bski, Stanis{\\l}aw and Kenton, Zachary and Arpit, Devansh and Ballas, Nicolas and Fischer, Asja and Bengio, Yoshua and Storkey, Amos},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.04623}\n}\n@article{jastrzkebski2018dnn,\n\ttitle        = {DNN's Sharpest Directions Along the SGD Trajectory},\n\tauthor       = {Jastrz{\\k{e}}bski, Stanis{\\l}aw and Kenton, Zachary and Ballas, Nicolas and Fischer, Asja and Bengio, Yoshua and Storkey, Amos},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.05031}\n}\n@article{javanmard2018flexible,\n\ttitle        = {A Flexible Framework for Hypothesis Testing in High-dimensions},\n\tauthor       = {Javanmard, Adel and Lee, Jason D},\n\tyear         = 2020,\n\tjournal      = {Journal of the Royal Statistical Society Series B}\n}\n@article{JavanmardM14,\n\ttitle        = {Confidence intervals and hypothesis testing for high-dimensional regression},\n\tauthor       = {Adel Javanmard and Andrea Montanari},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {2869--2909},\n\turl          = {http://dl.acm.org/citation.cfm?id=2697057},\n\ttimestamp    = {Wed, 07 Jan 2015 20:37:19 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/jmlr/JavanmardM14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{javdani2018shared,\n\ttitle        = {Shared autonomy via hindsight optimization for teleoperation and teaming},\n\tauthor       = {Shervin Javdani and Henny Admoni and Stefania Pellegrinelli and Siddhartha S Srinivasa and J Andrew Bagnell},\n\tyear         = 2018,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume   
    = 37,\n\tpages        = {717--742}\n}\n@article{javed2011efficient,\n\ttitle        = {Efficient Genomewide Selection of PCA-Correlated tSNPs for Genotype Imputation},\n\tauthor       = {Javed, Asif and Drineas, Petros and Mahoney, Michael W and Paschou, Peristera},\n\tyear         = 2011,\n\tjournal      = {Annals of human genetics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 75,\n\tnumber       = 6,\n\tpages        = {707--722}\n}\n@inproceedings{jawahar2019does,\n\ttitle        = {What does BERT learn about the structure of language?},\n\tauthor       = {Jawahar, Ganesh and Sagot, Beno{\\^\\i}t and Seddah, Djam{\\'e}},\n\tyear         = 2019,\n\tbooktitle    = {ACL 2019-57th Annual Meeting of the Association for Computational Linguistics}\n}\n@book{jaynes1996probability,\n\ttitle        = {Probability theory: the logic of science},\n\tauthor       = {Jaynes, Edwin T},\n\tyear         = 1996,\n\tpublisher    = {Washington University St. Louis, MO}\n}\n@article{jean2016combining,\n\ttitle        = {Combining satellite imagery and machine learning to predict poverty},\n\tauthor       = {Neal Jean and Marshall Burke and Michael Xie and W. Matthew Davis and David B. 
Lobell and Stefano Ermon},\n\tyear         = 2016,\n\tjournal      = {Science},\n\tvolume       = 353\n}\n@inproceedings{jean2018ssdkl,\n\ttitle        = {Semi-supervised Deep Kernel Learning: Regression with Unlabeled Data by Minimizing Predictive Variance},\n\tauthor       = {Neal Jean and Sang Michael Xie and Stefano Ermon},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{jedoui2019deep,\n\ttitle        = {Deep {B}ayesian active learning for multiple correct outputs},\n\tauthor       = {Khaled Jedoui and Ranjay Krishna and Michael Bernstein and Li Fei-Fei},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.01119}\n}\n@inproceedings{jenatton12latent,\n\ttitle        = {A latent factor model for highly multi-relational data},\n\tauthor       = {R. Jenatton and N. Le Roux and A. Bordes and G. Obozinski},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{jensen2007trax,\n\ttitle        = {TRAX: real-world tracking of moving objects},\n\tauthor       = {Jensen, Christian S. and Pakalnis, Stardas},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 33rd international conference on Very large data\n\n\t\tbases\n\t},\n\tlocation     = {Vienna, Austria},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '07},\n\tpages        = {1362--1365},\n\tisbn         = {978-1-59593-649-3},\n\tacmid        = 1326015,\n\tnumpages     = 4\n}\n@inproceedings{jeon2020sharedlatent,\n\ttitle        = {Shared Autonomy with Learned Latent Actions},\n\tauthor       = {Hong Jun Jeon and Dylan P. Losey and Dorsa Sadigh},\n\tyear         = 2020,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{JGJS,\n\ttitle        = {Introduction to variational methods for graphical models},\n\tauthor       = {M. Jordan and Z. Ghahramani and T. Jaakola and L. 
Saul},\n\tyear         = 1999,\n\tjournal      = {Machine Learning},\n\tpages        = {183--233}\n}\n@inproceedings{jha2010oracle,\n\ttitle        = {Oracle-Guided Component-Based Program Synthesis},\n\tauthor       = {Susmit Jha and Sumit Gulwani and Sanjit A. Seshia and Ashish Tiwari},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)}\n}\n@article{JHSPS-VR-Framework-parallel,\n\ttitle        = {On Variance Reduction in Stochastic Gradient Descent and its Asynchronous Variants},\n\tauthor       = {Sashank J. Reddi and Ahmed Hefny and Suvrit Sra and Barnab{\\'{a}}s P{\\'{o}}czos and Alexander J. Smola},\n\tyear         = 2015,\n\tjournal      = {NIPS}\n}\n@inproceedings{ji2011kbp,\n\ttitle        = {Overview of the {TAC} 2011 Knowledge Base Population Track},\n\tauthor       = {Heng Ji and Ralph Grishman and Hoa {Trang Dang}},\n\tyear         = 2011,\n\tbooktitle    = {Text Analytics Conference}\n}\n@inproceedings{ji2011knowledge,\n\ttitle        = {Knowledge base population: Successful approaches and challenges},\n\tauthor       = {Heng Ji and Ralph Grishman},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1148--1158}\n}\n@article{ji2018gradient,\n\ttitle        = {Gradient descent aligns the layers of deep linear networks},\n\tauthor       = {Ji, Ziwei and Telgarsky, Matus},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.02032}\n}\n@article{ji2018risk,\n\ttitle        = {Risk and parameter convergence of logistic regression},\n\tauthor       = {Ji, Ziwei and Telgarsky, Matus},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.07300}\n}\n@inproceedings{ji2019implicit,\n\ttitle        = {The implicit bias of gradient descent on nonseparable data},\n\tauthor       = {Ziwei Ji and Matus Telgarsky},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = 
{1772--1798}\n}\n@article{ji2020convergence,\n\ttitle        = {Convergence of Meta-Learning with Task-Specific Adaptation over Partial Parameters},\n\tauthor       = {Ji, Kaiyi and Lee, Jason D and Liang, Yingbin and Poor, H Vincent},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{Ji2020Neural,\n\ttitle        = {Neural tangent kernels, transportation mappings, and universal approximation},\n\tauthor       = {Ziwei Ji and Matus Telgarsky and Ruicheng Xian},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=HklQYxBKwS}\n}\n@inproceedings{jia2016recombination,\n\ttitle        = {Data Recombination for Neural Semantic Parsing},\n\tauthor       = {Robin Jia and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{jia2017adversarial,\n\ttitle        = {Adversarial Examples for Evaluating Reading Comprehension Systems},\n\tauthor       = {Robin Jia and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{jia2017concepts,\n\ttitle        = {Learning Concepts through Conversations in Spoken Dialogue Systems},\n\tauthor       = {Robin Jia and Larry Heck and Dilek Hakkani-T{\\\"u}r and Georgi Nikolov},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}\n}\n@inproceedings{jia2019certified,\n\ttitle        = {Certified Robustness to Adversarial Word Substitutions},\n\tauthor       = {Robin Jia and Aditi Raghunathan and Kerem Göksel and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{jia2019document,\n\ttitle        = {Document-Level {N}-ary Relation Extraction with Multiscale Representation Learning},\n\tauthor  
     = {Robin Jia and Cliff Wong and Hoifung Poon},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{jia2019towards,\n\ttitle        = {Towards Efficient Data Valuation Based on the Shapley Value},\n\tauthor       = {Ruoxi Jia and David Dao and Boxin Wang and Frances Ann Hubis and Nick Hynes and Nezihe Merve Gurel and Bo Li and Ce Zhang and Dawn Song and Costas Spanos},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.10275}\n}\n@inproceedings{jiang07deckard,\n\ttitle        = {{DECKARD}: Scalable and Accurate Tree-Based Detection of Code Clones},\n\tauthor       = {Lingxiao Jiang and Ghassan Misherghi and Zhendong Su  and Stephane Glondu},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)},\n\tpages        = {96--105}\n}\n@article{jiang2004kruskal,\n\ttitle        = {Kruskal's permutation lemma and the identification of CANDECOMP/PARAFAC and bilinear models with constant modulus constraints},\n\tauthor       = {Jiang, Tao and Sidiropoulos, Nicholas D},\n\tyear         = 2004,\n\tjournal      = {Signal Processing, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 52,\n\tnumber       = 9,\n\tpages        = {2625--2636}\n}\n@inproceedings{jiang2007instance,\n\ttitle        = {Instance weighting for domain adaptation in NLP},\n\tauthor       = {Jiang, Jing and Zhai, ChengXiang},\n\tyear         = 2007,\n\tbooktitle    = {ACL},\n\tvolume       = 7,\n\tpages        = {264--271}\n}\n@article{jiang2012calibrating,\n\ttitle        = {Calibrating predictive model estimates to support personalized medicine},\n\tauthor       = {Xiaoqian Jiang and Melanie Osl and Jihoon Kim and Lucila Ohno{-}Machado},\n\tyear         = 2012,\n\tjournal      = {Journal of the American Medical Informatics Association},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {263--274}\n}\n@inproceedings{jiang2012learned,\n\ttitle      
  = {Learned prioritization for trading off accuracy and speed},\n\tauthor       = {Jiarong Jiang and Adam Teichert and Jason Eisner and Hal {Daum{\\'e} III}},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{jiang2012placing,\n\ttitle        = {Learning to Place New Objects in a Scene},\n\tauthor       = {Y. Jiang and M. Lim and C. Zheng and A. Saxena},\n\tyear         = 2012,\n\tjournal      = {IJRR},\n\tvolume       = 31,\n\tnumber       = 9\n}\n@inproceedings{jiang2013hallucinated,\n\ttitle        = {Hallucinated Humans as the Hidden Context for Labeling 3{D} Scenes},\n\tauthor       = {Y. Jiang and H. Koppula and A. Saxena},\n\tyear         = 2013,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{jiang2015abstraction,\n\ttitle        = {Abstraction selection in model-based reinforcement learning},\n\tauthor       = {Jiang, Nan and Kulesza, Alex and Singh, Satinder},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@article{jiang2016contextual,\n\ttitle        = {Contextual Decision Processes with Low Bellman Rank are PAC-Learnable},\n\tauthor       = {Jiang, Nan and Krishnamurthy, Akshay and Agarwal, Alekh and Langford, John and Schapire, Robert E},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1610.09512},\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{jiang2018mentornet,\n\ttitle        = {{MentorNet}: Learning data-driven curriculum for very deep neural networks on corrupted labels},\n\tauthor       = {Lu Jiang and Zhengyuan Zhou and Thomas Leung and Li-Jia Li and Li Fei-Fei},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2304--2313}\n}\n@inproceedings{jiang2018open,\n\ttitle        = {Open problem: The dependence of sample complexity lower bounds on planning horizon},\n\tauthor       = 
{Jiang, Nan and Agarwal, Alekh},\n\tyear         = 2018,\n\tbooktitle    = {Conference On Learning Theory},\n\tpages        = {3395--3398}\n}\n@inproceedings{jiang2019abstraction,\n\ttitle        = {Language as an Abstraction for Hierarchical Deep Reinforcement Learning},\n\tauthor       = {Yiding Jiang and Shixiang (Shane) Gu and Kevin P. Murphy and Chelsea Finn},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{jiang2020can,\n\ttitle        = {How can we know what language models know?},\n\tauthor       = {Jiang, Zhengbao and Xu, Frank F and Araki, Jun and Neubig, Graham},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics},\n\tpublisher    = {MIT Press},\n\tvolume       = 8,\n\tpages        = {423--438}\n}\n@misc{jiang2020tll,\n\ttitle        = {Transfer Learning library},\n\tauthor       = {Junguang Jiang and Baixu Chen and Bo Fu and Mingsheng Long},\n\tyear         = 2020,\n\thowpublished = {\\url{https://github.com/thuml/Transfer-Learning-Library}}\n}\n@article{jiang2021offline,\n\ttitle        = {Offline Decentralized Multi-Agent Reinforcement Learning},\n\tauthor       = {Jiang, Jiechuan and Lu, Zongqing},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2108.01832}\n}\n@inproceedings{jiao2016minimax,\n\ttitle        = {Minimax estimation of the $L_1$ distance},\n\tauthor       = {Jiantao Jiao and Yanjun Han and Tsachy Weissman},\n\tyear         = 2016,\n\tbooktitle    = {IEEE International Symposium on Information Theory},\n\tpages        = {750--754}\n}\n@inproceedings{jin2017escape,\n\ttitle        = {How to escape saddle points efficiently},\n\tauthor       = {Jin, Chi and Ge, Rong and Netrapalli, Praneeth and Kakade, Sham M and Jordan, Michael I},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.00887},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = 
{1724--1732},\n\torganization = {PMLR}\n}\n@inproceedings{jin2018q,\n\ttitle        = {Is Q-learning provably efficient?},\n\tauthor       = {Jin, Chi and Allen-Zhu, Zeyuan and Bubeck, Sebastien and Jordan, Michael I},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},\n\tpages        = {4868--4878}\n}\n@article{jin2019learning,\n\ttitle        = {Learning adversarial markov decision processes with bandit feedback and unknown transition},\n\tauthor       = {Jin, Chi and Jin, Tiancheng and Luo, Haipeng and Sra, Suvrit and Yu, Tiancheng},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.01192},\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{jin2019provably,\n\ttitle        = {Provably efficient reinforcement learning with linear function approximation},\n\tauthor       = {Jin, Chi and Yang, Zhuoran and Wang, Zhaoran and Jordan, Michael I},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1907.05388},\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {2137--2143}\n}\n@article{jin2019stochastic,\n\ttitle        = {Stochastic gradient descent escapes saddle points efficiently},\n\tauthor       = {Jin, Chi and Netrapalli, Praneeth and Ge, Rong and Kakade, Sham M and Jordan, Michael I},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.04811}\n}\n@article{jin2020domain,\n\ttitle        = {Domain extrapolation via regret minimization},\n\tauthor       = {Jin, Wengong and Barzilay, Regina and Jaakkola, Tommi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.03908}\n}\n@inproceedings{jin2020reward,\n\ttitle        = {Reward-Free Exploration for Reinforcement Learning},\n\tauthor       = {Jin, Chi and Krishnamurthy, Akshay and Simchowitz, Max and Yu, Tiancheng},\n\tyear         = 2020,\n\tjournal      = {International Conference on Machine 
Learning},\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{jin2020robust,\n\ttitle        = {Is {BERT} Really Robust? A Strong Baseline for Natural Language Attack on Text Classification and Entailment},\n\tauthor       = {Di Jin and Zhijing Jin and Joey Tianyi Zhou and Peter Szolovits},\n\tyear         = 2020,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{jin2021bellman,\n\ttitle        = {Bellman Eluder Dimension: New Rich Classes of RL Problems, and Sample-Efficient Algorithms},\n\tauthor       = {Jin, Chi and Liu, Qinghua and Miryoosefi, Sobhan},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.00815}\n}\n@article{jing2020selfsupervised,\n\ttitle        = {Self-supervised Visual Feature Learning with Deep Neural Networks: A Survey},\n\tauthor       = {Longlong Jing and Yingli Tian},\n\tyear         = 2020,\n\tjournal      = {IEEE transactions on pattern analysis and machine intelligence}\n}\n@article{JL1984,\n\ttitle        = {{Extensions of Lipschitz mappings into a Hilbert space}},\n\tauthor       = {Johnson, William B. and Lindenstrauss, Joram},\n\tyear         = 1984,\n\tjournal      = {Contemporary Mathematics},\n\tvolume       = 26,\n\tnumber       = {189-206},\n\tpages        = {189--206},\n\tdoi          = {10.1090/conm/026/737400},\n\tmendeley-groups = {Algorithms/Sublinear Algorithms/JL}\n}\n@article{JMLR:v15:anandkumar14b,\n\ttitle        = {{Tensor Decompositions for Learning Latent Variable Models}},\n\tauthor       = {Animashree Anandkumar and Rong Ge and Daniel Hsu and Sham M. Kakade and Matus Telgarsky},\n\tyear         = 2014,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 15,\n\tpages        = {2773--2832},\n\turl          = {http://jmlr.org/papers/v15/anandkumar14b.html}\n}\n@inproceedings{JNS,\n\ttitle        = {Low rank matrix completion using alternating minimization},\n\tauthor       = {P. Jain and P. 
Netrapalli and S. Sanghavi},\n\tyear         = 2013,\n\tbooktitle    = {STOC},\n\tpages        = {665--674}\n}\n@inproceedings{joachims1999transductive,\n\ttitle        = {Transductive inference for text classification using support vector machines},\n\tauthor       = {Thorsten Joachims},\n\tyear         = 1999,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@manual{jocher2020yolov5,\n\ttitle        = {{YOLO}-v5 repository},\n\tauthor       = {Glenn Jocher and Alex Stoken and Jirka Borovec and NanoCode012 and ChristopherSTAN and Laughing and tkianai and Adam Hogan and lorenzomammana and yxNONG and AlexWang1900 and Laurentiu Diaconu and Marc and wanghaoyang0106 and ml5ah and Doug and Francisco Ingham and Frederik and Guilhen and Hatovix and Jake Poznanski and Jiacong Fang and Lijun Yu and changyu98 and Mingyu Wang and Naman Gupta and Osama Akhtar and PetrDvoracek and Prashant Rai},\n\tyear         = 2020\n}\n@inproceedings{joglekar2015comprehensive,\n\ttitle        = {Comprehensive and reliable crowd assessment algorithms},\n\tauthor       = {Manas Joglekar and Hector Garcia-Molina and Aditya Parameswaran},\n\tyear         = 2015,\n\tbooktitle    = {Data Engineering (ICDE), 2015 IEEE 31st International Conference on},\n\tpages        = {195--206}\n}\n@inproceedings{johansson2016learning,\n\ttitle        = {Learning Representations for Counterfactual Inference},\n\tauthor       = {Fredrik Johansson and Uri Shalit and David Sontag},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{johns2007constructing,\n\ttitle        = {Constructing basis functions from directed graphs for value function approximation},\n\tauthor       = {Johns, Jeff and Mahadevan, Sridhar},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 24th international conference on Machine learning},\n\tpages        = {385--392},\n\torganization = {ACM}\n}\n@inproceedings{Johnson013,\n\ttitle        = 
{Accelerating Stochastic Gradient Descent using Predictive Variance Reduction},\n\tauthor       = {Rie Johnson and Tong Zhang},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems 26: 27th Annual Conference on Neural Information Processing Systems 2013. Proceedings of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States.},\n\tpages        = {315--323},\n\turl          = {http://papers.nips.cc/paper/4937-accelerating-stochastic-gradient-descent-using-predictive-variance-reduction},\n\tbdsk-url-1   = {http://papers.nips.cc/paper/4937-accelerating-stochastic-gradient-descent-using-predictive-variance-reduction},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tbiburl       = {http://dblp.org/rec/bib/conf/nips/Johnson013},\n\tcrossref     = {DBLP:conf/nips/2013},\n\ttimestamp    = {Fri, 31 Jan 2014 12:11:40 +0100}\n}\n@inproceedings{johnson06adaptor,\n\ttitle        = {Adaptor Grammars: A Framework for Specifying Compositional Nonparametric {B}ayesian Models},\n\tauthor       = {Mark Johnson and Tom Griffiths and Sharon Goldwater},\n\tyear         = 2006,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {641--648}\n}\n@inproceedings{johnson07mcmc,\n\ttitle        = {{B}ayesian Inference for {PCFG}s via {M}arkov Chain {M}onte {C}arlo},\n\tauthor       = {Mark Johnson and Tom Griffiths and Sharon Goldwater},\n\tyear         = 2007,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {139--146}\n}\n@inproceedings{johnson07pos,\n\ttitle        = {Why doesn't {EM} find good {HMM} {POS}-taggers?},\n\tauthor       = {Mark Johnson},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)}\n}\n@inproceedings{johnson08synergy,\n\ttitle        = {Using Adaptor Grammars to 
Identify Synergies in the Unsupervised Acquisition of Linguistic Structure},\n\tauthor       = {Mark Johnson},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)},\n\tpages        = {398--406}\n}\n@inproceedings{johnson09segmentation,\n\ttitle        = {Improving nonparameteric {B}ayesian inference: experiments on unsupervised word segmentation with adaptor grammars},\n\tauthor       = {Mark Johnson and Sharon Goldwater},\n\tyear         = 2009,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {317--325}\n}\n@article{johnson1978densest,\n\ttitle        = {The densest hemisphere problem},\n\tauthor       = {D. S. Johnson and F. P. Preparata},\n\tyear         = 1978,\n\tjournal      = {Theoretical Computer Science},\n\tvolume       = 6,\n\tpages        = {93--107}\n}\n@article{johnson2007adjusting,\n\ttitle        = {Adjusting batch effects in microarray expression data using empirical {B}ayes methods},\n\tauthor       = {W. 
Evan Johnson and Cheng Li and Ariel Rabinovic},\n\tyear         = 2007,\n\tjournal      = {Biostatistics},\n\tvolume       = 8,\n\tpages        = {118--127}\n}\n@inproceedings{johnson2013accelerating,\n\ttitle        = {Accelerating stochastic gradient descent using predictive variance reduction},\n\tauthor       = {Johnson, Rie and Zhang, Tong},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tseries       = {NIPS 2013},\n\tpages        = {315--323},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Johnson, Zhang - 2013 - Accelerating stochastic gradient descent using predictive variance reduction.pdf:pdf},\n\tmendeley-groups = {Optimization/Variance Reduction,Optimization/[with Yuan Yang]}\n}\n@article{johnson2016crop,\n\ttitle        = {Crop yield forecasting on the Canadian Prairies by remotely sensed vegetation indices and machine learning methods},\n\tauthor       = {Michael D. Johnson and William W. Hsieh and Alex J. Cannon and Andrew Davidson and Frédéric Bédard},\n\tyear         = 2016,\n\tjournal      = {Agricultural and Forest Meteorology},\n\tvolume       = 218,\n\tpages        = {74--84}\n}\n@article{johnson2016google,\n\ttitle        = {Google's Multilingual Neural Machine Translation System: Enabling Zero-Shot Translation},\n\tauthor       = {Melvin Johnson and Mike Schuster and Quoc V. 
Le and Maxim Krikun and Yonghui Wu and Zhifeng Chen and Nikhil Thorat and Fernanda Viégas and Martin Wattenberg and Greg Corrado and Macduff Hughes and Jeffrey Dean},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.04558}\n}\n@inproceedings{johnson2016malmo,\n\ttitle        = {The Malmo Platform for Artificial Intelligence Experimentation},\n\tauthor       = {Matthew Johnson and Katja Hofmann and Tim Hutton and David Bignell},\n\tyear         = 2016,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{johnson2016perceptual,\n\ttitle        = {Perceptual losses for real-time style transfer and super-resolution},\n\tauthor       = {Justin Johnson and Alexandre Alahi and Li Fei-Fei},\n\tyear         = 2016,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {694--711}\n}\n@inproceedings{johnson2017clevr,\n\ttitle        = {CLEVR: A diagnostic dataset for compositional language and elementary visual reasoning},\n\tauthor       = {Justin Johnson and Bharath Hariharan and Laurens van der Maaten and Li Fei-Fei and C Lawrence Zitnick and Ross Girshick},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{johnson2017inferring,\n\ttitle        = {Inferring and Executing Programs for Visual Reasoning},\n\tauthor       = {Justin Johnson and Bharath Hariharan and Laurens van der Maaten and Judy Hoffman and Li Fei-Fei and C Lawrence Zitnick and Ross Girshick},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{johnson2018multicalibration,\n\ttitle        = {Multicalibration: Calibration for the (Computationally-Identifiable) Masses},\n\tauthor       = {Ursula Hebert-Johnson and Michael P. Kim and Omer Reingold and Guy N. 
Rothblum},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{johnson2019billion,\n\ttitle        = {Billion-scale similarity search with GPUs},\n\tauthor       = {Jeff Johnson and Matthijs Douze and Herv{\\'e} J{\\'e}gou},\n\tyear         = 2019,\n\tbooktitle    = {IEEE Transactions on Big Data}\n}\n@article{johnson84randproj,\n\ttitle        = {Extensions of {L}ipschitz maps into a {H}ilbert space},\n\tauthor       = {W. Johnson and J. Lindenstrauss},\n\tyear         = 1984,\n\tjournal      = {Contemporary Mathematics},\n\tvolume       = 26,\n\tpages        = {189--206}\n}\n@article{johnson98parent,\n\ttitle        = {{PCFG} Models of Linguistic Tree Representations},\n\tauthor       = {Mark Johnson},\n\tyear         = 1998,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 24,\n\tpages        = {613--632}\n}\n@article{jojic04phylohmm,\n\ttitle        = {Efficient approximations for learning phylogenetic {HMM} models from data},\n\tauthor       = {V. Jojic and N. Jojic and C. Meek and D. Geiger and A. Siepel and D. Haussler and D. Heckerman},\n\tyear         = 2004,\n\tjournal      = {Bioinformatics},\n\tvolume       = 20,\n\tpages        = {161--168}\n}\n@book{jolliffe2002principal,\n\ttitle        = {Principal Component Analysis},\n\tauthor       = {I.T. 
Jolliffe},\n\tyear         = 2002,\n\tpublisher    = {Springer Verlag},\n\tisbn         = {0-387-95442-2},\n\tedition      = {2nd},\n\tabstract     = {\n\t\tseems like a great book on PCA - it shows the connection between PCA\n\n\t\tand SVD; talks about how to choose the number of eigenvectors to\n\n\t\tkeep; discusses outlier detection; uses PCA for stock prices (Dow\n\n\t\tJones)\n\t},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{jones1972tfidf,\n\ttitle        = {A statistical interpretation of term specificity and its application in retrieval},\n\tauthor       = {Karen Sparck Jones},\n\tyear         = 1972,\n\tjournal      = {Journal of documentation},\n\tvolume       = 28\n}\n@inproceedings{jones2003active,\n\ttitle        = {Active learning for information extraction with multiple view feature sets},\n\tauthor       = {Rosie Jones and Rayid Ghani and Tom Mitchell and Ellen Riloff},\n\tyear         = 2003,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {26--34}\n}\n@article{jones2004markov,\n\ttitle        = {On the {Markov} chain central limit theorem},\n\tauthor       = {Galin L. Jones},\n\tyear         = 2004,\n\tjournal      = {Probability Surveys},\n\tvolume       = 1\n}\n@inproceedings{jones2006generating,\n\ttitle        = {Generating query substitutions},\n\tauthor       = {Rosie Jones and Benjamin Rey and Omid Madani and Wiley Greiner},\n\tyear         = 2006,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {387--396}\n}\n@article{jones2009scale,\n\ttitle        = {Developing a sense of scale: Looking backward},\n\tauthor       = {M. Gail Jones and Amy R. 
Taylor},\n\tyear         = 2009,\n\tjournal      = {Journal of Research in Science Teaching},\n\tvolume       = 46,\n\tpages        = {460--475}\n}\n@inproceedings{jones2020roben,\n\ttitle        = {Robust Encodings: A Framework for Combating Adversarial Typos},\n\tauthor       = {Erik Jones and Robin Jia and Aditi Raghunathan and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{jones2021selective,\n\ttitle        = {Selective Classification Can Magnify Disparities Across Groups},\n\tauthor       = {Erik Jones and Shiori Sagawa and Pang Wei Koh and Ananya Kumar and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{jong2008hierarchical,\n\ttitle        = {Hierarchical model-based reinforcement learning: {R}-max+ MAX{Q}},\n\tauthor       = {N. K. Jong and P. Stone},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {432--439}\n}\n@article{jonschkowski2015statereps,\n\ttitle        = {Learning state representations with robotic priors},\n\tauthor       = {Rico Jonschkowski and Oliver Brock},\n\tyear         = 2015,\n\tjournal      = {Autonomous Robots},\n\tvolume       = 39,\n\tpages        = {407--428}\n}\n@article{jonsen2005robust,\n\ttitle        = {Robust state--space modeling of animal movement data},\n\tauthor       = {Ian D Jonsen and Joanna Mills Flemming and Ransom A Myers},\n\tyear         = 2005,\n\tjournal      = {Ecology},\n\tvolume       = 86,\n\tnumber       = 11,\n\tpages        = {2874--2880}\n}\n@article{jordan1999variational,\n\ttitle        = {An Introduction to Variational Methods for Graphical Models},\n\tauthor       = {Michael I. Jordan and Zoubin Ghahramani and Tommi S.  Jaakkola and Lawrence K. 
Saul},\n\tyear         = 1999,\n\tjournal      = {Machine Learning},\n\tvolume       = 37,\n\tpages        = {183--233}\n}\n@article{jordan2018communication,\n\ttitle        = {Communication-efficient distributed statistical learning},\n\tauthor       = {Jordan, Michael I and Lee, Jason D and Yang, Yun},\n\tyear         = 2018,\n\tjournal      = {Journal of the American Statistical Association}\n}\n@misc{Jordanbook,\n\ttitle        = {An Introduction to Graphical Models},\n\tauthor       = {Michael I. Jordan and Christopher M. Bishop},\n\tpublisher    = {forthcoming.}\n}\n@inproceedings{jorgensen2015,\n\ttitle        = {Challenges of studying and processing dialects in social media},\n\tauthor       = {Anna Katrine Jørgensen and Dirk Hovy and Anders Søgaard},\n\tyear         = 2015,\n\tbooktitle    = {ACL Workshop on Noisy User-generated Text},\n\tpages        = {9--18}\n}\n@inproceedings{joseph2016,\n\ttitle        = {Rawlsian Fairness for Machine Learning},\n\tauthor       = {Matthew Joseph and Michael Kearns and Jamie Morgenstern and Seth Neel and Aaron Roth},\n\tyear         = 2016,\n\tbooktitle    = {FATML}\n}\n@inproceedings{joseph2016fairness,\n\ttitle        = {Fairness in learning: Classic and contextual bandits},\n\tauthor       = {Joseph, Matthew and Kearns, Michael and Morgenstern, Jamie H and Roth, Aaron},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {325--333}\n}\n@article{joseph2017fair,\n\ttitle        = {Fair Algorithms for Infinite and Contextual Bandits},\n\tauthor       = {Matthew Joseph and Michael Kearns and Jamie Morgenstern and Seth Neel and Aaron Roth},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1610.09559}\n}\n@inproceedings{joshi2009multi,\n\ttitle        = {Multi-class active learning for image classification},\n\tauthor       = {Ajay J Joshi and Fatih Porikli and Nikolaos Papanikolopoulos},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision 
and Pattern Recognition (CVPR)},\n\tpages        = {2372--2379}\n}\n@inproceedings{joshi2017triviaqa,\n\ttitle        = {{TriviaQA}: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n\tauthor       = {Mandar Joshi and Eunsol Choi and Daniel Weld and Luke Zettlemoyer},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{joshi2018xgems,\n\ttitle        = {{xGEMs}: Generating examplars to explain black-box models},\n\tauthor       = {Shalmali Joshi and Oluwasanmi Koyejo and Been Kim and Joydeep Ghosh},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.08867}\n}\n@article{joshi2019spanbert,\n\ttitle        = {Span{BERT}: Improving Pre-training by Representing and Predicting Spans},\n\tauthor       = {Mandar Joshi and Danqi Chen and Yinhan Liu and Daniel S. Weld and Luke Zettlemoyer and Omer Levy},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.10529}\n}\n@article{joshi2020spanbert,\n\ttitle        = {Spanbert: Improving pre-training by representing and predicting spans},\n\tauthor       = {Joshi, Mandar and Chen, Danqi and Liu, Yinhan and Weld, Daniel S and Zettlemoyer, Luke and Levy, Omer},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics},\n\tpublisher    = {MIT Press},\n\tvolume       = 8,\n\tpages        = {64--77}\n}\n@article{josse2014stable,\n\ttitle        = {Stable Autoencoding: A Flexible Framework for Regularized Low-Rank Matrix Estimation},\n\tauthor       = {Julie Josse and Stefan Wager},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1410.8275}\n}\n@article{joulin2015stack,\n\ttitle        = {Inferring Algorithmic Patterns with Stack-Augmented Recurrent Nets},\n\tauthor       = {Armand Joulin and Tomas Mikolov},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1503.01007}\n}\n@article{jozefowicz2016exploring,\n\ttitle        = {Exploring the 
Limits of Language Modeling},\n\tauthor       = {Rafal Jozefowicz and Oriol Vinyals and Mike Schuster and Noam Shazeer and Yonghui Wu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.02410}\n}\n@inproceedings{judge06qtb,\n\ttitle        = {Question-Bank: creating a corpus of parse-annotated questions},\n\tauthor       = {J. Judge and A. Cahill and J. v. Genabith},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {497--504}\n}\n@misc{Juditsky13-lecture,\n\ttitle        = {Convex Optimization II: Algorithms},\n\tauthor       = {Anatoli Juditsky},\n\tyear         = 2013,\n\tmonth        = nov,\n\thowpublished = {Lecture notes}\n}\n@article{juditsky2011solving,\n\ttitle        = {Solving variational inequalities with stochastic mirror-prox algorithm},\n\tauthor       = {Juditsky, Anatoli and Nemirovski, Arkadi and Tauvel, Claire and others},\n\tyear         = 2011,\n\tjournal      = {Stochastic Systems},\n\tpublisher    = {INFORMS Applied Probability Society},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {17--58}\n}\n@inproceedings{julier1997new,\n\ttitle        = {A New Extension of the Kalman Filter to nonlinear Systems},\n\tauthor       = {Simon J. Julier and Jeffery K. Uhlmann},\n\tyear         = 1997,\n\tbooktitle    = {\n\t\tThe Proceedings of AeroSense: The 11th International Symposium on\n\n\t\tAerospace/Defense Sensing, Simulation and Controls, Multi Sensor\n\n\t\tFusion, Tracking and Resource Management\n\t}\n}\n@article{julier2004unscented,\n\ttitle        = {Unscented filtering and nonlinear estimation},\n\tauthor       = {Simon J. Julier and Jeffrey K. 
Uhlmann},\n\tyear         = 2004,\n\tjournal      = {Proceedings of the IEEE},\n\tvolume       = 92,\n\tnumber       = 3,\n\tpages        = {401--422}\n}\n@article{jung2018omitted,\n\ttitle        = {Omitted and Included Variable Bias in Tests for Disparate Impact},\n\tauthor       = {Jongbin Jung and Sam Corbett-Davies and Ravi Shroff and Sharad Goel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.05651}\n}\n@inproceedings{jung2020fair,\n\ttitle        = {Fair prediction with endogenous behavior},\n\tauthor       = {Christopher Jung and Sampath Kannan and Changhwa Lee and Mallesh Pai and Aaron Roth and Rakesh Vohra},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the 21st ACM Conference on Economics and Computation},\n\tpages        = {677--678}\n}\n@article{jung2020moment,\n\ttitle        = {Moment multicalibration for uncertainty estimation},\n\tauthor       = {Jung, Christopher and Lee, Changhwa and Pai, Mallesh M and Roth, Aaron and Vohra, Rakesh},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.08037}\n}\n@book{jurafsky2000speech,\n\ttitle        = {Speech and language processing: An introduction to natural language processing, computational linguistics, and speech recognition},\n\tauthor       = {Daniel Jurafsky and James H Martin},\n\tyear         = 2000,\n\tpublisher    = {Prentice Hall}\n}\n@article{jurczyk2016selqa,\n\ttitle        = {SelQA: A New Benchmark for Selection-based Question Answering},\n\tauthor       = {Tomasz Jurczyk and Michael Zhai and Jinho D. 
Choi},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{jurgens2017,\n\ttitle        = {Incorporating Dialectal Variability for Socially Equitable Language Identification},\n\tauthor       = {David Jurgens and Yulia Tsvetkov and Dan Jurafsky},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {51--57}\n}\n@article{k10,\n\ttitle        = {Products of random matrices: Dimension and growth in norm},\n\tauthor       = {Kargin, Vladislav},\n\tyear         = 2010,\n\tjournal      = {The Annals of Applied Probability},\n\tpublisher    = {Institute of Mathematical Statistics, \\url{https://arxiv.org/pdf/0903.0632.pdf}},\n\tvolume       = 20,\n\tnumber       = 3,\n\tpages        = {890--906}\n}\n@article{Kaczmarz1937,\n\ttitle        = {Angen{\\\"a}herte aufl{\\\"o}sung von systemen linearer gleichungen},\n\tauthor       = {Kaczmarz, Stefan},\n\tyear         = 1937,\n\tjournal      = {Bulletin International de l’Academie Polonaise des Sciences et des Lettres},\n\tvolume       = 35,\n\tpages        = {355--357}\n}\n@inproceedings{kadlec2016text,\n\ttitle        = {Text Understanding with the Attention Sum Reader Network},\n\tauthor       = {Rudolf Kadlec and Martin Schmid and Ondrej Bajgar and Jan Kleindienst},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{kaelbling1993learning,\n\ttitle        = {Learning to Achieve Goals},\n\tauthor       = {Leslie Kaelbling},\n\tyear         = 1993,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{kaelbling1998planning,\n\ttitle        = {Planning and acting in partially observable stochastic domains},\n\tauthor       = {Leslie Pack Kaelbling and Michael L Littman and Anthony R Cassandra},\n\tyear         = 1998,\n\tjournal      = {Artificial intelligence},\n\tvolume       = 101,\n\tnumber       = 1,\n\tpages        = 
{99--134}\n}\n@inproceedings{kaelbling2011hierarchical,\n\ttitle        = {Hierarchical task and motion planning in the now},\n\tauthor       = {L. P.  Kaelbling and T. Lozano-P{\'e}rez},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{kagami2003measurement,\n\ttitle        = {Measurement and comparison of human and humanoid walking},\n\tauthor       = {\n\t\tKagami, S. and Mochimaru, M. and Ehara, Y. and Miyata, N. and Nishiwaki,\n\n\t\tK. and Kanade, T. and Inoue, H.\n\t},\n\tyear         = 2003,\n\tmonth        = jul,\n\tbooktitle    = {\n\t\tProceedings of 2003 IEEE International Symposium on Computational\n\n\t\tIntelligence in Robotics and Automation\n\t},\n\tvolume       = 2,\n\tpages        = {918--922 vol.2},\n\tissn         = {},\n\tabstract     = {\n\t\tThis paper describes our research efforts aimed at understanding\n\n\t\thuman being walking functions. Using motion capture system, force\n\n\t\tplates and distributed force sensors, both human being and humanoid\n\n\t\tH7 walk motion were captured. Experimental results are shown. Comparison\n\n\t\tin between human being with H7 walk in following points are discussed:\n\n\t\t1) ZMP trajectories, 2) torso movement, 3) free leg trajectories,\n\n\t\t4) joint angle usage, 5) joint torque usage. 
Furthermore, application\n\n\t\tto the humanoid robot is discussed.\n\t},\n\tkeywords     = {\n\t\tdistributed force sensors; force plates; free leg trajectories; human\n\n\t\tbeing walking functions; humanoid robot; humanoid walking; joint\n\n\t\tangle usage; joint torque usage; motion capture system; torso movement;\n\n\t\tdistributed sensors; force sensors; legged locomotion; motion control;\n\n\t\tmotion measurement;\n\t},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@misc{kaggle2015airline,\n\ttitle        = {2015 Flight Delays and Cancellations},\n\tauthor       = {DoT},\n\tyear         = 2017,\n\thowpublished = {\\url{https://www.kaggle.com/usdot/flight-delays}}\n}\n@article{kahn2017uncertaintyaware,\n\ttitle        = {Uncertainty-Aware Reinforcement Learning for Collision Avoidance},\n\tauthor       = {Gregory Kahn and Adam Villaflor and Vitchyr Pong and Pieter Abbeel and Sergey Levine},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{kaiser2019model,\n\ttitle        = {Model-based reinforcement learning for atari},\n\tauthor       = {Kaiser, Lukasz and Babaeizadeh, Mohammad and Milos, Piotr and Osinski, Blazej and Campbell, Roy H and Czechowski, Konrad and Erhan, Dumitru and Finn, Chelsea and Kozakowski, Piotr and Levine, Sergey and Sepassi, Ryan and Tucker, George and Michalewski, Henryk},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.00374}\n}\n@inproceedings{kakade02objective,\n\ttitle        = {An Alternate Objective Function for {M}arkovian Fields},\n\tauthor       = {Sham Kakade and Yee Whye Teh and Sam Roweis},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{kakade2001natural,\n\ttitle        = {A natural policy gradient},\n\tauthor       = {Kakade, Sham M},\n\tyear         = 2001,\n\tjournal      = {Advances in neural information processing systems},\n\tbooktitle    = {Advances in neural information processing systems},\n\tvolume     
  = 14,\n\tpages        = {1531--1538}\n}\n@inproceedings{kakade2002approximately,\n\ttitle        = {Approximately optimal approximate reinforcement learning},\n\tauthor       = {Kakade, Sham and Langford, John},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {267--274},\n\torganization = {Morgan Kaufmann Publishers Inc.}\n}\n@phdthesis{kakade2003sample,\n\ttitle        = {On the sample complexity of reinforcement learning},\n\tauthor       = {Kakade, Sham Machandranath},\n\tyear         = 2003,\n\tschool       = {UCL (University College London)}\n}\n@incollection{kakade2007multi,\n\ttitle        = {Multi-view regression via canonical correlation analysis},\n\tauthor       = {Kakade, Sham M and Foster, Dean P},\n\tyear         = 2007,\n\tbooktitle    = {Learning theory},\n\tpublisher    = {Springer},\n\tseries       = {Lecture Notes in Computer Science},\n\tvolume       = 4539,\n\tpages        = {82--96},\n\teditor       = {Nader H. Bshouty and Claudio Gentile}\n}\n@techreport{Kakade2009,\n\ttitle        = {{On the duality of strong convexity and strong smoothness: Learning applications and matrix regularization}},\n\tauthor       = {Kakade, Sham M. and {Shalev-Shwartz}, Shai and Tewari, Ambuj},\n\tyear         = 2009,\n\tbooktitle    = {\\ldots Manuscript, http://ttic. \\ldots},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Kakade, Shalev-Shwartz, Tewari - 2009 - On the duality of strong convexity and strong smoothness Learning applications and matrix regula.pdf:pdf},\n\tmendeley-groups = {Optimization/General Theory}\n}\n@inproceedings{kakade2009complexity,\n\ttitle        = {On the complexity of linear prediction: Risk bounds, margin bounds, and regularization},\n\tauthor       = {Sham M. 
Kakade and Karthik Sridharan and Ambuj Tewari},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{kakade2011efficient,\n\ttitle        = {Efficient learning of generalized linear and single index models with isotonic regression},\n\tauthor       = {Kakade, Sham M and Kanade, Varun and Shamir, Ohad and Kalai, Adam},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {927--935}\n}\n@article{kakade2018provable,\n\ttitle        = {Provably Correct Automatic Subdifferentiation for Qualified Programs},\n\tauthor       = {Kakade, Sham and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {Neural Information Processing Systems (NIPS)}\n}\n@article{kakade2020information,\n\ttitle        = {Information theoretic regret bounds for online nonlinear control},\n\tauthor       = {Kakade, Sham and Krishnamurthy, Akshay and Lowrey, Kendall and Ohnishi, Motoya and Sun, Wen},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.12466}\n}\n@inproceedings{kakadecca,\n\ttitle        = {Multi-view Regression Via Canonical Correlation Analysis},\n\tauthor       = {Sham M. Kakade and Dean P. Foster},\n\tyear         = 2007,\n\tbooktitle    = {COLT},\n\tpublisher    = {Springer},\n\tseries       = {Lecture Notes in Computer Science},\n\tvolume       = 4539,\n\tpages        = {82--96},\n\teditor       = {Nader H. Bshouty and Claudio Gentile}\n}\n@inproceedings{kalai2010efficiently,\n\ttitle        = {Efficiently learning mixtures of two {G}aussians},\n\tauthor       = {Adam Tauman Kalai and Ankur Moitra and Gregory Valiant},\n\tyear         = 2010,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {553--562}\n}\n@inproceedings{KalaiEtal:GaussianMixture,\n\ttitle        = {Efficiently learning mixtures of two Gaussians},\n\tauthor       = {A. T. Kalai and A. Moitra and G. 
Valiant},\n\tyear         = 2010,\n\tbooktitle    = {STOC}\n}\n@article{kalashnikov2018qt,\n\ttitle        = {Qt-opt: Scalable deep reinforcement learning for vision-based robotic manipulation},\n\tauthor       = {Kalashnikov, Dmitry and Irpan, Alex and Pastor, Peter and Ibarz, Julian and Herzog, Alexander and Jang, Eric and Quillen, Deirdre and Holly, Ethan and Kalakrishnan, Mrinal and Vanhoucke, Vincent and others},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.10293}\n}\n@article{kalbfleisch1984least,\n\ttitle        = {Least-squares estimation of transition probabilities from aggregate data},\n\tauthor       = {John David Kalbfleisch and Jerald F Lawless},\n\tyear         = 1984,\n\tjournal      = {Canadian Journal of Statistics},\n\tvolume       = 12,\n\tnumber       = 3,\n\tpages        = {169--182}\n}\n@inproceedings{kalchbrenner2013recurrent,\n\ttitle        = {Recurrent Continuous Translation Models},\n\tauthor       = {Nal Kalchbrenner and Phil Blunsom},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1700--1709}\n}\n@article{kalman1960new,\n\ttitle        = {A New Approach to Linear Filtering and Prediction Problems},\n\tauthor       = {Kalman, Rudolf E.},\n\tyear         = 1960,\n\tjournal      = {Transactions of the ASME -- Journal of Basic Engineering},\n\tvolume       = {82 (Series D)},\n\tpages        = {35--45},\n\tciteulike-article-id = 347166,\n\tkeywords     = {kalman-filter, statistics, time-series},\n\tposted-at    = {2007-10-16 18:40:17},\n\tpriority     = 3\n}\n@inproceedings{kalpakis2001distance,\n\ttitle        = {Distance Measures for Effective Clustering of ARIMA Time-Series},\n\tauthor       = {Konstantinos Kalpakis and Dhiral Gada and Vasundhara Puttagunta},\n\tyear         = 2001,\n\tbooktitle    = {\n\t\tICDM 2001: Proceeding of 2001 IEEE International Conference on Data\n\n\t\tMining\n\t},\n\tpages        = 
{273--280}\n}\n@article{kalyanpur2012structured,\n\ttitle        = {Structured data and inference in DeepQA},\n\tauthor       = {Aditya Kalyanpur and Branimir K. Boguraev and Siddharth Patwardhan and J. William Murdock and Adam Lally and Christopher A. Welty and John M. Prager and Bonaventura Coppola and Achille Fokoue-Nkoutche and Lei Zhang and Yue Pan and Zhao Ming Qui},\n\tyear         = 2012,\n\tjournal      = {{IBM} Journal of Research and Development},\n\tvolume       = 56,\n\tpages        = {351--364}\n}\n@inproceedings{kamath2020squads,\n\ttitle        = {Selective Question Answering under Domain Shift},\n\tauthor       = {Amita Kamath and Robin Jia and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{kamble2015truth,\n\ttitle        = {Truth Serums for Massively Crowdsourced Evaluation Tasks},\n\tauthor       = {Vijay Kamble and Nihar Shah and David Marn and Abhay Parekh and Kannan Ramachandran},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{kamienny2020learning,\n\ttitle        = {Learning Adaptive Exploration Strategies in Dynamic Environments Through Informed Policy Regularization},\n\tauthor       = {Pierre-Alexandre Kamienny and Matteo Pirotta and Alessandro Lazaric and Thibault Lavril and Nicolas Usunier and Ludovic Denoyer},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.02934}\n}\n@article{kamiran2012data,\n\ttitle        = {Data preprocessing techniques for classification without discrimination},\n\tauthor       = {Faisal Kamiran and Toon Calders},\n\tyear         = 2012,\n\tjournal      = {Knowledge and Information Systems},\n\tvolume       = 33,\n\tnumber       = 1,\n\tpages        = {1--33}\n}\n@incollection{kamp05drt,\n\ttitle        = {Discourse Representation Theory},\n\tauthor       = {Hans Kamp and Josef van Genabith and Uwe Reyle},\n\tyear         = 2005,\n\tbooktitle    = {Handbook of Philosophical 
Logic}\n}\n@book{kamp93drt,\n\ttitle        = {From Discourse to Logic: An Introduction to the Model-theoretic Semantics of Natural Language, Formal Logic and Discourse Representation Theory},\n\tauthor       = {H. Kamp and U. Reyle},\n\tyear         = 1993,\n\tpublisher    = {Kluwer, Dordrecht}\n}\n@article{kane2017robust,\n\ttitle        = {Robust polynomial regression up to the information theoretic limit},\n\tauthor       = {Daniel Kane and Sushrut Karmalkar and Eric Price},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{kang2019decoupling,\n\ttitle        = {Decoupling Representation and Classifier for Long-Tailed Recognition},\n\tauthor       = {Bingyi Kang and Saining Xie and Marcus Rohrbach and Zhicheng Yan and Albert Gordo and Jiashi Feng and Yannis Kalantidis},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@inproceedings{kang2019recommendation,\n\ttitle        = {Recommendation as a Communication Game: Self-Supervised Bot-Play for Goal-oriented Dialogue},\n\tauthor       = {Dongyeop Kang and Anusha Balakrishnan and Pararth Shah and Paul A. 
Crook and Y-Lan Boureau and Jason Weston},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{kang2019testing,\n\ttitle        = {Testing Robustness Against Unforeseen Adversaries},\n\tauthor       = {Daniel Kang and Yi Sun and Dan Hendrycks and Tom Brown and Jacob Steinhardt},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1908.08016}\n}\n@article{kannan2004clusterings,\n\ttitle        = {On clusterings: Good, bad and spectral},\n\tauthor       = {Kannan, Ravi and Vempala, Santosh and Vetta, Adrian},\n\tyear         = 2004,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM New York, NY, USA},\n\tvolume       = 51,\n\tnumber       = 3,\n\tpages        = {497--515}\n}\n@inproceedings{kannan2016adversarial,\n\ttitle        = {Adversarial Evaluation of Dialogue Models},\n\tauthor       = {Anjuli Kannan and Oriol Vinyals},\n\tyear         = 2016,\n\tbooktitle    = {NIPS 2016 Workshop on Adversarial Training}\n}\n@inproceedings{kannan2016smart,\n\ttitle        = {Smart Reply: Automated Response Suggestion for Email},\n\tauthor       = {Anjuli Kannan and Karol Kurach and Sujith Ravi and Tobias Kaufmann and Andrew Tomkins and Balint Miklos and Greg Corrado and Laszlo Lukacs and Marina Ganea and Peter Young and \tVivek Ramavajjala},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {955--964}\n}\n@article{kannan2018adversarial,\n\ttitle        = {Adversarial logit pairing},\n\tauthor       = {Harini Kannan and Alexey Kurakin and Ian Goodfellow},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.06373}\n}\n@article{kannan97convex,\n\ttitle        = {Random walks and an ${O}^*(n^5)$ volume algorithm for convex bodies},\n\tauthor       = {R. Kannan and L. Lovasz and M. 
Simonovits},\n\tyear         = 1997,\n\tjournal      = {Random Structures and Algorithms},\n\tvolume       = 11,\n\tpages        = {1--50}\n}\n@article{kansagara2011risk,\n\ttitle        = {Risk prediction models for hospital readmission: a systematic review},\n\tauthor       = {Devan Kansagara and Honora Englander and Amanda Salanitro and David Kagen and Cecelia Theobald and Michele Freeman and Sunil Kripalani},\n\tyear         = 2011,\n\tjournal      = {JAMA},\n\tvolume       = 306,\n\tnumber       = 15,\n\tpages        = {1688--1698}\n}\n@inproceedings{kantas2009overview,\n\ttitle        = {An overview of Sequential {M}onte {C}arlo Methods for Parameter Estimation in General State-Space Models},\n\tauthor       = {Kantas, Nicholas and Doucet, Arnaud and Singh, Sumeetpal Sindhu and Maciejowski, Jan},\n\tyear         = 2009,\n\tbooktitle    = {15th IFAC Symposium on System Identification},\n\tvolume       = 15,\n\tpages        = {774--785}\n}\n@inproceedings{kao2014formalizing,\n\ttitle        = {Formalizing the Pragmatics of Metaphor Understanding},\n\tauthor       = {Justine T Kao and Leon Bergen and Noah Goodman},\n\tyear         = 2014,\n\tbooktitle    = {CogSci}\n}\n@article{kao2015pun,\n\ttitle        = {A Computational Model of Linguistic Humor in Puns},\n\tauthor       = {Justine T. Kao and Roger Levy and Noah D. Goodman},\n\tyear         = 2015,\n\tjournal      = {Cognitive Science}\n}\n@inproceedings{kappes2013benchmark,\n\ttitle        = {A Comparative Study of Modern Inference Techniques for Discrete Energy Minimization Problem},\n\tauthor       = {Jörg H. Kappes and Bjoern Andres and Fred A. Hamprecht and Christoph Schnörr and Sebastian Nowozin and Dhruv Batra and Sungwoong Kim and Bernhard X. 
Kausler and Jan Lellmann and Nikos Komodakis and Carsten Rother},\n\tyear         = 2013,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{kapturowski2019recurrent,\n\ttitle        = {Recurrent experience replay in distributed reinforcement learning},\n\tauthor       = {Steven Kapturowski and Georg Ostrovski and John Quan and Remi Munos and Will Dabney},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{karamcheti2017draggns,\n\ttitle        = {A Tale of Two DRAGGNs: A Hybrid Approach for Interpreting Action-Oriented and Goal-Oriented Instructions},\n\tauthor       = {Siddharth Karamcheti and Edward C. Williams and Dilip Arumugam and Mina Rhee and Nakul Gopalan and Lawson L. S. Wong and Stefanie Tellex},\n\tyear         = 2017,\n\tbooktitle    = {First Workshop on Language Grounding for Robotics @ ACL}\n}\n@inproceedings{karamcheti2020decomposition,\n\ttitle        = {Learning Adaptive Language Interfaces through Decomposition},\n\tauthor       = {Siddharth Karamcheti and Dorsa Sadigh and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {EMNLP Workshop for Interactive and Executable Semantic Parsing (IntEx-SemPar)}\n}\n@inproceedings{karamcheti2021vla,\n\ttitle        = {Learning Visually Guided Latent Actions for Assistive Teleoperation},\n\tauthor       = {Siddharth Karamcheti and A. Zhai and Dylan P. Losey and Dorsa Sadigh},\n\tyear         = 2021,\n\tbooktitle    = {Learning for Dynamics \\& Control Conference (L4DC)}\n}\n@inproceedings{karampatziakis2014discriminative,\n\ttitle        = {Discriminative Features via Generalized Eigenvectors},\n\tauthor       = {Karampatziakis, Nikos and Mineiro, Paul},\n\tyear         = 2014,\n\tbooktitle    = {ICML},\n\tpages        = {494--502}\n}\n@article{karger2014budget,\n\ttitle        = {Budget-optimal task allocation for reliable crowdsourcing systems},\n\tauthor       = {David R. 
Karger and Sewoong Oh and Devavrat Shah},\n\tyear         = 2014,\n\tjournal      = {Operations Research},\n\tvolume       = 62,\n\tnumber       = 1,\n\tpages        = {1--24}\n}\n@inproceedings{karimi2016linear,\n\ttitle        = {Linear convergence of gradient and proximal-gradient methods under the polyak-{\\l}ojasiewicz condition},\n\tauthor       = {Karimi, Hamed and Nutini, Julie and Schmidt, Mark},\n\tyear         = 2016,\n\tbooktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},\n\tpages        = {795--811},\n\torganization = {Springer}\n}\n@inproceedings{Karnin2015online,\n\ttitle        = {Online PCA with spectral bounds},\n\tauthor       = {Karnin, Zohar and Liberty, Edo},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 28th Annual Conference on Computational Learning Theory (COLT)},\n\tpages        = {505--509}\n}\n@inproceedings{karpathy2015deep,\n\ttitle        = {Deep visual-semantic alignments for generating image descriptions},\n\tauthor       = {Andrej Karpathy and Li Fei-Fei},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3128--3137}\n}\n@article{karras2017progressive,\n\ttitle        = {Progressive Growing of GANs for Improved Quality, Stability, and Variation},\n\tauthor       = {Tero Karras and Timo Aila and Samuli Laine and Jaakko Lehtinen},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.10196}\n}\n@article{kasiviswanathan2011can,\n\ttitle        = {What can we learn privately?},\n\tauthor       = {Shiva Prasad Kasiviswanathan and Homin K Lee and Kobbi Nissim and Sofya Raskhodnikova and Adam Smith},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 40,\n\tnumber       = 3,\n\tpages        = {793--826}\n}\n@phdthesis{kassel1995comparison,\n\ttitle        = {A comparison of approaches to on-line handwritten character recognition},\n\tauthor       = {Robert H 
Kassel},\n\tyear         = 1995,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@inproceedings{kate05funql,\n\ttitle        = {Learning to Transform Natural to Formal Languages},\n\tauthor       = {Rohit J. Kate and Yuk Wah Wong and Raymond J. Mooney},\n\tyear         = 2005,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1062--1068}\n}\n@inproceedings{kate06krisp,\n\ttitle        = {Using String-Kernels for Learning Semantic Parsers},\n\tauthor       = {Rohit J. Kate and Raymond J. Mooney},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {913--920}\n}\n@inproceedings{kate07krisper,\n\ttitle        = {Learning Language Semantics from Ambiguous Supervision},\n\tauthor       = {Rohit J. Kate and Raymond J. Mooney},\n\tyear         = 2007,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {895--900}\n}\n@article{katona2018parking,\n\ttitle        = {On the Capital Market Consequences of Alternative Data: Evidence from Outer Space},\n\tauthor       = {Zsolt Katona and Marcus Painter and Panos N. 
Patatoukas and Jean Zeng},\n\tyear         = 2018,\n\tjournal      = {Miami Behavioral Finance Conference}\n}\n@article{Katyusha2016,\n\ttitle        = {Katyusha: Accelerated Variance Reduction for Faster {SGD}},\n\tauthor       = {Zeyuan Allen Zhu},\n\tyear         = 2016,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1603.05953},\n\turl          = {http://arxiv.org/abs/1603.05953},\n\ttimestamp    = {Sat, 02 Apr 2016 11:49:48 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/Zhu16c},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{katz2002omnibase,\n\ttitle        = {Omnibase: Uniform access to heterogeneous data for question answering},\n\tauthor       = {Boris Katz and Sue Felshin and Deniz Yuret and Ali Ibrahim and Jimmy Lin and Gregory Marton and Alton Jerome McFarland and Baris Temelkuran},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Application of Natural Language to Information Systems}\n}\n@article{katz2017reluplex,\n\ttitle        = {Reluplex: An Efficient {SMT} Solver for Verifying Deep Neural Networks},\n\tauthor       = {Guy Katz and Clark Barrett and David Dill and Kyle Julian and Mykel Kochenderfer},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.01135}\n}\n@article{katz2017towards,\n\ttitle        = {Towards proving the adversarial robustness of deep neural networks},\n\tauthor       = {Guy Katz and Clark Barrett and David L. Dill and Kyle Julian and Mykel J. 
Kochenderfer},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{kaufmann2020adaptive,\n\ttitle        = {Adaptive Reward-Free Exploration},\n\tauthor       = {Kaufmann, Emilie and M{\\'e}nard, Pierre and Domingues, Omar Darwiche and Jonsson, Anders and Leurent, Edouard and Valko, Michal},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.06294}\n}\n@inproceedings{kaushik2019learning,\n\ttitle        = {Learning The Difference That Makes A Difference With Counterfactually-Augmented Data},\n\tauthor       = {Divyansh Kaushik and Eduard Hovy and Zachary Lipton},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{Kawaguchi,\n\ttitle        = {{Deep Learning without Poor Local Minima}},\n\tauthor       = {{Kawaguchi}, K.},\n\tyear         = 2016,\n\tmonth        = may,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {Proceedings of the 30th International Conference on Neural Information Processing Systems},\n\tpages        = {586--594},\n\tarchiveprefix = {arXiv},\n\teprint       = {1605.07110},\n\tprimaryclass = {stat.ML},\n\tkeywords     = {Statistics - Machine Learning, Computer Science - Learning, Mathematics - Optimization and Control},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2016arXiv160507110K},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{kawahara2009change,\n\ttitle        = {Change-Point Detection in Time-Series Data by Direct Density-Ratio Estimation},\n\tauthor       = {Yoshinobu Kawahara and Masashi Sugiyama},\n\tyear         = 2009,\n\tjournal      = {SDM},\n\tvolume       = 9,\n\tpages        = {389--400}\n}\n@book{kay86algorithm,\n\ttitle        = {Algorithm Schemata and Data Structures in Syntactic Processing},\n\tauthor       = {Martin Kay},\n\tyear         = 1986,\n\tpublisher    = {Readings in Natural Language Processing},\n\tpages        = {35--70}\n}\n@article{kazemi2017show,\n\ttitle        = 
{Show, Ask, Attend, and Answer: A Strong Baseline For Visual Question Answering},\n\tauthor       = {Vahid Kazemi and Ali Elqursh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.03162}\n}\n@inproceedings{kb13,\n\ttitle        = {Recurrent continuous translation models},\n\tauthor       = {Kalchbrenner, Nal and Blunsom, Phil},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing},\n\tpages        = {1700--1709}\n}\n@inproceedings{kbdjk17,\n\ttitle        = {Reluplex: An efficient SMT solver for verifying deep neural networks},\n\tauthor       = {Katz, Guy and Barrett, Clark and Dill, David L and Julian, Kyle and Kochenderfer, Mykel J},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Aided Verification (CAV)},\n\tpages        = {97--117},\n\torganization = {Springer}\n}\n@article{ke2003efficient,\n\ttitle        = {Efficient selective screening of haplotype tag SNPs},\n\tauthor       = {Ke, Xiayi and Cardon, Lon R},\n\tyear         = 2003,\n\tjournal      = {Bioinformatics},\n\tpublisher    = {Oxford Univ Press},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {287--288}\n}\n@article{Kearns,\n\ttitle        = {Efficient noise-tolerant learning from statistical queries},\n\tauthor       = {Kearns, Michael},\n\tyear         = 1998,\n\tmonth        = nov,\n\tjournal      = {J. ACM},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 45,\n\tnumber       = 6,\n\tpages        = {983--1006},\n\tissn         = {0004-5411},\n\tissue_date   = {Nov. 
1998},\n\tnumpages     = 24,\n\tkeywords     = {computational learning theory, machine learning}\n}\n@article{kearns1993learning,\n\ttitle        = {Learning in the presence of malicious errors},\n\tauthor       = {Michael Kearns and Ming Li},\n\tyear         = 1993,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 22,\n\tnumber       = 4,\n\tpages        = {807--837}\n}\n@inproceedings{kearns1998near,\n\ttitle        = {Near-Optimal Reinforcement Learning in Polynomial Time},\n\tauthor       = {Kearns, Michael J and Singh, Satinder P},\n\tyear         = 1998,\n\tbooktitle    = {Proceedings of the Fifteenth International Conference on Machine Learning},\n\tpages        = {260--268}\n}\n@inproceedings{kearns1999efficient,\n\ttitle        = {Efficient reinforcement learning in factored MDPs},\n\tauthor       = {Kearns, Michael and Koller, Daphne},\n\tyear         = 1999,\n\tbooktitle    = {IJCAI},\n\tvolume       = 16,\n\tpages        = {740--747}\n}\n@inproceedings{kearns1999finite,\n\ttitle        = {Finite-sample convergence rates for {Q}-learning and indirect algorithms},\n\tauthor       = {Kearns, Michael J and Singh, Satinder P},\n\tyear         = 1999,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {996--1002}\n}\n@inproceedings{kearns2000approximate,\n\ttitle        = {Approximate planning in large {POMDPs} via reusable trajectories},\n\tauthor       = {Kearns, Michael J and Mansour, Yishay and Ng, Andrew Y},\n\tyear         = 2000,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1001--1007}\n}\n@article{kearns2002near,\n\ttitle        = {Near-optimal reinforcement learning in polynomial time},\n\tauthor       = {Kearns, Michael and Singh, Satinder},\n\tyear         = 2002,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 49,\n\tnumber       = {2-3},\n\tpages        = {209--232}\n}\n@article{kearns2002sparse,\n\ttitle        
= {A sparse sampling algorithm for near-optimal planning in large Markov decision processes},\n\tauthor       = {Kearns, Michael and Mansour, Yishay and Ng, Andrew Y},\n\tyear         = 2002,\n\tjournal      = {Machine Learning},\n\tpublisher    = {Springer},\n\tvolume       = 49,\n\tnumber       = {2-3},\n\tpages        = {193--208}\n}\n@article{kearns2017preventing,\n\ttitle        = {Preventing fairness gerrymandering: Auditing and learning for subgroup fairness},\n\tauthor       = {Kearns, Michael and Neel, Seth and Roth, Aaron and Wu, Zhiwei Steven},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.05144}\n}\n@inproceedings{kearns2018preventing,\n\ttitle        = {Preventing fairness gerrymandering: Auditing and learning for subgroup fairness},\n\tauthor       = {Michael Kearns and Seth Neel and Aaron Roth and Zhiwei Steven Wu},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2564--2572}\n}\n@article{kearns2019average,\n\ttitle        = {Average Individual Fairness: Algorithms, Generalization and Experiments},\n\tauthor       = {Kearns, Michael and Roth, Aaron and Sharifi-Malvajerdi, Saeed},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.10607}\n}\n@article{keilwagen2019accurate,\n\ttitle        = {Accurate prediction of cell type-specific transcription factor binding},\n\tauthor       = {Jens Keilwagen and Stefan Posch and Jan Grau},\n\tyear         = 2019,\n\tjournal      = {Genome biology},\n\tvolume       = 20,\n\tnumber       = 1\n}\n@inproceedings{keizer2017negotiation,\n\ttitle        = {Evaluating Persuasion Strategies and Deep Reinforcement Learning Methods for Negotiation Dialogue Agents},\n\tauthor       = {Simon Keizer and Markus Guhe and Heriberto Cuayahuitl and Ioannis Efstathiou and Klaus-Peter Engelbrecht and Mihai Dobre and Alex Lascarides and Oliver Lemon},\n\tyear         = 2017,\n\tbooktitle    = {European Association for 
Computational Linguistics (EACL)}\n}\n@article{kelley2016basset,\n\ttitle        = {Basset: learning the regulatory code of the accessible genome with deep convolutional neural networks},\n\tauthor       = {David R Kelley and Jasper Snoek and John L Rinn},\n\tyear         = 2016,\n\tjournal      = {Genome research},\n\tvolume       = 26,\n\tnumber       = 7,\n\tpages        = {990--999}\n}\n@article{kellgren1957radiological,\n\ttitle        = {Radiological assessment of osteo-arthrosis},\n\tauthor       = {JH Kellgren and JS Lawrence},\n\tyear         = 1957,\n\tjournal      = {Annals of the Rheumatic Diseases},\n\tvolume       = 16,\n\tnumber       = 4\n}\n@inproceedings{kemp2007learning,\n\ttitle        = {Learning and using relational theories},\n\tauthor       = {Charles Kemp and Noah Goodman and Joshua B Tenenbaum},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {753--760}\n}\n@inproceedings{kendall2017uncertainties,\n\ttitle        = {What uncertainties do we need in {B}ayesian deep learning for computer vision?},\n\tauthor       = {Alex Kendall and Yarin Gal},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {5574--5584}\n}\n@book{kenney2013mathematics,\n\ttitle        = {Mathematics of statistics},\n\tauthor       = {John Francis Kenney},\n\tyear         = 2013,\n\tpublisher    = {D. 
Van Nostrand Company Inc; Toronto; Princeton; New Jersey; London; New York,; Affiliated East-West Press Pvt-Ltd; New Delhi}\n}\n@inproceedings{keogh2001locally,\n\ttitle        = {\n\t\tLocally adaptive dimensionality reduction for indexing large time\n\n\t\tseries databases\n\t},\n\tauthor       = {\n\t\tKeogh, Eamonn and Chakrabarti, Kaushik and Pazzani, Michael and Mehrotra,\n\n\t\tSharad\n\t},\n\tyear         = 2001,\n\tbooktitle    = {\n\t\tProceedings of the 2001 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Santa Barbara, California, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '01},\n\tpages        = {151--162},\n\tdoi          = {http://doi.acm.org/10.1145/375663.375680},\n\tisbn         = {1-58113-332-4},\n\tacmid        = 375680,\n\tkeywords     = {content-based retrieval, dimensionality reduction, indexing},\n\tnumpages     = 12\n}\n@inproceedings{keogh2002exact,\n\ttitle        = {Exact indexing of dynamic time warping},\n\tauthor       = {Keogh, Eamonn},\n\tyear         = 2002,\n\tbooktitle    = {\n\t\tProceedings of the 28th international conference on Very Large Data\n\n\t\tBases\n\t},\n\tlocation     = {Hong Kong, China},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '02},\n\tpages        = {406--417},\n\tacmid        = 1287405,\n\tnumpages     = 12\n}\n@inproceedings{keogh2004indexing,\n\ttitle        = {Indexing large human-motion databases},\n\tauthor       = {\n\t\tKeogh, Eamonn and Palpanas, Themistoklis and Zordan, Victor B. 
and\n\n\t\tGunopulos, Dimitrios and Cardle, Marc\n\t},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tProceedings of the Thirtieth international conference on Very large\n\n\t\tdata bases - Volume 30\n\t},\n\tlocation     = {Toronto, Canada},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '04},\n\tpages        = {780--791},\n\tisbn         = {0-12-088469-0},\n\tacmid        = 1316757,\n\tkeywords     = {animation, indexing, motion capture, time series},\n\tnumpages     = 12\n}\n@article{keramati2018strategic,\n\ttitle        = {Strategic Object Oriented Reinforcement Learning},\n\tauthor       = {R. Keramati and J. Whang and P. Cho and E. Brunskill},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.00175}\n}\n@article{kerckhoffs1883security,\n\ttitle        = {La cryptographie militaire},\n\tauthor       = {Auguste Kerckhoffs},\n\tyear         = 1883,\n\tjournal      = {Journal des sciences militaires},\n\tvolume       = 9\n}\n@incollection{KernelDeep,\n\ttitle        = {Kernel Methods for Deep Learning},\n\tauthor       = {Youngmin Cho and Lawrence Saul},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems 22},\n\tpages        = {342--350},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.26}\n}\n@book{kerr2001university,\n\ttitle        = {The Uses of the University},\n\tauthor       = {Clark Kerr},\n\tyear         = 2001,\n\tpublisher    = {Harvard University Press}\n}\n@article{keshavan2010matrix,\n\ttitle        = {Matrix Completion From a Few Entries},\n\tauthor       = {Keshavan, Raghunandan H and Montanari, Andrea and Oh, Sewoong},\n\tyear         = 2010,\n\tjournal      = {Information Theory, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 56,\n\tnumber       = 6,\n\tpages        = {2980--2998}\n}\n@article{keshavan2010matrixnoisy,\n\ttitle        = {Matrix completion from noisy entries},\n\tauthor       = {Keshavan, Raghunandan H and Montanari, Andrea and 
Oh, Sewoong},\n\tyear         = 2010,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 11,\n\tpages        = {2057--2078}\n}\n@article{keskar2016large,\n\ttitle        = {On large-batch training for deep learning: Generalization gap and sharp minima},\n\tauthor       = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.04836}\n}\n@article{keskar2017improving,\n\ttitle        = {Improving generalization performance by switching from adam to sgd},\n\tauthor       = {Keskar, Nitish Shirish and Socher, Richard},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.07628}\n}\n@article{kesten1966additional,\n\ttitle        = {Additional limit theorems for indecomposable multidimensional {G}alton-{W}atson processes},\n\tauthor       = {Harry Kesten and Bernt P. Stigum},\n\tyear         = 1966,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tvolume       = 37,\n\tnumber       = 6,\n\tpages        = {1463--1481}\n}\n@article{kesten1966limit,\n\ttitle        = {A limit theorem for multidimensional {G}alton-{W}atson processes},\n\tauthor       = {Harry Kesten and Bernt P. 
Stigum},\n\tyear         = 1966,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tvolume       = 37,\n\tnumber       = 5,\n\tpages        = {1211--1223}\n}\n@article{khachiyan1993complexity,\n\ttitle        = {On the complexity of approximating the maximal inscribed ellipsoid for a polytope},\n\tauthor       = {Khachiyan, Leonid G and Todd, Michael J},\n\tyear         = 1993,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 61,\n\tnumber       = 1,\n\tpages        = {137--159}\n}\n@article{khachiyan1996rounding,\n\ttitle        = {Rounding of polytopes in the real number model of computation},\n\tauthor       = {Khachiyan, Leonid G.},\n\tyear         = 1996,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {307--320}\n}\n@inproceedings{khadka2018evolution,\n\ttitle        = {Evolution-guided policy gradient in reinforcement learning},\n\tauthor       = {Khadka, Shauharda and Tumer, Kagan},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},\n\tpages        = {1196--1208}\n}\n@article{khan2001diagnosis,\n\ttitle        = {Classification and diagnostic prediction of cancers using gene expression profiling and artificial neural networks},\n\tauthor       = {Javed Khan and Jun S. Wei and Markus Ringnér and Lao H. Saal and Marc Ladanyi and Frank Westermann and Frank Berthold and Manfred Schwab and Cristina R. Antonescu and Carsten Peterson and Paul S. 
Meltzer},\n\tyear         = 2001,\n\tjournal      = {Nature Medicine}\n}\n@article{khandelwal2018sharp,\n\ttitle        = {Sharp nearby, fuzzy far away: How neural language models use context},\n\tauthor       = {Urvashi Khandelwal and He He and Peng Qi and Dan Jurafsky},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.04623}\n}\n@inproceedings{khani2016unanimity,\n\ttitle        = {Unanimous Prediction for 100\\% Precision with Application to Learning Semantic Mappings},\n\tauthor       = {Fereshte Khani and Martin Rinard and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{khani2018pip,\n\ttitle        = {Planning, Inference and Pragmatics in Sequential Language Games},\n\tauthor       = {Fereshte Khani and Noah D. Goodman and Percy Liang},\n\tyear         = 2018,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 6\n}\n@article{khani2019mwld,\n\ttitle        = {Maximum Weighted Loss Discrepancy},\n\tauthor       = {Fereshte Khani and Aditi Raghunathan and Percy Liang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.03518}\n}\n@inproceedings{khani2020noise,\n\ttitle        = {Feature Noise Induces Loss Discrepancy Across Groups},\n\tauthor       = {Fereshte Khani and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{khani2021removing,\n\ttitle        = {Removing Spurious Features can Hurt Accuracy and Affect Groups Disproportionately},\n\tauthor       = {Fereshte Khani and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {ACM Conference on Fairness, Accountability, and Transparency (FAccT)}\n}\n@inproceedings{khanna2013parallel,\n\ttitle        = {Parallel matrix factorization for binary response},\n\tauthor       = {Khanna, Rajiv and Zhang, Liang and Agarwal, Deepak and Chen, Bee-chung},\n\tyear         = 
2013,\n\tbooktitle    = {Big Data, 2013 IEEE International Conference on},\n\tpages        = {430--438},\n\torganization = {IEEE}\n}\n@inproceedings{khanna2019interpreting,\n\ttitle        = {Interpreting Black Box Predictions using {Fisher} Kernels},\n\tauthor       = {Rajiv Khanna and Been Kim and Joydeep Ghosh and Oluwasanmi Koyejo},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {3382--3390}\n}\n@inproceedings{khanpour2016dialogue,\n\ttitle        = {Dialogue Act Classification in Domain-Independent Conversations Using a Deep Recurrent Neural Network},\n\tauthor       = {Hamed Khanpour and Nishitha Guntakandla and Rodney D. Nielsen},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{khatib1987osc,\n\ttitle        = {A unified approach for motion and force control of robot manipulators: The operational space formulation},\n\tauthor       = {Oussama Khatib},\n\tyear         = 1987,\n\tjournal      = {IEEE Journal on Robotics and Automation},\n\tvolume       = 3,\n\tpages        = {43--53}\n}\n@inproceedings{khattab2020colbert,\n\ttitle        = {{ColBERT}: Efficient and Effective Passage Search via Contextualized Late Interaction over {BERT}},\n\tauthor       = {Omar Khattab and Matei Zaharia},\n\tyear         = 2020,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@article{khim2018adversarial,\n\ttitle        = {Adversarial risk bounds for binary classification via function transformation},\n\tauthor       = {Justin Khim and Po-Ling Loh},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.09519}\n}\n@article{khintchine1923uber,\n\ttitle        = {{\\\"U}ber dyadische Br{\\\"u}che},\n\tauthor       = {Aleksandr Khintchine},\n\tyear         = 1923,\n\tjournal      = {Mathematische Zeitschrift},\n\tvolume       = 18,\n\tpages        = 
{109--116}\n}\n@inproceedings{khosla2012undoing,\n\ttitle        = {Undoing the damage of dataset bias},\n\tauthor       = {Aditya Khosla and Tinghui Zhou and Tomasz Malisiewicz and Alexei A Efros and Antonio Torralba},\n\tyear         = 2012,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {158--171}\n}\n@inproceedings{khot08approximate,\n\ttitle        = {Approximate kernel clustering},\n\tauthor       = {S. Khot and A. Naor},\n\tyear         = 2008,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@inproceedings{khot10sharp,\n\ttitle        = {Sharp kernel clustering algorithms and their associated {G}rothendieck inequalities},\n\tauthor       = {S. Khot and A. Naor},\n\tyear         = 2010,\n\tbooktitle    = {Symposium on Discrete Algorithms (SODA)}\n}\n@article{kidambi2020morel,\n\ttitle        = {Morel: Model-based offline reinforcement learning},\n\tauthor       = {Kidambi, Rahul and Rajeswaran, Aravind and Netrapalli, Praneeth and Joachims, Thorsten},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.05951}\n}\n@inproceedings{kiddon2011coarse,\n\ttitle        = {Coarse-to-Fine Inference and Learning for First-Order Probabilistic Models},\n\tauthor       = {Chlo{\\'e} Kiddon and Pedro Domingos},\n\tyear         = 2011,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{kiddon2016globally,\n\ttitle        = {Globally Coherent Text Generation with Neural Checklist Models},\n\tauthor       = {Chlo{\\'e} Kiddon and Luke S. 
Zettlemoyer and Yejin Choi},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{kilbertus2017avoiding,\n\ttitle        = {Avoiding discrimination through causal reasoning},\n\tauthor       = {Kilbertus, Niki and Carulla, Mateo Rojas and Parascandolo, Giambattista and Hardt, Moritz and Janzing, Dominik and Sch{\\\"o}lkopf, Bernhard},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {656--666}\n}\n@article{kilgarriff1997don,\n\ttitle        = {I don’t believe in word senses},\n\tauthor       = {Kilgarriff, Adam},\n\tyear         = 1997,\n\tjournal      = {Computers and the Humanities}\n}\n@article{kilgarriff1997wordsense,\n\ttitle        = {{I} Don’t Believe in Word Senses},\n\tauthor       = {A. Kilgarriff},\n\tyear         = 1997,\n\tjournal      = {Computers and the Humanities}\n}\n@inproceedings{kim2009weighted,\n\ttitle        = {Weighted nonnegative matrix factorization},\n\tauthor       = {Kim, Yong-Deok and Choi, Seungjin},\n\tyear         = 2009,\n\tbooktitle    = {Acoustics, Speech and Signal Processing, 2009. ICASSP 2009. 
IEEE International Conference on},\n\tpages        = {1541--1544},\n\torganization = {IEEE}\n}\n@article{kim2011fast,\n\ttitle        = {Fast nonnegative matrix factorization: An active-set-like method and comparisons},\n\tauthor       = {Kim, Jingu and Park, Haesun},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Scientific Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 33,\n\tnumber       = 6,\n\tpages        = {3261--3281}\n}\n@inproceedings{kim2011overview,\n\ttitle        = {Overview of BioNLP shared task 2011},\n\tauthor       = {Jin-Dong Kim and Sampo Pyysalo and Tomoko Ohta and Robert Bossy and Ngan Nguyen and Jun'ichi Tsujii},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the BioNLP Shared Task 2011 Workshop}\n}\n@inproceedings{kim2012unsupervised,\n\ttitle        = {Unsupervised {PCFG} induction for grounded language learning with highly ambiguous supervision},\n\tauthor       = {J. Kim and R. Mooney},\n\tyear         = 2012,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {433--444}\n}\n@inproceedings{kim2013learning,\n\ttitle        = {Learning from limited demonstrations},\n\tauthor       = {Beomjoon Kim and Amir massoud Farahmand and Joelle Pineau and Doina Precup},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2859--2867}\n}\n@article{kim2014algorithms,\n\ttitle        = {Algorithms for nonnegative matrix and tensor factorizations: A unified view based on block coordinate descent framework},\n\tauthor       = {Kim, Jingu and He, Yunlong and Park, Haesun},\n\tyear         = 2014,\n\tjournal      = {Journal of Global Optimization},\n\tpublisher    = {Springer},\n\tvolume       = 58,\n\tnumber       = 2,\n\tpages        = {285--319}\n}\n@article{kim2015character,\n\ttitle        = {Character-aware neural language models},\n\tauthor       = {Yoon Kim and Yacine Jernite and David Sontag and Alexander M 
Rush},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1508.06615}\n}\n@inproceedings{kim2016analogies,\n\ttitle        = {Generating Personalized Spatial Analogies for Distances and Areas},\n\tauthor       = {Yea-seul Kim and Jessica Hullman and  Maneesh Agarwala},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@inproceedings{kim2016examples,\n\ttitle        = {Examples are not enough, learn to criticize! Criticism for Interpretability},\n\tauthor       = {Been Kim and Oluwasanmi O Koyejo and Rajiv Khanna},\n\tyear         = 2016,\n\tbooktitle    = {Advances In Neural Information Processing Systems},\n\tpages        = {2280--2288}\n}\n@article{kim2016multiresolution,\n\ttitle        = {Incorporating Spatial Context and Fine-grained Detail from Satellite Imagery to Predict Poverty},\n\tauthor       = {Jae Hyun Kim and Michael Xie and Neal Jean and Stefano Ermon},\n\tyear         = 2016,\n\tjournal      = {Stanford University}\n}\n@article{kim2018fairness,\n\ttitle        = {Fairness Through Computationally-Bounded Awareness},\n\tauthor       = {Michael P Kim and Omer Reingold and Guy N Rothblum},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.03239}\n}\n@inproceedings{kim2018interpretability,\n\ttitle        = {Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors (TCAV)},\n\tauthor       = {Been Kim and Martin Wattenberg and Justin Gilmer and Carrie Cai and James Wexler and Fernanda Viegas and others},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2668--2677}\n}\n@inproceedings{kim2019multiaccuracy,\n\ttitle        = {Multiaccuracy: Black-box post-processing for fairness in classification},\n\tauthor       = {Kim, Michael P and Ghorbani, Amirata and Zou, James},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 2019 AAAI/ACM Conference on AI, 
Ethics, and Society},\n\tpages        = {247--254}\n}\n@article{kim2020cogs,\n\ttitle        = {COGS: A Compositional Generalization Challenge Based on Semantic Interpretation},\n\tauthor       = {Najoung Kim and Tal Linzen},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.05465}\n}\n@article{kim2020pre,\n\ttitle        = {Are pre-trained language models aware of phrases? simple but strong baselines for grammar induction},\n\tauthor       = {Kim, Taeuk and Choi, Jihun and Edmiston, Daniel and Lee, Sang-goo},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.00737}\n}\n@inproceedings{kim2021policy,\n\ttitle        = {A policy gradient algorithm for learning to learn in multiagent reinforcement learning},\n\tauthor       = {Kim, Dong Ki and Liu, Miao and Riemer, Matthew D and Sun, Chuangchuang and Abdulhai, Marwa and Habibi, Golnaz and Lopez-Cot, Sebastian and Tesauro, Gerald and How, Jonathan},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {5541--5550},\n\torganization = {PMLR}\n}\n@inproceedings{kim2021vilt,\n\ttitle        = {{ViLT}: Vision-and-language transformer without convolution or region supervision},\n\tauthor       = {Wonjae Kim and Bokyung Son and Ildoo Kim},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@book{king2013solution,\n\ttitle        = {A Solution to the Ecological Inference Problem: Reconstructing Individual Behavior from Aggregate Data},\n\tauthor       = {Gary King},\n\tyear         = 2013,\n\tpublisher    = {Princeton University Press}\n}\n@inproceedings{kinga2015method,\n\ttitle        = {A method for stochastic optimization},\n\tauthor       = {Diederik P. 
Kingma and Jimmy Ba},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)},\n\tvolume       = 5\n}\n@article{kingma2013auto,\n\ttitle        = {Auto-encoding variational bayes},\n\tauthor       = {Kingma, Diederik P and Welling, Max},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.6114}\n}\n@article{kingma2014adam,\n\ttitle        = {Adam: A method for stochastic optimization},\n\tauthor       = {Kingma, Diederik P and Ba, Jimmy},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.6980}\n}\n@article{kingma2014variational,\n\ttitle        = {Auto-Encoding Variational {B}ayes},\n\tauthor       = {Diederik P. Kingma and Max Welling},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1312.6114}\n}\n@inproceedings{kingma2015adam,\n\ttitle        = {Adam: A method for stochastic optimization},\n\tauthor       = {Diederik Kingma and Jimmy Ba},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{kipf2017semi,\n\ttitle        = {Semi-Supervised Classification with Graph Convolutional Networks},\n\tauthor       = {Thomas N. 
Kipf and Max Welling},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{kipfer2013roget,\n\ttitle        = {Roget's 21st Century Thesaurus, Third Edition},\n\tauthor       = {Barbara Ann Kipfer},\n\tyear         = 2013,\n\tpublisher    = {Random House Publishing Group}\n}\n@article{kiranyaz2019cnn1d,\n\ttitle        = {1D Convolutional Neural Networks and Applications: A Survey},\n\tauthor       = {Serkan Kiranyaz and Onur Avci and Osama Abdeljaber and Turker Ince and Moncef Gabbouj and Daniel J Inman},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.03554}\n}\n@inproceedings{kirk2005skeletal,\n\ttitle        = {Skeletal Parameter Estimation from Optical Motion Capture Data},\n\tauthor       = {Adam G. Kirk and James F. O'Brien and David A. Forsyth},\n\tyear         = 2005,\n\tmonth        = jun,\n\tbooktitle    = {\n\t\tIEEE Conference on Computer Vision and Pattern Recognition (CVPR)\n\n\t\t2005\n\t},\n\tpages        = {782--788},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{kirk2014controlled,\n\ttitle        = {Controlled Natural Languages for language generation in artificial cognition},\n\tauthor       = {N. H. Kirk and D. Nyga and M. 
Beetz},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {6667--6672}\n}\n@inproceedings{kirkpatrick10painless,\n\ttitle        = {Painless Unsupervised Learning with Features},\n\tauthor       = {Taylor Berg-Kirkpatrick and Alexandre Bouchard-C\\^ot\\'e and John DeNero and Dan Klein},\n\tyear         = 2010,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{kirkpatrick1983optimization,\n\ttitle        = {Optimization by simulated annealing},\n\tauthor       = {Kirkpatrick, Scott and Gelatt, C Daniel and Vecchi, Mario P},\n\tyear         = 1983,\n\tjournal      = {science},\n\tpublisher    = {American association for the advancement of science},\n\tvolume       = 220,\n\tnumber       = 4598,\n\tpages        = {671--680}\n}\n@inproceedings{kirkpatrick2012significance,\n\ttitle        = {An empirical investigation of statistical significance in {NLP}},\n\tauthor       = {Taylor Berg-Kirkpatrick and David Burkett and Dan Klein},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {995--1005}\n}\n@article{kirkpatrick2017overcoming,\n\ttitle        = {Overcoming catastrophic forgetting in neural networks},\n\tauthor       = {J. Kirkpatrick and R. Pascanu and N. Rabinowitz and J. Veness and G. Desjardins and A. A. Rusu and K. Milan and J. Quan and T. Ramalho and A. Grabska-Barwinska and others},\n\tyear         = 2017,\n\tjournal      = {Proceedings of the national academy of sciences}\n}\n@article{Kirkpatrick83optimizationby,\n\ttitle        = {Optimization by simulated annealing},\n\tauthor       = {S. Kirkpatrick and C. D. Gelatt and M. P. 
Vecchi},\n\tyear         = 1983,\n\tjournal      = {SCIENCE},\n\tvolume       = 220,\n\tnumber       = 4598,\n\tpages        = {671--680}\n}\n@inproceedings{kiros2015skip,\n\ttitle        = {Skip-Thought Vectors},\n\tauthor       = {Ryan Kiros and Yukun Zhu and Ruslan Salakhutdinov and Richard S. Zemel and Raquel Urtasun and Antonio Torralba and Sanja Fidler},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@techreport{kishida2005gap,\n\ttitle        = {Property of Average Precision and its Generalization: An Examination of Evaluation Indicator for Information Retrieval Experiments},\n\tauthor       = {Kazuaki Kishida},\n\tyear         = 2005,\n\tinstitution  = {National Institute of Informatics}\n}\n@book{kittay1990metaphor,\n\ttitle        = {Metaphor: Its cognitive force and linguistic structure},\n\tauthor       = {Kittay, Eva Feder},\n\tyear         = 1990\n}\n@book{kittredge1982sublanguage,\n\ttitle        = {Sublanguage: Studies of Language in Restricted Semantic Domains},\n\tauthor       = {R. Kittredge and J. Lehrberger},\n\tyear         = 1982,\n\tpublisher    = {B. Blackwell}\n}\n@article{kivinen1997exponentiated,\n\ttitle        = {Exponentiated gradient versus gradient descent for linear predictors},\n\tauthor       = {Kivinen, Jyrki and Warmuth, Manfred K},\n\tyear         = 1997,\n\tjournal      = {Information and Computation},\n\tpublisher    = {Elsevier},\n\tvolume       = 132,\n\tnumber       = 1,\n\tpages        = {1--63},\n\tdoi          = {10.1006/inco.1996.2612},\n\tissn         = {0890-5401},\n\tfjournal     = {Information and Computation},\n\tmrclass      = {68T05 (68Q99)},\n\tmrnumber     = 1429254,\n\tmrreviewer   = {Peter Auer}\n}\n@article{Kivinen95exponentiatedgradient,\n\ttitle        = {Exponentiated Gradient Versus Gradient Descent for Linear Predictors},\n\tauthor       = {Jyrki Kivinen and Manfred K. Warmuth},\n\tyear         = 1995,\n\tjournal      = {Inform. 
and Comput.},\n\tvolume       = 132,\n\tfjournal     = {Information and Computation}\n}\n@article{kjellstrom2011visual,\n\ttitle        = {Visual object-action recognition: Inferring object affordances from human demonstration},\n\tauthor       = {H. Kjellstr{\\\"o}m and J. Romero and D. Kragi{\\'c}},\n\tyear         = 2011,\n\tjournal      = {Computer Vision and Image Understanding},\n\tvolume       = 115,\n\tnumber       = 1,\n\tpages        = {81--90}\n}\n@article{kleijn2012bernstein,\n\ttitle        = {The {Bernstein}-von Mises theorem under misspecification},\n\tauthor       = {B.J.K. Kleijn and A.W. van der Vaart},\n\tyear         = 2012,\n\tjournal      = {Electronic Journal of Statistics},\n\tvolume       = 6\n}\n@inproceedings{klein02conditional,\n\ttitle        = {Conditional structure versus conditional estimation in {NLP} models},\n\tauthor       = {Dan Klein and Christopher D. Manning},\n\tyear         = 2002,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{klein03unlexicalized,\n\ttitle        = {Accurate Unlexicalized Parsing},\n\tauthor       = {D. Klein and C. Manning},\n\tyear         = 2003,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {423--430}\n}\n@inproceedings{klein04induction,\n\ttitle        = {Corpus-Based Induction of Syntactic Structure: Models of Dependency and Constituency},\n\tauthor       = {Dan Klein and Christopher D. Manning},\n\tyear         = 2004,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {478--485}\n}\n@inproceedings{klein2001parsing,\n\ttitle        = {Parsing with Treebank Grammars: Empirical Bounds, Theoretical Models, and the Structure of the {P}enn Treebank},\n\tauthor       = {Dan Klein  and  Christopher D. 
Manning},\n\tyear         = 2001,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {338--345}\n}\n@inproceedings{klein2002fast,\n\ttitle        = {Fast exact inference with a factored model for natural language parsing},\n\tauthor       = {Dan Klein and Christopher D Manning},\n\tyear         = 2002,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{klein2003fast,\n\ttitle        = {{A*} Parsing: Fast Exact Viterbi Parse Selection},\n\tauthor       = {Dan Klein and Christopher Manning},\n\tyear         = 2003,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@phdthesis{klein2005thesis,\n\ttitle        = {The Unsupervised Learning of Natural Language Structure},\n\tauthor       = {Dan Klein},\n\tyear         = 2005,\n\tschool       = {Stanford University}\n}\n@article{klein2017opennmt,\n\ttitle        = {Open{NMT}: Open-Source Toolkit for Neural Machine Translation},\n\tauthor       = {Guillaume Klein and Yoon Kim and Yuntian Deng and Jean Senellart and Alexander M. Rush},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1701.02810}\n}\n@inproceedings{kleinberg1997two,\n\ttitle        = {Two algorithms for nearest-neighbor search in high dimensions},\n\tauthor       = {Jon M Kleinberg},\n\tyear         = 1997,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {599--608}\n}\n@inproceedings{Kleinberg2003,\n\ttitle        = {{The value of knowing a demand curve: bounds on regret for online posted-price auctions}},\n\tauthor       = {Kleinberg, R. and Leighton, T.},\n\tyear         = 2003,\n\tbooktitle    = {44th Annual IEEE Symposium on Foundations of Computer Science, 2003. Proceedings.},\n\tpublisher    = {IEEE Computer. 
Soc},\n\tnumber       = {Focs 2003},\n\tpages        = {594--605},\n\tdoi          = {10.1109/SFCS.2003.1238232},\n\tisbn         = {0-7695-2040-5},\n\tissn         = {0272-5428},\n\tabstract     = {We consider price-setting algorithms for a simple market in which a seller has an unlimited supply of identical copies of some good, and interacts sequentially with a pool of n buyers, each of whom wants at most one copy of the good. In each transaction, the seller offers a price between 0 and 1, and the buyer decides whether or not to buy, by comparing the offered price to his privately-held valuation for the good. The price offered to a given buyer may be influenced by the outcomes of prior transactions, but each individual buyer participates only once. In this setting, what is the value of knowing the demand curve? In other words, how much revenue can an uninformed seller expect to obtain, relative to a seller with prior information about the buyers' valuations? The answer depends on how the buyers' valuations are modeled. 
We analyze three cases - identical, random, and worst-case valuations - in each case deriving upper and lower bounds which match within a sublogarithmic factor.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Unknown - Unknown - No Title(2).pdf:pdf},\n\tmendeley-groups = {Operation Research}\n}\n@article{kleinberg2016inherent,\n\ttitle        = {Inherent trade-offs in the fair determination of risk scores},\n\tauthor       = {Kleinberg, Jon and Mullainathan, Sendhil and Raghavan, Manish},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.05807}\n}\n@inproceedings{kleinberg2017,\n\ttitle        = {Inherent Trade-offs in the Fair Determination of Risk Scores},\n\tauthor       = {Jon Kleinberg and Sendhil Mullainathan and Manish Raghavan},\n\tyear         = 2017,\n\tbooktitle    = {Innovations in Theoretical Computer Science (ITCS)}\n}\n@inproceedings{kleinberg2019simplicity,\n\ttitle        = {Simplicity creates inequity: implications for fairness, stereotypes, and interpretability},\n\tauthor       = {Jon Kleinberg and Sendhil Mullainathan},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 2019 ACM Conference on Economics and Computation},\n\tpages        = {807--808}\n}\n@article{KleinbergS08,\n\ttitle        = {Using mixture models for collaborative filtering},\n\tauthor       = {Jon M. Kleinberg and Mark Sandler},\n\tyear         = 2008,\n\tjournal      = {J. Comput. Syst. 
Sci.},\n\tvolume       = 74,\n\tnumber       = 1,\n\tpages        = {49--69},\n\tdoi          = {10.1016/j.jcss.2007.04.013},\n\turl          = {http://dx.doi.org/10.1016/j.jcss.2007.04.013},\n\ttimestamp    = {Wed, 05 Mar 2008 11:35:52 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/jcss/KleinbergS08},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{kleinman1968design,\n\ttitle        = {The design of suboptimal linear time-varying systems},\n\tauthor       = {D. Kleinman and M. Athans},\n\tyear         = 1968,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tvolume       = 13,\n\tpages        = {150--159}\n}\n@incollection{KleinYoung99,\n\ttitle        = {On the Number of Iterations for Dantzig-Wolfe Optimization and Packing-Covering Approximation Algorithms},\n\tauthor       = {Klein, Philip and Young, Neal},\n\tyear         = 1999,\n\tbooktitle    = {Integer Programming and Combinatorial Optimization},\n\tpublisher    = {Springer Berlin Heidelberg},\n\tseries       = {Lecture Notes in Computer Science},\n\tvolume       = 1610,\n\tpages        = {320--327},\n\tdoi          = {10.1007/3-540-48777-8_24},\n\tisbn         = {978-3-540-66019-4},\n\teditor       = {Cornu\\'{e}jols, G\\'{e}rard and Burkard, Rainer E. 
and Woeginger, Gerhard J.}\n}\n@article{klemera2006new,\n\ttitle        = {A new approach to the concept and computation of biological age},\n\tauthor       = {Petr Klemera and Stanislav Doubal},\n\tyear         = 2006,\n\tjournal      = {Mechanisms of Ageing and Development},\n\tvolume       = 127,\n\tnumber       = 3,\n\tpages        = {240--248}\n}\n@inproceedings{klerke2016improving,\n\ttitle        = {Improving sentence compression by learning to predict gaze},\n\tauthor       = {Sigrid Klerke and Yoav Goldberg and Anders S{\\o}gaard},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{kliegl2010generalized,\n\ttitle        = {{Generalized DCell Structure for Load-Balanced Data Center Networks}},\n\tauthor       = {Kliegl, Markus and {Jason D. Lee} and Li, Jun and Zhang, Xinchao and Guo, Chuanxiong and Rinc{\\'o}n, David},\n\tyear         = 2010,\n\tjournal      = {IEEE Conference on Computer Communications (INFOCOM)},\n\tpages        = {1--5},\n\turl          = {http://research.microsoft.com/apps/pubs/default.aspx?id=103129}\n}\n@inproceedings{klivans2006cryptographic,\n\ttitle        = {Cryptographic hardness for learning intersections of halfspaces},\n\tauthor       = {Adam R Klivans and Alexander A Sherstov},\n\tyear         = 2006,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {553--562}\n}\n@article{klivans2009cryptographic,\n\ttitle        = {Cryptographic hardness for learning intersections of halfspaces},\n\tauthor       = {Klivans, Adam R and Sherstov, Alexander A},\n\tyear         = 2009,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = {Elsevier},\n\tvolume       = 75,\n\tnumber       = 1,\n\tpages        = {2--12},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.10.01}\n}\n@article{klivans2009learning,\n\ttitle        = {Learning halfspaces with malicious noise},\n\tauthor       = {Adam R. 
Klivans and Philip M. Long and Rocco A. Servedio},\n\tyear         = 2009,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 10,\n\tpages        = {2715--2740}\n}\n@inproceedings{KLOS2014,\n\ttitle        = {{An Almost-Linear-Time Algorithm for Approximate Max Flow in Undirected Graphs, and its Multicommodity Generalizations}},\n\tauthor       = {Kelner, Jonathan A. and Lee, Yin Tat and Orecchia, Lorenzo and Sidford, Aaron},\n\tyear         = 2014,\n\tmonth        = apr,\n\tbooktitle    = {Proceedings of the 25th Annual ACM-SIAM Symposium on Discrete Algorithms - SODA '14},\n\tseries       = {STOC '14},\n\tnumber       = 1,\n\tdoi          = {10.1137/1.9781611973402.16},\n\tabstract     = {In this paper, we introduce a new framework for approximately solving flow problems in capacitated, undirected graphs and apply it to provide asymptotically faster algorithms for the maximum \\$s\\$-\\$t\\$ flow and maximum concurrent multicommodity flow problems. For graphs with \\$n\\$ vertices and \\$m\\$ edges, it allows us to find an \\$\\backslash epsilon\\$-approximate maximum \\$s\\$-\\$t\\$ flow in time \\$O(m\\^{}\\{1+o(1)\\}\\backslash epsilon\\^{}\\{-2\\})\\$, improving on the previous best bound of \\$\\backslash tilde\\{O\\}(mn\\^{}\\{1/3\\} poly(1/\\backslash epsilon))\\$. Applying the same framework in the multicommodity setting solves a maximum concurrent multicommodity flow problem with \\$k\\$ commodities in \\$O(m\\^{}\\{1+o(1)\\}\\backslash epsilon\\^{}\\{-2\\}k\\^{}2)\\$ time, improving on the existing bound of \\$\\backslash tilde\\{O\\}(m\\^{}\\{4/3\\} poly(k,\\backslash epsilon\\^{}\\{-1\\})\\$. Our algorithms utilize several new technical tools that we believe may be of independent interest: - We give a non-Euclidean generalization of gradient descent and provide bounds on its performance. 
Using this, we show how to reduce approximate maximum flow and maximum concurrent flow to the efficient construction of oblivious routings with a low competitive ratio. - We define and provide an efficient construction of a new type of flow sparsifier. In addition to providing the standard properties of a cut sparsifier our construction allows for flows in the sparse graph to be routed (very efficiently) in the original graph with low congestion. - We give the first almost-linear-time construction of an \\$O(m\\^{}\\{o(1)\\})\\$-competitive oblivious routing scheme. No previous such algorithm ran in time better than \\$\\backslash tilde\\{\\{\\backslash Omega\\}\\}(mn)\\$. We also note that independently Jonah Sherman produced an almost linear time algorithm for maximum flow and we thank him for coordinating submissions.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1304.2338},\n\teprint       = {1304.2338},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Kelner et al. - 2014 - An Almost-Linear-Time Algorithm for Approximate Max Flow in Undirected Graphs, and its Multicommodity Generalizat.pdf:pdf},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@inproceedings{kmeans,\n\ttitle        = {Some Methods for Classification and Analysis of Multivariate Observations},\n\tauthor       = {J. B. MacQueen},\n\tyear         = 1967,\n\tbooktitle    = {Proceedings of the fifth Berkeley Symposium on Mathematical Statistics and Probability},\n\tpublisher    = {University of California Press},\n\tvolume       = 1,\n\tpages        = {281--297}\n}\n@inproceedings{KMRELS,\n\ttitle        = {Dictionary learning algorithms for sparse representation},\n\tauthor       = {K. Kreutz-Delgado and J. Murray and B. Rao, K. Engan and T. Lee and T. 
Sejnowski},\n\tyear         = 2003,\n\tbooktitle    = {Neural Computation}\n}\n@inproceedings{knepper2013ikeabot,\n\ttitle        = {IkeaBot: An autonomous multi-robot coordinated furniture assembly system},\n\tauthor       = {Ross A. Knepper and Todd Layton and John Romanishin and Daniela Rus},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {855--862}\n}\n@inproceedings{kneser1995improved,\n\ttitle        = {Improved backing-off for m-gram language modeling},\n\tauthor       = {Reinhard Kneser and Hermann Ney},\n\tyear         = 1995,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tvolume       = 1,\n\tpages        = {181--184}\n}\n@article{knight2002summarization,\n\ttitle        = {Summarization beyond sentence extraction: A probabilistic approach to sentence compression},\n\tauthor       = {Kevin Knight and Daniel Marcu},\n\tyear         = 2002,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 139,\n\tpages        = {91--107}\n}\n@inproceedings{knight2006unsupervised,\n\ttitle        = {Unsupervised analysis for decipherment problems},\n\tauthor       = {Kevin Knight and Anish Nair and Nishit Rathod and Kenji Yamada},\n\tyear         = 2006,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{kno18,\n\ttitle        = {Expressive power of recurrent neural networks},\n\tauthor       = {Valentin Khrulkov and Alexander Novikov and Ivan Oseledets},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=S1WRibb0Z}\n}\n@article{knyazev2012principal,\n\ttitle        = {Principal angles between subspaces and their tangents},\n\tauthor       = {Knyazev, Andrew V and Zhu, Peizhen},\n\tyear         = 2012,\n\tjournal      = {Arxiv preprint}\n}\n@inproceedings{ko2007probabilistic,\n\ttitle        = {A 
Probabilistic Framework for Answer Selection in Question Answering},\n\tauthor       = {Jeongwoo Ko and Luo Si and Eric Nyberg},\n\tyear         = 2007,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{kober2013reinforcement,\n\ttitle        = {Reinforcement learning in robotics: A survey},\n\tauthor       = {Kober, Jens and Bagnell, J Andrew and Peters, Jan},\n\tyear         = 2013,\n\tjournal      = {The International Journal of Robotics Research},\n\tpublisher    = {SAGE Publications Sage UK: London, England},\n\tvolume       = 32,\n\tnumber       = 11,\n\tpages        = {1238--1274}\n}\n@inproceedings{koch2015siamese,\n\ttitle        = {Siamese Neural Networks for One-Shot Image Recognition},\n\tauthor       = {Gregory R. Koch and Richard Zemel and Ruslan Salakhutdinov},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{kocijan2019winograd,\n\ttitle        = {A Surprisingly Robust Trick for the {W}inograd Schema Challenge},\n\tauthor       = {Vid Kocijan and Ana-Maria Cretu and Oana-Maria Camburu and Yordan Yordanov and Thomas Lukasiewicz},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{kocisk2016semantic,\n\ttitle        = {Semantic Parsing with Semi-Supervised Sequential Autoencoders},\n\tauthor       = {Tom{\\'a}s Kocisk{\\'y} and G{\\'a}bor Melis and Edward Grefenstette and Chris Dyer and Wang Ling and Phil Blunsom and Karl Moritz Hermann},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1078--1087}\n}\n@inproceedings{kocsis2006bandit,\n\ttitle        = {Bandit based {M}onte-{C}arlo planning},\n\tauthor       = {Levente Kocsis and Csaba Szepesv{\\'a}ri},\n\tyear         = 2006,\n\tbooktitle    = {European Conference on Machine Learning (ECML)},\n\tpages        = 
{282--293}\n}\n@inproceedings{koehn2003statistical,\n\ttitle        = {Statistical phrase-based translation},\n\tauthor       = {Philipp Koehn and Franz Josef Och and Daniel Marcu},\n\tyear         = 2003,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {48--54}\n}\n@inproceedings{koehn2007moses,\n\ttitle        = {Moses: Open source toolkit for statistical machine translation},\n\tauthor       = {Philipp Koehn and Hieu Hoang and Alexandra Birch and Chris Callison-Burch and Marcello Federico and Nicola Bertoldi and Brooke Cowan and Wade Shen and Christine Moran and Richard Zens and others},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {177--180}\n}\n@inproceedings{koehn2017six,\n\ttitle        = {Six Challenges for Neural Machine Translation},\n\tauthor       = {Philipp Koehn and Rebecca Knowles},\n\tyear         = 2017,\n\tbooktitle    = {NMT@ACL}\n}\n@article{koenecke2020racial,\n\ttitle        = {Racial disparities in automated speech recognition},\n\tauthor       = {Allison Koenecke and Andrew Nam and Emily Lake and Joe Nudell and Minnie Quartey and Zion Mengesha and Connor Toups and John R Rickford and Dan Jurafsky and Sharad Goel},\n\tyear         = 2020,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tvolume       = 117,\n\tnumber       = 14,\n\tpages        = {7684--7689}\n}\n@article{kofidis_regalia_power_convexity,\n\ttitle        = {On the best rank-1 approximation of higher-order supersymmetric tensors},\n\tauthor       = {E. Kofidis and P. A. 
Regalia},\n\tyear         = 2002,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 23,\n\tnumber       = 3,\n\tpages        = {863--884}\n}\n@inproceedings{koh2017understanding,\n\ttitle        = {Understanding Black-box Predictions via Influence Functions},\n\tauthor       = {Pang Wei Koh and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{koh2019influence,\n\ttitle        = {On the Accuracy of Influence Functions for Measuring Group Effects},\n\tauthor       = {Pang Wei Koh and Kai-Siang Ang and Hubert H. K. Teo and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{koh2019stronger,\n\ttitle        = {Stronger Data Poisoning Attacks Break Data Sanitization Defenses},\n\tauthor       = {Pang Wei Koh and Jacob Steinhardt and Percy Liang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1811.00741}\n}\n@inproceedings{koh2020bottleneck,\n\ttitle        = {Concept Bottleneck Models},\n\tauthor       = {Pang Wei Koh and Thao Nguyen and Yew Siang Tang and Stephen Mussmann and Emma Pierson and Been Kim and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{koh2021wilds,\n\ttitle        = {{WILDS}: A Benchmark of in-the-Wild Distribution Shifts},\n\tauthor       = {Pang Wei Koh and Shiori Sagawa and Henrik Marklund and Sang Michael Xie and Marvin Zhang and Akshay Balsubramani and Weihua Hu and Michihiro Yasunaga and Richard Lanas Phillips and Irena Gao and Tony Lee and Etienne David and Ian Stavness and Wei Guo and Berton A. Earnshaw and Imran S. 
Haque and Sara Beery and Jure Leskovec and Anshul Kundaje and Emma Pierson and Sergey Levine and Chelsea Finn and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{kohn2016classifications,\n\ttitle        = {Classifications in Brief: {Kellgren-Lawrence} Classification of Osteoarthritis},\n\tauthor       = {MD Kohn and AA Sassoon and ND Fernando},\n\tyear         = 2016,\n\tjournal      = {Clinical orthopaedics and related research},\n\tvolume       = 474,\n\tnumber       = 8,\n\tpages        = {1886--1893}\n}\n@article{koiran1998vapnik,\n\ttitle        = {Vapnik-Chervonenkis dimension of recurrent neural networks},\n\tauthor       = {Koiran, Pascal and Sontag, Eduardo D},\n\tyear         = 1998,\n\tjournal      = {Discrete Applied Mathematics},\n\tpublisher    = {Elsevier},\n\tvolume       = 86,\n\tnumber       = 1,\n\tpages        = {63--79}\n}\n@inproceedings{kokkalis2013emailvalet,\n\ttitle        = {EmailValet: Managing email overload through private, accountable crowdsourcing},\n\tauthor       = {Nicolas Kokkalis and Thomas K{\\\"o}hn and Carl Pfeiffer and Dima Chornyi and Michael S Bernstein and Scott R Klemmer},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Computer Supported Cooperative Work},\n\tpages        = {1291--1300}\n}\n@inproceedings{kol2016time,\n\ttitle        = {Time-Space Hardness of Learning Sparse Parities},\n\tauthor       = {Gillat Kol and Ran Raz and Avishay Tal},\n\tyear         = 2016,\n\tbooktitle    = {ECCC},\n\tvolume       = 23\n}\n@inproceedings{kol2017time,\n\ttitle        = {Time-space Hardness of Learning Sparse Parities},\n\tauthor       = {Kol, Gillat and Raz, Ran and Tal, Avishay},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 49th Annual ACM SIGACT Symposium on Theory of Computing},\n\tlocation     = {Montreal, Canada},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {STOC 2017},\n\tpages       
 = {1067--1080},\n\tdoi          = {10.1145/3055399.3055430},\n\tisbn         = {978-1-4503-4528-6},\n\turl          = {http://doi.acm.org/10.1145/3055399.3055430},\n\tacmid        = 3055430,\n\tkeywords     = {Fourier analysis, PAC learning, bounded storage cryptography, branching program, lower bounds, time-space tradeoff},\n\tnumpages     = 14\n}\n@article{kolda_survey,\n\ttitle        = {{Tensor decompositions and applications}},\n\tauthor       = {T. Kolda and B. Bader},\n\tyear         = 2009,\n\tjournal      = {SIREV},\n\tvolume       = 51,\n\tnumber       = 3,\n\tpages        = {455--500}\n}\n@article{kolda2001orthogonal,\n\ttitle        = {Orthogonal tensor decompositions},\n\tauthor       = {T. Kolda},\n\tyear         = 2001,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 23,\n\tnumber       = 1,\n\tpages        = {243--255}\n}\n@inproceedings{kolda2008scalable,\n\ttitle        = {Scalable Tensor Decompositions for Multi-aspect Data Mining},\n\tauthor       = {Kolda, T. G. and Sun, Jimeng},\n\tyear         = 2008,\n\tbooktitle    = {\n\t\tICDM '08: Proceeding of Eighth IEEE International Conference on Data\n\n\t\tMining\n\t},\n\tpages        = {363--372},\n\tdoi          = {10.1109/ICDM.2008.89},\n\tissn         = {1550-4786},\n\tabstract     = {\n\t\tModern applications such as Internet traffic, telecommunication records,\n\n\t\tand large-scale social networks generate massive amounts of data\n\n\t\twith multiple aspects and high dimensionalities. Tensors (i.e., multi-way\n\n\t\tarrays) provide a natural representation for such data. Consequently,\n\n\t\ttensor decompositions such as Tucker become important tools for summarization\n\n\t\tand analysis. One major challenge is how to deal with high-dimensional,\n\n\t\tsparse data. In other words, how do we compute decompositions of\n\n\t\ttensors where most of the entries of the tensor are zero. 
Specialized\n\n\t\ttechniques are needed for computing the Tucker decompositions for\n\n\t\tsparse tensors because standard algorithms do not account for the\n\n\t\tsparsity of the data. As a result, a surprising phenomenon is observed\n\n\t\tby practitioners: Despite the fact that there is enough memory to\n\n\t\tstore both the input tensors and the factorized output tensors, memory\n\n\t\toverflows occur during the tensor factorization process. To address\n\n\t\tthis intermediate blowup problem, we propose Memory-Efficient Tucker\n\n\t\t(MET). Based on the available memory, MET adaptively selects the\n\n\t\tright execution strategy during the decomposition. We provide quantitative\n\n\t\tand qualitative evaluation of MET on real tensors. It achieves over\n\n\t\t1000X space reduction without sacrificing speed; it also allows us\n\n\t\tto work with much larger tensors that were too big to handle before.\n\n\t\tFinally, we demonstrate a data mining case-study using MET.\n\t},\n\tkeywords     = {\n\t\tInternet, data mining, matrix decomposition, social networking (online),\n\n\t\tsparse matrices, telecommunication traffic, tensors, Internet traffic,\n\n\t\tMemory-Efficient Tucker, Tucker decompositions, intermediate blowup\n\n\t\tproblem, large-scale social networks, multiaspect data mining, scalable\n\n\t\ttensor decompositions, sparse tensors, telecommunication records,\n\n\t\ttensor decompositions, tensor factorization, Data mining, Sparse\n\n\t\tdata, Tensor Decomposition, Tucker Decomposition\n\t},\n\towner        = {leili},\n\ttimestamp    = {2010.02.05}\n}\n@article{kolda2009tensor,\n\ttitle        = {Tensor decompositions and applications},\n\tauthor       = {Kolda, T. G. and Bader, B. 
W.},\n\tyear         = 2009,\n\tjournal      = {SIAM review},\n\tvolume       = 51,\n\tnumber       = 3,\n\tpages        = 455\n}\n@article{kolda2011shifted,\n\ttitle        = {Shifted power method for computing tensor eigenpairs},\n\tauthor       = {Kolda, Tamara G and Mayo, Jackson R},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 32,\n\tnumber       = 4,\n\tpages        = {1095--1124}\n}\n@article{kolesnyk2016generating,\n\ttitle        = {Generating natural language inference chains},\n\tauthor       = {Vladyslav Kolesnyk and Tim Rockt{\\\"a}schel and Sebastian Riedel},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.01404}\n}\n@inproceedings{kolla2011play,\n\ttitle        = {How to play unique games against a semi-random adversary: Study of semi-random models of unique games},\n\tauthor       = {Alexandra Kolla and Konstantin Makarychev and Yury Makarychev},\n\tyear         = 2011,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {443--452}\n}\n@inproceedings{kollar10directions,\n\ttitle        = {Toward Understanding Natural Language Directions},\n\tauthor       = {Thomas Kollar and Stefanie Tellex and Deb Roy and Nicholas Roy},\n\tyear         = 2010,\n\tbooktitle    = {Human-Robot Interaction},\n\tpages        = {259--266}\n}\n@inproceedings{kollar2010grounding,\n\ttitle        = {Grounding Verbs of Motion in Natural Language Commands to Robots},\n\tauthor       = {T. Kollar and S. Tellex and D. Roy and N. 
Roy},\n\tyear         = 2010,\n\tbooktitle    = {International Symposium on Experimental Robotics (ISER)}\n}\n@inproceedings{koller02generation,\n\ttitle        = {Generation as Dependency Parsing},\n\tauthor       = {Alexander Koller and Kristina Striegnitz},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {17--24}\n}\n@book{koller2009probabilistic,\n\ttitle        = {Probabilistic graphical models: principles and techniques},\n\tauthor       = {Koller, Daphne and Friedman, Nir},\n\tyear         = 2009,\n\tpublisher    = {MIT press}\n}\n@inproceedings{kollios1999indexing,\n\ttitle        = {On indexing mobile objects},\n\tauthor       = {Kollios, George and Gunopulos, Dimitrios and Tsotras, Vassilis J.},\n\tyear         = 1999,\n\tbooktitle    = {\n\t\tProceedings of the eighteenth ACM SIGMOD-SIGACT-SIGART symposium\n\n\t\ton Principles of database systems\n\t},\n\tlocation     = {Philadelphia, Pennsylvania, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {PODS '99},\n\tpages        = {261--272},\n\tdoi          = {http://doi.acm.org/10.1145/303976.304002},\n\tisbn         = {1-58113-062-7},\n\tacmid        = 304002,\n\tnumpages     = 12\n}\n@article{kolmogorov1959varepsilon,\n\ttitle        = {$\\varepsilon$-entropy and $\\varepsilon$-capacity of sets in function spaces},\n\tauthor       = {Andrei Nikolaevich Kolmogorov and Vladimir Mikhailovich Tikhomirov},\n\tyear         = 1959,\n\tjournal      = {Uspekhi Matematicheskikh Nauk},\n\tvolume       = 14,\n\tnumber       = 2,\n\tpages        = {3--86}\n}\n@inproceedings{kolter09regularization,\n\ttitle        = {Regularization and Feature Selection in Least-Squares Temporal Difference Learning},\n\tauthor       = {J. Zico Kolter and Andrew Y. 
Ng},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 26th International Conference on Machine Learning (ICML)},\n\tpages        = {521--528}\n}\n@inproceedings{kolter2009near,\n\ttitle        = {Near-Bayesian exploration in polynomial time},\n\tauthor       = {Kolter, J Zico and Ng, Andrew Y},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 26th annual international conference on machine learning},\n\tpages        = {513--520}\n}\n@inproceedings{kolter2009regularization,\n\ttitle        = {Regularization and feature selection in least-squares temporal difference learning},\n\tauthor       = {Kolter, J Zico and Ng, Andrew Y},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 26th annual international conference on machine learning},\n\tpages        = {521--528},\n\torganization = {ACM}\n}\n@inproceedings{kolter2019learning,\n\ttitle        = {Learning stable deep dynamics models},\n\tauthor       = {Kolter, J Zico and Manek, Gaurav},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {11128--11136}\n}\n@article{kolve2017ai2thor,\n\ttitle        = {AI2-THOR: An Interactive 3D Environment for Visual {AI}},\n\tauthor       = {Eric Kolve and Roozbeh Mottaghi and Daniel Gordon and Yuke Zhu and Abhinav Gupta and Ali Farhadi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.05474}\n}\n@article{kompa2020empirical,\n\ttitle        = {Empirical Frequentist Coverage of Deep Learning Uncertainty Quantification Procedures},\n\tauthor       = {Benjamin Kompa and Jasper Snoek and Andrew Beam},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.03039}\n}\n@article{komura2018machine,\n\ttitle        = {Machine learning methods for histopathological image analysis},\n\tauthor       = {Daisuke Komura and Shumpei Ishikawa},\n\tyear         = 2018,\n\tjournal      = {Computational and structural biotechnology journal},\n\tvolume       = 
16,\n\tpages        = {34--42}\n}\n@article{konda1999actor,\n\ttitle        = {Actor-Critic--Type Learning Algorithms for Markov Decision Processes},\n\tauthor       = {Konda, Vijaymohan R and Borkar, Vivek S},\n\tyear         = 1999,\n\tjournal      = {SIAM Journal on control and Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 38,\n\tnumber       = 1,\n\tpages        = {94--123}\n}\n@inproceedings{konda2000actor,\n\ttitle        = {Actor-critic algorithms},\n\tauthor       = {Konda, Vijay R and Tsitsiklis, John N},\n\tyear         = 2000,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1008--1014}\n}\n@inproceedings{konidaris2007building,\n\ttitle        = {Building Portable Options: Skill Transfer in Reinforcement Learning},\n\tauthor       = {G. Konidaris and A. G. Barto},\n\tyear         = 2007,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{konishi96gic,\n\ttitle        = {Generalized Information Criteria in Model Selection},\n\tauthor       = {S. Konishi and G. 
Kitagawa},\n\tyear         = 1996,\n\tjournal      = {Biometrika},\n\tvolume       = 83,\n\tnumber       = 4,\n\tpages        = {875--890}\n}\n@article{konstas2017neural,\n\ttitle        = {Neural {AMR:} Sequence-to-Sequence Models for Parsing and Generation},\n\tauthor       = {Ioannis Konstas and Srinivasan Iyer and Mark Yatskar and Yejin Choi and Luke Zettlemoyer},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {0}\n}\n@inproceedings{koo08simple,\n\ttitle        = {Simple Semi-Supervised Dependency Parsing},\n\tauthor       = {Terry Koo and Xavier Carreras and Michael Collins},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)}\n}\n@article{koohbanani2021self,\n\ttitle        = {Self-Path: Self-supervision for Classification of Pathology Images with Limited Annotations},\n\tauthor       = {Navid Alemi Koohbanani and Balagopal Unnikrishnan and Syed Ali Khurram and Pavitra Krishnaswamy and Nasir Rajpoot},\n\tyear         = 2021,\n\tjournal      = {IEEE Transactions on Medical Imaging},\n\tvolume       = 1\n}\n@inproceedings{koppula2011semantic,\n\ttitle        = {Semantic Labeling of 3{D} Point Clouds for Indoor Scenes},\n\tauthor       = {H.S. Koppula and A. Anand and T. Joachims and A. Saxena},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{koppula2013anticipating,\n\ttitle        = {Anticipating Human Activities using Object Affordances for Reactive Robotic Response},\n\tauthor       = {H. Koppula and A. Saxena},\n\tyear         = 2013,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{koppula2013learning,\n\ttitle        = {Learning human activities and object affordances from {RGB-D} videos},\n\tauthor       = {H. S. Koppula and R. Gupta and A. 
Saxena},\n\tyear         = 2013,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 32,\n\tnumber       = 8,\n\tpages        = {951--970}\n}\n@article{koren2009bellkor,\n\ttitle        = {The bellkor solution to the netflix grand prize},\n\tauthor       = {Koren, Yehuda},\n\tyear         = 2009,\n\tjournal      = {Netflix prize documentation},\n\tvolume       = 81\n}\n@article{koren2009matrix,\n\ttitle        = {Matrix factorization techniques for recommender systems},\n\tauthor       = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},\n\tyear         = 2009,\n\tjournal      = {Computer},\n\tpublisher    = {Institute of Electrical and Electronics Engineers, Inc., 3 Park Avenue, 17 th Fl New York NY 10016-5997 United States},\n\tvolume       = 42,\n\tnumber       = 8,\n\tpages        = {30--37}\n}\n@inproceedings{korn1997efficiently,\n\ttitle        = {Efficiently supporting ad hoc queries in large datasets of time sequences},\n\tauthor       = {Korn, Flip and Jagadish, H. V. and Faloutsos, Christos},\n\tyear         = 1997,\n\tbooktitle    = {\n\t\tProceedings of the 1997 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Tucson, Arizona, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '97},\n\tpages        = {289--300},\n\tdoi          = {http://doi.acm.org/10.1145/253260.253332},\n\tisbn         = {0-89791-911-4},\n\tacmid        = 253332,\n\tnumpages     = 12\n}\n@inproceedings{kornblith2019better,\n\ttitle        = {Do Better ImageNet Models Transfer Better?},\n\tauthor       = {Simon Kornblith and Jonathon Shlens and Quoc V. 
Le},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{korsky2019computational,\n\ttitle        = {On the computational power of rnns},\n\tauthor       = {Korsky, Samuel A and Berwick, Robert C},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.06349}\n}\n@article{kos2017adversarial,\n\ttitle        = {Adversarial examples for generative models},\n\tauthor       = {Jernej Kos and Ian Fischer and Dawn Song},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{koshorek2018segmentation,\n\ttitle        = {Text Segmentation as a Supervised Learning Task},\n\tauthor       = {Omri Koshorek and Noam Mor and Adir Cohen and Michael Rotman and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{koshorek2019active,\n\ttitle        = {On the Limits of Learning to Actively Learn Semantic Representations},\n\tauthor       = {Omri Koshorek and Gabriel Stanovsky and Yichu Zhou and Vivek Srikumar and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@inproceedings{kostrikov2021offline,\n\ttitle        = {Offline reinforcement learning with fisher divergence critic regularization},\n\tauthor       = {Kostrikov, Ilya and Fergus, Rob and Tompson, Jonathan and Nachum, Ofir},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {5774--5783},\n\torganization = {PMLR}\n}\n@inproceedings{KOSZ13,\n\ttitle        = {A Simple, Combinatorial Algorithm for Solving {SDD} Systems in Nearly-{L}inear Time},\n\tauthor       = {Jonathan A. 
Kelner and Lorenzo Orecchia and Aaron Sidford and Zeyuan Allen Zhu},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 45th Annual ACM Symposium on Theory of Computing},\n\tseries       = {STOC~'13}\n}\n@techreport{kothari07survey,\n\ttitle        = {Type Reconstruction Algorithms - A Survey},\n\tauthor       = {Sunil Kothari},\n\tyear         = 2007,\n\tinstitution  = {University of Wyoming}\n}\n@inproceedings{kothari08polylet,\n\ttitle        = {On Extending {W}and's Type Reconstruction Algorithm to Handle Polymorphic Let},\n\tauthor       = {Sunil Kothari and James L. Caldwell},\n\tyear         = 2008,\n\tbooktitle    = {Fourth Conference on Computability}\n}\n@inproceedings{kothari2018agnostic,\n\ttitle        = {Better Agnostic Clustering via Tensor Norms},\n\tauthor       = {Pravesh Kothari and Jacob Steinhardt},\n\tyear         = 2018,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@inproceedings{kothari2018outlier,\n\ttitle        = {Outlier-robust moment-estimation via sum-of-squares},\n\tauthor       = {Pravesh Kothari and David Steurer},\n\tyear         = 2018,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@article{KotlowskiWarmuth2015-onlineEV,\n\ttitle        = {PCA with Gaussian perturbations},\n\tauthor       = {Kot{\\l}owski, Wojciech and Warmuth, Manfred K.},\n\tyear         = 2015,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1506.04855}\n}\n@inproceedings{kottenstette2010relationships,\n\ttitle        = {Relationships between positive real, passive dissipative, \\& positive systems},\n\tauthor       = {Kottenstette, Nicholas and Antsaklis, Panos J},\n\tyear         = 2010,\n\tbooktitle    = {American Control Conference (ACC), 2010},\n\tpages        = {409--416},\n\torganization = {IEEE}\n}\n@article{KoufogiannakisYoung2013,\n\ttitle        = {{A Nearly Linear-Time PTAS for Explicit Fractional Packing and Covering Linear Programs}},\n\tauthor       = {Koufogiannakis, Christos and Young, 
Neal E.},\n\tyear         = 2013,\n\tmonth        = mar,\n\tjournal      = {Algorithmica},\n\tpages        = {494--506},\n\tdoi          = {10.1007/s00453-013-9771-6},\n\tissn         = {0178-4617},\n\tnote         = {Previously appeared in FOCS '07.},\n\tabstract     = {We give an approximation algorithm for packing and covering linear programs (linear programs with non-negative coefficients). Given a constraint matrix with n non-zeros, r rows, and c columns, the algorithm computes feasible primal and dual solutions whose costs are within a factor of 1+eps of the optimal cost in time O((r+c)log(n)/eps\\^{}2 + n).}\n}\n@inproceedings{koutnik2014clockwork,\n\ttitle        = {A Clockwork {RNN}},\n\tauthor       = {Jan Koutnik and Klaus Greff and Faustino Gomez and Juergen Schmidhuber},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1863--1871}\n}\n@article{kouw2018introduction,\n\ttitle        = {An introduction to domain adaptation and transfer learning},\n\tauthor       = {Kouw, Wouter M and Loog, Marco},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.11806}\n}\n@inproceedings{kouw2019learning,\n\ttitle        = {Learning an mr acquisition-invariant representation using siamese neural networks},\n\tauthor       = {Kouw, Wouter M and Loog, Marco and Bartels, Lambertus W and Mendrik, Adri{\\\"e}nne M},\n\tyear         = 2019,\n\tbooktitle    = {2019 IEEE 16th International Symposium on Biomedical Imaging (ISBI 2019)},\n\tpages        = {364--367},\n\torganization = {IEEE}\n}\n@article{kovanic1979pseudoinverse,\n\ttitle        = {On the Pseudoinverse of a Sum of Symmetric Matrices with Applications to Estimation},\n\tauthor       = {Pavel Kovanic},\n\tyear         = 1979,\n\tjournal      = {Kybernetika},\n\tvolume       = 15\n}\n@inproceedings{kovar2002motion,\n\ttitle        = {Motion graphs},\n\tauthor       = {Lucas Kovar and Michael Gleicher and Fr{\\'e}d{\\'e}ric 
Pighin},\n\tyear         = 2002,\n\tbooktitle    = {\n\t\tSIGGRAPH '02: Proceedings of the 29th annual conference on Computer\n\n\t\tgraphics and interactive techniques\n\t},\n\tlocation     = {San Antonio, Texas},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, NY, USA},\n\tpages        = {473--482},\n\tdoi          = {http://doi.acm.org/10.1145/566570.566605},\n\tisbn         = {1-58113-521-1}\n}\n@article{kovcisky2017narrativeqa,\n\ttitle        = {The {NarrativeQA} Reading Comprehension Challenge},\n\tauthor       = {Tom{\\v{s}} Ko{\\v{c}}isky and Jonathan Schwarz and Phil Blunsom and Chris Dyer and Karl Moritz Hermann and Gabor Melis and Edward Grefenstette},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.07040}\n}\n@inproceedings{kozor2014drug,\n\ttitle        = {Regular Cocaine Use Is Associated with Increased Systolic Blood Pressure, Aortic Stiffness and Left Ventricular Mass in Young Otherwise Healthy Individuals},\n\tauthor       = {Rebecca Kozor and Stuart M. Grieve and Stefan Buchholz and Sharlene Kaye and Shane Darke and Ravinay Bhindi and Gemma A. 
Figtree},\n\tyear         = 2014,\n\tbooktitle    = {PLos ONE}\n}\n@inproceedings{kpotufe2013adaptivity,\n\ttitle        = {Adaptivity to local smoothness and dimension in kernel regression},\n\tauthor       = {Kpotufe, Samory and Garg, Vikas},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3075--3083}\n}\n@article{kraemer2000can,\n\ttitle        = {How can we learn about developmental processes from cross-sectional studies, or can we?},\n\tauthor       = {Helena Chmura Kraemer and Jerome A Yesavage and Joy L Taylor and David Kupfer},\n\tyear         = 2000,\n\tjournal      = {American Journal of Psychiatry},\n\tvolume       = 157,\n\tnumber       = 2,\n\tpages        = {163--171}\n}\n@inproceedings{krahenbuhl2011efficient,\n\ttitle        = {Efficient inference in fully connected {CRF}s with {G}aussian edge potentials},\n\tauthor       = {P. Kr{\\\"a}henb{\\\"u}hl and V. Koltun},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{krahenbuhl2013learning,\n\ttitle        = {Parameter Learning and Convergent Inference for Dense Random Fields},\n\tauthor       = {P. Kr{\\\"a}henb{\\\"u}hl and V. Koltun},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {513--521}\n}\n@inproceedings{kramer1988learning,\n\ttitle        = {Learning despite distribution shift},\n\tauthor       = {Alan H. 
Kramer},\n\tyear         = 1988,\n\tbooktitle    = {Connectionist Models Summer School}\n}\n@book{krashen1982second,\n\ttitle        = {Principles and Practice in Second Language Acquisition},\n\tauthor       = {Stephen Krashen},\n\tyear         = 1982,\n\tpublisher    = {Pergamon Press}\n}\n@inproceedings{krause20133d,\n\ttitle        = {3d object representations for fine-grained categorization},\n\tauthor       = {Krause, Jonathan and Stark, Michael and Deng, Jia and Fei-Fei, Li},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the IEEE international conference on computer vision workshops},\n\tpages        = {554--561}\n}\n@inproceedings{kremer2014coinco,\n\ttitle        = {What Substitutes Tell Us -- Analysis of an ``All-Words'' Lexical Substitution Corpus},\n\tauthor       = {Gerhard Kremer and Katrin Erk and Sebastian Pad{\\'o} and Stefan Thater},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{krening2017learning,\n\ttitle        = {Learning from explanations using sentiment and advice in {RL}},\n\tauthor       = {Samantha Krening and Brent Harrison and Karen M Feigh and Charles Lee Isbell and Mark Riedl and Andrea Thomaz},\n\tyear         = 2017,\n\tjournal      = {IEEE Transactions on Cognitive and Developmental Systems},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {44--55}\n}\n@inproceedings{kress2007structured,\n\ttitle        = {From structured {E}nglish to robot motion},\n\tauthor       = {H. Kress-Gazit and G. Fainekos and G. Pappas},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@article{kreuzer2015quantitative,\n\ttitle        = {A Quantitative Comparison of Semantic Web Page Segmentation Approaches},\n\tauthor       = {Robert Kreuzer and Jurriaan Hage and A. J. 
Feelders},\n\tyear         = 2015,\n\tjournal      = {International Conference on Web Engineering (ICWE)}\n}\n@inproceedings{krishna2016embracing,\n\ttitle        = {Embracing Error to Enable Rapid Crowdsourcing},\n\tauthor       = {Ranjay Krishna and Kenji Hata and Stephanie Chen and Joshua Kravitz and David A. Shamma and Li Fei-Fei and Michael S. Bernstein},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@article{krishna2017visual,\n\ttitle        = {Visual genome: Connecting language and vision using crowdsourced dense image annotations},\n\tauthor       = {Ranjay Krishna and Yuke Zhu and Oliver Groth and Justin Johnson and Kenji Hata and Joshua Kravitz and Stephanie Chen and Yannis Kalantidi and Li-Jia Li and David A. Shamma and Michael S. Bernstein and Fei-Fei Li},\n\tyear         = 2017,\n\tjournal      = {International Journal of Computer Vision},\n\tvolume       = 123,\n\tpages        = {32--73}\n}\n@book{krishnamurthi06pl,\n\ttitle        = {Programming Languages: Application and Interpretation},\n\tauthor       = {Shriram Krishnamurthi},\n\tyear         = 2006,\n\tpublisher    = {Creative Commons}\n}\n@inproceedings{krishnamurthy2012weakly,\n\ttitle        = {Weakly supervised training of semantic parsers},\n\tauthor       = {Jayant Krishnamurthy and Tom Mitchell},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {754--765}\n}\n@article{krishnamurthy2013jointly,\n\ttitle        = {Jointly Learning to Parse and Perceive: Connecting Natural Language to the Physical World},\n\tauthor       = {Jayant Krishnamurthy and Thomas Kollar},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1,\n\tpages        = {193--206}\n}\n@article{krishnamurthy2013sequential,\n\ttitle        = {Sequential Algorithms for 
Matrix and Tensor Completion},\n\tauthor       = {Krishnamurthy, Akshay and Singh, Aarti},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1304.4672}\n}\n@article{krishnamurthy2014power,\n\ttitle        = {On the power of adaptivity in matrix completion and approximation},\n\tauthor       = {Krishnamurthy, Akshay and Singh, Aarti},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1407.3619}\n}\n@article{krishnamurthy2016contextual,\n\ttitle        = {Contextual-MDPs for {PAC}-Reinforcement Learning with Rich Observations},\n\tauthor       = {Krishnamurthy, Akshay and Agarwal, Alekh and Langford, John},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.02722}\n}\n@inproceedings{krishnamurthy2016pac,\n\ttitle        = {PAC reinforcement learning with rich observations},\n\tauthor       = {Krishnamurthy, Akshay and Agarwal, Alekh and Langford, John},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 30th International Conference on Neural Information Processing Systems},\n\tpages        = {1848--1856}\n}\n@inproceedings{krishnamurthy2017neural,\n\ttitle        = {Neural Semantic Parsing with Type Constraints for Semi-Structured Tables},\n\tauthor       = {Jayant Krishnamurthy and Pradeep Dasigi and Matt Gardner},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{krishnamurthy2018semiparametric,\n\ttitle        = {Semiparametric contextual bandits},\n\tauthor       = {Akshay Krishnamurthy and Zhiwei Steven Wu and Vasilis Syrgkanis},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.04204}\n}\n@inproceedings{krishnan2014recommender,\n\ttitle        = {A methodology for learning, analyzing, and mitigating social influence bias in recommender systems},\n\tauthor       = {Sanjay Krishnan and Jay Patel and Michael Frankling and Ken Goldberg},\n\tyear         = 2014,\n\tbooktitle    = {ACM Conference on Recommender Systems 
(RECSYS)}\n}\n@inproceedings{krivelevich2006semirandom,\n\ttitle        = {Semirandom models as benchmarks for coloring algorithms},\n\tauthor       = {Michael Krivelevich and Dan Vilenchik},\n\tyear         = 2006,\n\tbooktitle    = {Meeting on Analytic Algorithmics and Combinatorics},\n\tpages        = {211--221}\n}\n@techreport{Krizhevsky09learningmultiple,\n\ttitle        = {Learning multiple layers of features from tiny images},\n\tauthor       = {Alex Krizhevsky},\n\tyear         = 2009,\n\tinstitution  = {}\n}\n@article{krizhevsky2009learning,\n\ttitle        = {Learning multiple layers of features from tiny images},\n\tauthor       = {Krizhevsky, Alex and Hinton, Geoffrey},\n\tyear         = 2009,\n\tpublisher    = {Citeseer}\n}\n@inproceedings{krizhevsky2011verydeep,\n\ttitle        = {Using Very Deep Autoencoders for Content-Based Image Retrieval},\n\tauthor       = {Alex Krizhevsky and Geoffrey E. Hinton},\n\tyear         = 2011,\n\tbooktitle    = {19th European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning (ESANN)},\n\tpages        = {489--494}\n}\n@inproceedings{krizhevsky2012imagenet,\n\ttitle        = {Imagenet classification with deep convolutional neural networks},\n\tauthor       = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},\n\tyear         = 2012,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1097--1105},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.26}\n}\n@article{kroemer2010combining,\n\ttitle        = {Combining active learning and reactive control for robot grasping},\n\tauthor       = {OB Kroemer and R. Detry and J. Piater and J. Peters},\n\tyear         = 2010,\n\tjournal      = {RAS},\n\tvolume       = 58,\n\tnumber       = 9,\n\tpages        = {1105--1116}\n}\n@article{kroll2017,\n\ttitle        = {Accountable Algorithms},\n\tauthor       = {Joshua A. Kroll and Joanna Huey and Solon Barocas and Edward W. Felten and Joel R. 
Reidenberg and David G Robinson and Harlan Yu},\n\tyear         = 2017,\n\tjournal      = {University of Pennsylvania Law Review},\n\tvolume       = 3,\n\tpages        = {633--706}\n}\n@article{krueger2020out,\n\ttitle        = {Out-of-distribution generalization via risk extrapolation (rex)},\n\tauthor       = {Krueger, David and Caballero, Ethan and Jacobsen, Joern-Henrik and Zhang, Amy and Binas, Jonathan and Zhang, Dinghuai and Priol, Remi Le and Courville, Aaron},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.00688}\n}\n@article{kruger1999unskilled,\n\ttitle        = {Unskilled and unaware of it: how difficulties in recognizing one's own incompetence lead to inflated self-assessments},\n\tauthor       = {Justin Kruger and David Dunning},\n\tyear         = 1999,\n\tjournal      = {Journal of personality and social psychology},\n\tvolume       = 77,\n\tnumber       = 6,\n\tpages        = {1121--1134}\n}\n@article{Kruskal:76,\n\ttitle        = {{More factors than subjects, tests and treatments: an indeterminacy theorem for canonical decomposition and individual differences scaling}},\n\tauthor       = {Kruskal, J.B.},\n\tyear         = 1976,\n\tjournal      = {Psychometrika},\n\tvolume       = 41,\n\tnumber       = 3,\n\tpages        = {281--293}\n}\n@article{Kruskal:77,\n\ttitle        = {{Three-way arrays: Rank and uniqueness of trilinear decompositions, with application to arithmetic complexity and statistics}},\n\tauthor       = {Kruskal, J.B.},\n\tyear         = 1977,\n\tjournal      = {Linear algebra and its applications},\n\tvolume       = 18,\n\tnumber       = 2,\n\tpages        = {95--138}\n}\n@article{kruskal77,\n\ttitle        = {Three-way arrays: rank and uniqueness of trilinear decompositions, with application to arithmetic complexity and statistics},\n\tauthor       = {J. B. 
Kruskal},\n\tyear         = 1977,\n\tjournal      = {Linear Algebra and Appl.},\n\tvolume       = 18,\n\tnumber       = 2,\n\tpages        = {95--138}\n}\n@inproceedings{KRV2006,\n\ttitle        = {Graph partitioning using single commodity flows},\n\tauthor       = {Khandekar, Rohit and Rao, Satish and Vazirani, Umesh},\n\tyear         = 2006,\n\tbooktitle    = {STOC '06}\n}\n@inproceedings{krymolowski2002distinguishing,\n\ttitle        = {Distinguishing easy and hard instances},\n\tauthor       = {Yuval Krymolowski},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@inproceedings{KSV05,\n\ttitle        = {The spectral method for general mixture models},\n\tauthor       = {R. Kannan and H. Salmasian and S. Vempala},\n\tyear         = 2005,\n\tbooktitle    = {COLT}\n}\n@inproceedings{ku2020rxr,\n\ttitle        = {Room-Across-Room: Multilingual Vision-and-Language Navigation with Dense Spatiotemporal Grounding},\n\tauthor       = {Alexander Ku and Peter Anderson and Roma Patel and Eugene Ie and Jason Baldridge},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{kudo2018sentencepiece,\n\ttitle        = {SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing},\n\tauthor       = {Taku Kudo and John Richardson},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{kuhl2004earlylanguage,\n\ttitle        = {Early language acquisition: cracking the speech code},\n\tauthor       = {Patricia K Kuhl},\n\tyear         = 2004,\n\tjournal      = {Nature Reviews Neuroscience}\n}\n@inproceedings{kuhn1994hmm,\n\ttitle        = {Ergodic hidden {markov} models and polygrams for language modeling},\n\tauthor       = {Thomas Kuhn and Heinrich Niemann and Ernst Gunter and Schukat-Talamazzini},\n\tyear         = 1994,\n\tbooktitle    = 
{International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}\n}\n@article{kuhn2016extensive,\n\ttitle        = {Extensive games and the problem of information},\n\tauthor       = {H Kuhn},\n\tyear         = 2016,\n\tjournal      = {In H. Kuhn and A. Tucker, editors, Contributions to the Theory of Games},\n\tpages        = {193--216}\n}\n@article{kuhn55hungarian,\n\ttitle        = {The {H}ungarian method for the assignment problem},\n\tauthor       = {H. W. Kuhn},\n\tyear         = 1955,\n\tjournal      = {Naval Research Logistic Quarterly},\n\tvolume       = 2,\n\tpages        = {83--97}\n}\n@inproceedings{kula2015,\n\ttitle        = {Metadata Embeddings for User and Item Cold-start Recommendations},\n\tauthor       = {Maciej Kula},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 2nd Workshop on New Trends on Content-Based Recommender Systems},\n\tpages        = {14--21}\n}\n@inproceedings{kulal2019spoc,\n\ttitle        = {SPoC: Search-based Pseudocode to Code},\n\tauthor       = {Sumith Kulal and Panupong Pasupat and Kartik Chandra and Mina Lee and Oded Padon and Alex Aiken and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{kuleshov2015calibrated,\n\ttitle        = {Calibrated Structured Prediction},\n\tauthor       = {Volodymyr Kuleshov and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{kuleshov2015simultaneous,\n\ttitle        = {Simultaneous diagonalization: the asymmetric, low-rank, and noisy settings},\n\tauthor       = {Volodymyr Kuleshov and Arun Chaganty and Percy Liang},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@inproceedings{kuleshov2015tensor,\n\ttitle        = {Tensor factorization via matrix factorization},\n\tauthor       = {Volodymyr Kuleshov and Arun Chaganty and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Artificial 
Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{kuleshov2017estimating,\n\ttitle        = {Estimating uncertainty online against an adversary},\n\tauthor       = {Kuleshov, Volodymyr and Ermon, Stefano},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1607.03594},\n\tbooktitle    = {Thirty-First AAAI Conference on Artificial Intelligence}\n}\n@inproceedings{kuleshov2018accurate,\n\ttitle        = {Accurate uncertainties for deep learning using calibrated regression},\n\tauthor       = {Kuleshov, Volodymyr and Fenner, Nathan and Ermon, Stefano},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2796--2804},\n\torganization = {PMLR}\n}\n@inproceedings{kulesza2007structured,\n\ttitle        = {Structured learning with approximate inference},\n\tauthor       = {Alex Kulesza and Fernando Pereira},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {785--792}\n}\n@inproceedings{kulick2013active,\n\ttitle        = {Active Learning for Teaching a Robot Grounded Relational Symbols},\n\tauthor       = {J. Kulick and M. Toussaint and T. Lang and M. Lopes},\n\tyear         = 2013,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{kulick2014inter,\n\ttitle        = {Inter-Annotator Agreement for {ERE} Annotation},\n\tauthor       = {Seth Kulick and Ann Bies and Justin Mott},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{kulis2007fast,\n\ttitle        = {Fast low-rank semidefinite programming for embedding and clustering},\n\tauthor       = {B. Kulis and A. C. Surendran and J. C. 
Platt},\n\tyear         = 2007,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {235--242}\n}\n@inproceedings{kulkarni2014scaling,\n\ttitle        = {Scaling short-answer grading by combining peer assessment with algorithmic scoring},\n\tauthor       = {Chinmay E Kulkarni and Richard Socher and Michael S Bernstein and Scott R Klemmer},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the first ACM conference on Learning@Scale conference},\n\tpages        = {99--108}\n}\n@article{kulkarni2015peer,\n\ttitle        = {Peer and self assessment in massive online classes},\n\tauthor       = {Chinmay Kulkarni and Pang Wei Koh and Huy Huy and Daniel Chia and Kathryn Papadopoulos and Justin Cheng and Daphne Koller and Scott R. Klemmer},\n\tyear         = 2015,\n\tjournal      = {Design Thinking Research},\n\tpages        = {131--168}\n}\n@article{kulkarni2016hierarchical,\n\ttitle        = {Hierarchical deep reinforcement learning: Integrating temporal abstraction and intrinsic motivation},\n\tauthor       = {T. D. Kulkarni and K. Narasimhan and A. Saeedi and J. Tenenbaum},\n\tyear         = 2016,\n\tjournal      = {Advances in neural information processing systems},\n\tpages        = {3675--3683}\n}\n@inproceedings{kull2015novel,\n\ttitle        = {Novel decompositions of proper scoring rules for classification: Score adjustment as precursor to calibration},\n\tauthor       = {Kull, Meelis and Flach, Peter},\n\tyear         = 2015,\n\tbooktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},\n\tpages        = {68--85},\n\torganization = {Springer}\n}\n@article{kull2017sigmoids,\n\ttitle        = {Beyond sigmoids: How to obtain well-calibrated probabilities from binary classifiers with beta calibration},\n\tauthor       = {Meelis Kull and Telmo M. 
Silva Filho and Peter Flach},\n\tyear         = 2017,\n\tjournal      = {Electronic Journal of Statistics},\n\tvolume       = 11,\n\tpages        = {5052--5080}\n}\n@article{kull2019beyond,\n\ttitle        = {Beyond temperature scaling: Obtaining well-calibrated multiclass probabilities with Dirichlet calibration},\n\tauthor       = {Kull, Meelis and Perello-Nieto, Miquel and K{\\\"a}ngsepp, Markus and Song, Hao and Flach, Peter and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.12656}\n}\n@inproceedings{kull2019temperature,\n\ttitle        = {Beyond temperature scaling: Obtaining well-calibrated multi-class probabilities with Dirichlet calibration},\n\tauthor       = {Meelis Kull and Miquel Perello Nieto and Markus Kängsepp and Telmo Silva Filho and Hao Song and Peter Flach},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{Kumar12,\n\ttitle        = {Fast Conical Hull Algorithms for Near-separable Non-negative Matrix Factorization},\n\tauthor       = {A. Kumar and V. Sindhwani and P. Kambadur},\n\tyear         = 2012,\n\tnote         = {http://arxiv.org/abs/1210.1190v1}\n}\n@article{Kumar2003,\n\ttitle        = {{Approximate minimum enclosing balls in high dimensions using core-sets}},\n\tauthor       = {Kumar, Piyush and Mitchell, Joseph S. B. and Yildirim, E. 
Alper},\n\tyear         = 2003,\n\tmonth        = jan,\n\tjournal      = {Journal of Experimental Algorithmics},\n\tvolume       = 8,\n\tpages        = {1--29},\n\tdoi          = {10.1145/996546.996548},\n\tissn         = 10846654,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Kumar, Mitchell, Yildirim - 2003 - Approximate minimum enclosing balls in high dimensions using core-sets.pdf:pdf},\n\tmendeley-groups = {Algorithms/Computational Geometry}\n}\n@inproceedings{kumar2007eyepoint,\n\ttitle        = {EyePoint: practical pointing and selection using gaze and keyboard},\n\tauthor       = {Manu Kumar and Andreas Paepcke and Terry Winograd},\n\tyear         = 2007,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@article{kumar2009analysis,\n\ttitle        = {An analysis of convex relaxations for {MAP} estimation of discrete {MRF}s},\n\tauthor       = {M. P. Kumar and V. Kolmogorov and P. Torr},\n\tyear         = 2009,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 10,\n\tpages        = {71--106}\n}\n@inproceedings{kumar2009attribute,\n\ttitle        = {Attribute and simile classifiers for face verification},\n\tauthor       = {Neeraj Kumar and Alexander C Berg and Peter N Belhumeur and Shree K Nayar},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {365--372}\n}\n@inproceedings{kumar2010clustering,\n\ttitle        = {Clustering with spectral norm and the {k}-means algorithm},\n\tauthor       = {Amit Kumar and Ravindran Kannan},\n\tyear         = 2010,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {299--308}\n}\n@inproceedings{kumar2010self,\n\ttitle        = {Self-paced learning for latent variable models},\n\tauthor       = {M Pawan Kumar and Benjamin Packer and Daphne Koller},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems 
(NeurIPS)},\n\tpages        = {1189--1197}\n}\n@inproceedings{kumar2011bricolage,\n\ttitle        = {Bricolage: example-based retargeting for web design},\n\tauthor       = {Ranjitha Kumar and Jerry O. Talton and Salman Ahmad and Scott R. Klemmer},\n\tyear         = 2011,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@inproceedings{kumar2013webzeitgeist,\n\ttitle        = {Webzeitgeist: design mining the web},\n\tauthor       = {Ranjitha Kumar and Arvind Satyanarayan and C{\'e}sar Torres and Maxine Lim and Salman Ahmad and Scott R. Klemmer and Jerry O. Talton},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@inproceedings{kumar2016dmn,\n\ttitle        = {Ask Me Anything: Dynamic Memory Networks for Natural Language Processing},\n\tauthor       = {Ankit Kumar and Ozan Irsoy and Peter Ondruska and Mohit Iyyer and James Bradbury and Ishaan Gulrajani and Victor Zhong and Romain Paulus and Richard Socher},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{kumar2017understanding,\n\ttitle        = {Understanding development and stem cells using single cell-based analyses of gene expression},\n\tauthor       = {Pavithra Kumar and Yuqi Tan and Patrick Cahan},\n\tyear         = 2017,\n\tjournal      = {Development},\n\tvolume       = 144,\n\tnumber       = 1,\n\tpages        = {17--32}\n}\n@inproceedings{kumar2019calibration,\n\ttitle        = {Verified Uncertainty Calibration},\n\tauthor       = {Ananya Kumar and Percy Liang and Tengyu Ma},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{kumar2019stabilizing,\n\ttitle        = {Stabilizing Off-Policy Q-Learning via Bootstrapping Error Reduction},\n\tauthor       = {Kumar, Aviral and Fu, Justin and Soh, Matthew and Tucker, George and Levine, Sergey},\n\tyear         = 2019,\n\tjournal      = {Advances in Neural 
Information Processing Systems},\n\tvolume       = 32,\n\tpages        = {11784--11794}\n}\n@inproceedings{kumar2019verified,\n\ttitle        = {Verified uncertainty calibration},\n\tauthor       = {Kumar, Ananya and Liang, Percy S and Ma, Tengyu},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3792--3803}\n}\n@inproceedings{kumar2020conservative,\n\ttitle        = {Conservative Q-Learning for Offline Reinforcement Learning},\n\tauthor       = {Kumar, Aviral and Zhou, Aurick and Tucker, George and Levine, Sergey},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 33,\n\tpages        = {1179--1191}\n}\n@inproceedings{kumar2020gradual,\n\ttitle        = {Understanding Self-Training for Gradual Domain Adaptation},\n\tauthor       = {Ananya Kumar and Tengyu Ma and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{kummerfeld2010faster,\n\ttitle        = {Faster parsing by supertagger adaptation},\n\tauthor       = {Jonathan Kummerfeld and Jessika Roesner and Tim Dawborn and James Haggerty and James Curran and Stephen Clark},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{kundaje2015integrative,\n\ttitle        = {Integrative analysis of 111 reference human epigenomes},\n\tauthor       = {Anshul Kundaje and Wouter Meuleman and Jason Ernst and Misha Bilenky and Angela Yen and Alireza Heravi-Moussavi and Pouya Kheradpour and Zhizhuo Zhang and Jianrong Wang and Michael J Ziller and others},\n\tyear         = 2015,\n\tjournal      = {Nature},\n\tvolume       = 518,\n\tnumber       = 7539,\n\tpages        = {317--330}\n}\n@inproceedings{kunze2013acquiring,\n\ttitle        = {Acquiring task models for imitation learning through games with a purpose},\n\tauthor       = {L. Kunze and A. Haidu and M. 
Beetz},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{kuo08transliterations,\n\ttitle        = {Mining Transliterations from Web Query Results: An Incremental Approach},\n\tauthor       = {Jin-Shea Kuo and Haizhou Li and Chih-Lung Lin},\n\tyear         = 2008,\n\tbooktitle    = {Sixth SIGHAN Workshop on Chinese Language Processing}\n}\n@article{kurakin2016adversarial,\n\ttitle        = {Adversarial examples in the physical world},\n\tauthor       = {Alexey Kurakin and Ian Goodfellow and Samy Bengio},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{kurihara04varpcfg,\n\ttitle        = {An Application of the Variational {B}ayesian Approach to Probabilistic Context-Free Grammars},\n\tauthor       = {K. Kurihara and T. Sato},\n\tyear         = 2004,\n\tbooktitle    = {International Joint Conference on Natural Language Processing Workshop Beyond Shallow Analyses}\n}\n@inproceedings{kurihara06varinduct,\n\ttitle        = {Variational {B}ayesian Grammar Induction for Natural Language},\n\tauthor       = {K. Kurihara and T. Sato},\n\tyear         = 2006,\n\tbooktitle    = {International Colloquium on Grammatical Inference}\n}\n@inproceedings{kurihara07accelerated,\n\ttitle        = {Accelerated Variational {D}irichlet Mixture Models},\n\tauthor       = {K. Kurihara and M. Welling and N. Vlassis},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{kurihara07collapsed,\n\ttitle        = {Collapsed Variational {D}irichlet Process Mixture Models},\n\tauthor       = {K. Kurihara and M. Welling and Y. W. 
Teh},\n\tyear         = 2007,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{kurucz2007methods,\n\ttitle        = {Methods for large scale SVD with missing values},\n\tauthor       = {Kurucz, Mikl{\\'o}s and Bencz{\\'u}r, Andr{\\'a}s A and Csalog{\\'a}ny, K{\\'a}roly},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of KDD Cup and Workshop},\n\tvolume       = 12,\n\tpages        = {31--38},\n\torganization = {Citeseer}\n}\n@article{kurutach2018model,\n\ttitle        = {Model-ensemble trust-region policy optimization},\n\tauthor       = {Kurutach, Thanard and Clavera, Ignasi and Duan, Yan and Tamar, Aviv and Abbeel, Pieter},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.10592}\n}\n@article{kuruvilla2002vector,\n\ttitle        = {Vector algebra in the analysis of genome-wide expression data},\n\tauthor       = {Kuruvilla, Finny G and Park, Peter J and Schreiber, Stuart L},\n\tyear         = 2002,\n\tjournal      = {Genome biology},\n\tpublisher    = {BioMed Central Ltd},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {research0011}\n}\n@inproceedings{kushagra2016finding,\n\ttitle        = {Finding Meaningful Cluster Structure Amidst Background Noise},\n\tauthor       = {Shrinu Kushagra and Samira Samadi and Shai Ben-David},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {339--354}\n}\n@inproceedings{kushman2013regex,\n\ttitle        = {Using Semantic Unification to Generate Regular Expressions from Natural Language},\n\tauthor       = {Nate Kushman and Regina Barzilay},\n\tyear         = 2013,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {826--836}\n}\n@inproceedings{kushman2014algebra,\n\ttitle        = {Learning to Automatically Solve Algebra Word Problems},\n\tauthor       = {Nate Kushman and Yoav Artzi 
and Luke Zettlemoyer and Regina Barzilay},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@phdthesis{kushmerick1997wrapper,\n\ttitle        = {Wrapper induction for information extraction},\n\tauthor       = {Nicholas Kushmerick},\n\tyear         = 1997,\n\tschool       = {University of Washington}\n}\n@inproceedings{kusner2017,\n\ttitle        = {Counterfactual Fairness},\n\tauthor       = {Matt J Kusner and Joshua R Loftus and Chris Russell and Ricardo Silva},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {4069--4079}\n}\n@inproceedings{kusner2017counterfactual,\n\ttitle        = {Counterfactual fairness},\n\tauthor       = {Kusner, Matt J and Loftus, Joshua and Russell, Chris and Silva, Ricardo},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {4066--4076}\n}\n@article{kussul2017classification,\n\ttitle        = {Deep Learning Classification of Land Cover and Crop Types Using Remote Sensing Data},\n\tauthor       = {N. {Kussul} and M. {Lavreniuk} and S. {Skakun} and A. 
{Shelestov}},\n\tyear         = 2017,\n\tjournal      = {IEEE Geoscience and Remote Sensing Letters},\n\tvolume       = 14,\n\tnumber       = 5,\n\tpages        = {778--782}\n}\n@book{KuY03,\n\ttitle        = {Stochastic Approximation and Recursive Algorithms and Applications},\n\tauthor       = {Kushner, Harold J and Yin, George},\n\tyear         = 2003,\n\tpublisher    = {Springer}\n}\n@inproceedings{kuznetsova2013generalizing,\n\ttitle        = {Generalizing Image Captions for Image-Text Parallel Corpus},\n\tauthor       = {Polina Kuznetsova and Vicente Ordonez and Alexander C Berg and Tamara L Berg and Yejin Choi},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {790--796}\n}\n@article{KVV04,\n\ttitle        = {On clusterings: Good, bad and spectral},\n\tauthor       = {Ravi Kannan and Santosh Vempala and Adrian Vetta},\n\tyear         = 2004,\n\tjournal      = {Journal of the ACM},\n\tvolume       = 51,\n\tnumber       = 3,\n\tpages        = {497--515},\n\tee           = {http://doi.acm.org/10.1145/990308.990313},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{kwiatkowski10ccg,\n\ttitle        = {Inducing Probabilistic {CCG} Grammars from Logical Form with Higher-Order Unification},\n\tauthor       = {T. Kwiatkowski and L. Zettlemoyer and S. Goldwater and M. Steedman},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1223--1233}\n}\n@inproceedings{kwiatkowski11lex,\n\ttitle        = {Lexical Generalization in {CCG} Grammar Induction for Semantic Parsing},\n\tauthor       = {T. Kwiatkowski and L. Zettlemoyer and S. Goldwater and M. 
Steedman},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1512--1523}\n}\n@inproceedings{kwiatkowski2012probabilistic,\n\ttitle        = {A probabilistic model of syntactic and semantic acquisition from child-directed utterances and their meanings},\n\tauthor       = {Tom Kwiatkowski and Sharon Goldwater and Luke Zettlemoyer and Mark Steedman},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {234--244}\n}\n@inproceedings{kwiatkowski2013scaling,\n\ttitle        = {Scaling Semantic Parsers with On-the-fly Ontology Matching},\n\tauthor       = {Tom Kwiatkowski and Eunsol Choi and Yoav Artzi and Luke Zettlemoyer},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{kwiatkowski2019natural,\n\ttitle        = {Natural Questions: A Benchmark for Question Answering Research},\n\tauthor       = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{kwok2001scaling,\n\ttitle        = {Scaling question answering to the web},\n\tauthor       = {Cody Kwok and Oren Etzioni and Daniel S. 
Weld},\n\tyear         = 2001,\n\tjournal      = {ACM Transactions on Information Systems (TOIS)},\n\tvolume       = 19,\n\tpages        = {242--262}\n}\n@article{l1994stochastic,\n\ttitle        = {Stochastic optimization by simulation: Convergence proofs for the {GI/G/1} queue in steady-state},\n\tauthor       = {L'Ecuyer, Pierre and Glynn, Peter W},\n\tyear         = 1994,\n\tjournal      = {Management Science},\n\tpublisher    = {INFORMS},\n\tvolume       = 40,\n\tnumber       = 11,\n\tpages        = {1562--1578}\n}\n@article{l63,\n\ttitle        = {A topological property of real analytic subsets},\n\tauthor       = {Lojasiewicz, S},\n\tyear         = 1963,\n\tjournal      = {Coll. du CNRS, Les {\\'e}quations aux d{\\'e}riv{\\'e}es partielles},\n\tvolume       = 117,\n\tpages        = {87--89}\n}\n@inproceedings{labutov2018learning,\n\ttitle        = {Learning to Learn Semantic Parsers from Natural Language Supervision},\n\tauthor       = {Igor Labutov and Bishan Yang and Tom Mitchell},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{labutov2018lia,\n\ttitle        = {LIA: A Natural Language Programmable Personal Assistant},\n\tauthor       = {Igor Labutov and Shashank Srivastava and Tom Michael Mitchell},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lachaux2020unsupervised,\n\ttitle        = {Unsupervised Translation of Programming Languages},\n\tauthor       = {Marie-Anne Lachaux and Baptiste Roziere and Lowik Chanussot and Guillaume Lample},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{lacoste2012sigma,\n\ttitle        = {SiGMa: Simple Greedy Matching for Aligning Large Knowledge Bases},\n\tauthor       = {Simon Lacoste-Julien and Konstantina Palla and Alex Davies and Gjergji Kasneci and Thore Graepel and Zoubin Ghahramani},\n\tyear         = 
2012,\n\tjournal      = {arXiv}\n}\n@article{LacosteJulienSB2012,\n\ttitle        = {A simpler approach to obtaining an $O(1/t)$ convergence rate for the projected stochastic subgradient method},\n\tauthor       = {Simon Lacoste{-}Julien and Mark W. Schmidt and Francis R. Bach},\n\tyear         = 2012,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1212.2002}\n}\n@inproceedings{lafferty01crf,\n\ttitle        = {Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Data},\n\tauthor       = {John Lafferty and Andrew McCallum and Fernando Pereira},\n\tyear         = 2001,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {282--289}\n}\n@inproceedings{laghi2017teleimpedance,\n\ttitle        = {Tele-impedance with force feedback under communication time delay},\n\tauthor       = {Marco Laghi and A. Ajoudani and M. Catalano and A. Bicchi},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)},\n\tpages        = {2564--2571}\n}\n@article{laghi2020unifying,\n\ttitle        = {Unifying bilateral teleoperation and tele-impedance for enhanced user experience},\n\tauthor       = {Marco Laghi and A. Ajoudani and M. Catalano and A. 
Bicchi},\n\tyear         = 2020,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 39,\n\tpages        = {514--539}\n}\n@article{lagoudakis2003least,\n\ttitle        = {Least-squares policy iteration},\n\tauthor       = {Lagoudakis, Michail G and Parr, Ronald},\n\tyear         = 2003,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 4,\n\tnumber       = {Dec},\n\tpages        = {1107--1149}\n}\n@article{lai1985asymptotically,\n\ttitle        = {Asymptotically efficient adaptive allocation rules},\n\tauthor       = {Tze Leung Lai and Herbert Robbins},\n\tyear         = 1985,\n\tjournal      = {Advances in applied mathematics},\n\tvolume       = 6,\n\tnumber       = 1,\n\tpages        = {4--22}\n}\n@inproceedings{lai2012detection,\n\ttitle        = {Detection-based object labeling in 3d scenes},\n\tauthor       = {Kevin Lai and Liefeng Bo and Xiaofeng Ren and Dieter Fox},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {1330--1337}\n}\n@inproceedings{lai2016agnostic,\n\ttitle        = {Agnostic Estimation of Mean and Covariance},\n\tauthor       = {Kevin A. Lai and Anup B. 
Rao and Santosh Vempala},\n\tyear         = 2016,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{lai2017natural,\n\ttitle        = {Natural Language Inference from Multiple Premises},\n\tauthor       = {Alice Lai and Yonatan Bisk and Julia Hockenmaier},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.02925}\n}\n@inproceedings{lai2017race,\n\ttitle        = {RACE: Large-scale ReAding Comprehension Dataset From Examinations},\n\tauthor       = {Guokun Lai and Qizhe Xie and Hanxiao Liu and Yiming Yang and Eduard Hovy},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{laine2017temporal,\n\ttitle        = {Temporal ensembling for semi-supervised learning},\n\tauthor       = {Samuli Laine and Timo Aila},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{laishram2016curie,\n\ttitle        = {Curie: A method for protecting {SVM} Classifier from Poisoning Attack},\n\tauthor       = {Ricky Laishram and Vir Virander Phoha},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{lake2013one,\n\ttitle        = {One-shot learning by inverting a compositional causal process},\n\tauthor       = {Brenden M Lake and Ruslan R Salakhutdinov and Josh Tenenbaum},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2526--2534}\n}\n@inproceedings{lake2018generalization,\n\ttitle        = {Generalization without Systematicity: On the Compositional Skills of Sequence-to-Sequence Recurrent Networks},\n\tauthor       = {Brenden Lake and Marco Baroni},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{lakhina2004diagnosing,\n\ttitle        = {Diagnosing network-wide traffic anomalies},\n\tauthor       = {Anukool Lakhina and Mark Crovella and Christophe 
Diot},\n\tyear         = 2004,\n\tbooktitle    = {ACM SIGCOMM Computer Communication Review},\n\tvolume       = 34,\n\tnumber       = 4,\n\tpages        = {219--230}\n}\n@inproceedings{lakkaraju2017identifying,\n\ttitle        = {Identifying Unknown Unknowns in the Open World: Representations and Policies for Guided Exploration},\n\tauthor       = {Himabindu Lakkaraju and Ece Kamar and Rich Caruana and Eric Horvitz},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@book{lakoff2008metaphors,\n\ttitle        = {Metaphors we live by},\n\tauthor       = {George Lakoff and Mark Johnson},\n\tyear         = 2008,\n\tpublisher    = {University of Chicago Press}\n}\n@misc{lakshminarayanan17linear,\n\ttitle        = {A Linearly Relaxed Approximate Linear Program for {Markov} Decision Processes},\n\tauthor       = {Chandrashekar Lakshminarayanan and Shalabh Bhatnagar and Csaba Szepesv\\'{a}ri},\n\tyear         = 2017,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tvolume       = 63,\n\tnumber       = 4,\n\tpages        = {1185--1191},\n\tnote         = {CoRR abs/1704.02544}\n}\n@inproceedings{lakshminarayanan2017simple,\n\ttitle        = {Simple and scalable predictive uncertainty estimation using deep ensembles},\n\tauthor       = {Lakshminarayanan, Balaji and Pritzel, Alexander and Blundell, Charles},\n\tyear         = 2017,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {6402--6413}\n}\n@inproceedings{lam2015quantifying,\n\ttitle        = {Quantifying Input Uncertainty in Stochastic Optimization},\n\tauthor       = {Henry Lam and Enlu Zhou},\n\tyear         = 2015,\n\tbooktitle    = {2015 Winter Simulation Conference}\n}\n@article{lamb2019interpolated,\n\ttitle        = {Interpolated Adversarial Training: Achieving Robust Neural Networks without Sacrificing Too Much Accuracy},\n\tauthor       = {Alex Lamb and Vikas Verma and Juho Kannala and Yoshua 
Bengio},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{lamm2018tap,\n\ttitle        = {Textual Analogy Parsing: What's Shared and What's Compared among Analogous Facts},\n\tauthor       = {Matthew Lamm and Arun Chaganty and Christopher D. Manning and Dan Jurafsky and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lampert2009learning,\n\ttitle        = {Learning to detect unseen object classes by between-class attribute transfer},\n\tauthor       = {Christoph H Lampert and Hannes Nickisch and Stefan Harmeling},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {951--958}\n}\n@article{lample2017unsupervised,\n\ttitle        = {Unsupervised Machine Translation Using Monolingual Corpora Only},\n\tauthor       = {Guillaume Lample and Ludovic Denoyer and Marc'Aurelio Ranzato},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.00043}\n}\n@inproceedings{lample2018phrase,\n\ttitle        = {Phrase-based \\& neural unsupervised machine translation},\n\tauthor       = {Guillaume Lample and Myle Ott and Alexis Conneau and Ludovic Denoyer and Marc'Aurelio Ranzato},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{Lan2011,\n\ttitle        = {{An optimal method for stochastic composite optimization}},\n\tauthor       = {Lan, Guanghui},\n\tyear         = 2011,\n\tmonth        = jan,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 133,\n\tnumber       = {1-2},\n\tpages        = {365--397},\n\tdoi          = {10.1007/s10107-010-0434-y},\n\tisbn         = {0001408100},\n\tissn         = {0025-5610},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Lan - 2011 - An optimal method for stochastic composite optimization.pdf:pdf},\n\tkeywords     = {convex optimization,stochastic approximation},\n\tmendeley-groups = 
{Optimization/Gradient Descent Theory/Composite}\n}\n@inproceedings{lan2020albert,\n\ttitle        = {{ALBERT}: A Lite {BERT} for Self-supervised Learning of Language Representations},\n\tauthor       = {Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{lan2021policy,\n\ttitle        = {Policy mirror descent for reinforcement learning: Linear convergence, new sampling complexity, and generalized problem classes},\n\tauthor       = {Lan, Guanghui},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.00135}\n}\n@inproceedings{landeiro2016confounder,\n\ttitle        = {Robust Text Classification in the Presence of Confounding Bias},\n\tauthor       = {Virgile Landeiro and Aron Culotta},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{landrieu2017cut,\n\ttitle        = {Cut pursuit: Fast algorithms to learn piecewise constant functions on general weighted graphs},\n\tauthor       = {Landrieu, Loic and Obozinski, Guillaume},\n\tyear         = 2017,\n\tjournal      = {SIAM Journal on Imaging Sciences},\n\tpublisher    = {SIAM},\n\tvolume       = 10,\n\tnumber       = 4,\n\tpages        = {1724--1766}\n}\n@article{lane2016genome,\n\ttitle        = {Genome-wide association analysis identifies novel loci for chronotype in 100,420 individuals from the {UK} Biobank},\n\tauthor       = {Jacqueline M Lane and Irma Vlasac and Simon G Anderson and Simon D Kyle and William G Dixon and David A Bechtold and Shubhroz Gill and Max A Little and Annemarie Luik and Andrew Loudon and others},\n\tyear         = 2016,\n\tjournal      = {Nature Communications},\n\tvolume       = 7\n}\n@inproceedings{langford2008epoch,\n\ttitle        = {The epoch-greedy algorithm for multi-armed bandits with side information},\n\tauthor       = 
{Langford, John and Zhang, Tong},\n\tyear         = 2008,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {817--824}\n}\n@article{LangRao2004,\n\ttitle        = {{A flow-based method for improving the expansion or conductance of graph cuts}},\n\tauthor       = {Lang, Kevin and Rao, Satish},\n\tyear         = 2004,\n\tjournal      = {Integer Programming and Combinatorial Optimization},\n\tvolume       = 3064,\n\tpages        = {325--337}\n}\n@article{languagegans2018,\n\ttitle        = {Language GANs Falling Short},\n\tauthor       = {Massimo Caccia and Lucas Caccia and William Fedus and Hugo Larochelle and Joelle Pineau and Laurent Charlin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.02549}\n}\n@article{LanZhou2015,\n\ttitle        = {An optimal randomized incremental gradient method},\n\tauthor       = {Guanghui Lan and Yi Zhou},\n\tyear         = 2015,\n\tmonth        = oct,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1507.02000}\n}\n@inproceedings{lao2011pathranking,\n\ttitle        = {Random walk inference and learning in a large scale knowledge base},\n\tauthor       = {Ni Lao and Tom Mitchell and William W Cohen},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {529--539}\n}\n@inproceedings{lao2012rules,\n\ttitle        = {Reading the web with learned syntactic-semantic inference rules},\n\tauthor       = {Ni Lao and Amarnag Subramanya and Fernando Pereira and William W Cohen},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {1017--1026}\n}\n@inproceedings{lao2015learning,\n\ttitle        = {Learning relational features with backward random walks},\n\tauthor       = {Ni Lao and Einat Minkov and William Cohen},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational 
Linguistics (ACL)}\n}\n@article{lari90scfg,\n\ttitle        = {The estimation of stochastic context-free grammars using the inside-outside algorithm},\n\tauthor       = {K. Lari and S. J. Young},\n\tyear         = 1990,\n\tjournal      = {Computer Speech and Language},\n\tvolume       = 4,\n\tpages        = {35--56}\n}\n@inproceedings{laroche2017transfer,\n\ttitle        = {Transfer Reinforcement Learning with Shared Dynamics},\n\tauthor       = {Romain Laroche and Merwan Barlier},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {2147--2153}\n}\n@inproceedings{larochelle2008zero,\n\ttitle        = {Zero-data Learning of New Tasks},\n\tauthor       = {Hugo Larochelle and Dumitru Erhan and Yoshua Bengio},\n\tyear         = 2008,\n\tbooktitle    = {AAAI},\n\tvolume       = 8,\n\tpages        = {646--651}\n}\n@article{larrazabal2020gender,\n\ttitle        = {Gender imbalance in medical imaging datasets produces biased classifiers for computer-aided diagnosis},\n\tauthor       = {Agostina J Larrazabal and Nicol{\\'a}s Nieto and Victoria Peterson and Diego H Milone and Enzo Ferrante},\n\tyear         = 2020,\n\tjournal      = {Proceedings of the National Academy of Sciences}\n}\n@inproceedings{larsen2016autoencoding,\n\ttitle        = {Autoencoding beyond pixels using a learned similarity metric},\n\tauthor       = {Anders Boesen Lindbo Larsen and Søren Kaae Sønderby and Hugo Larochelle and Ole Winther},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{lary2016MLremotesensing,\n\ttitle        = {Machine learning in geosciences and remote sensing},\n\tauthor       = {David J. Lary and Amir H. Alavi and Amir H. Gandomi and Annette L. 
Walker},\n\tyear         = 2016,\n\tjournal      = {Geoscience Frontiers},\n\tvolume       = 7,\n\tnumber       = 1,\n\tpages        = {3--10}\n}\n@inproceedings{lasecki2011realtime,\n\ttitle        = {Real-time crowd control of existing interfaces},\n\tauthor       = {Walter S Lasecki and Kyle I Murray and Samuel White and Robert C Miller and Jeffrey P Bigham},\n\tyear         = 2011,\n\tbooktitle    = {User Interface Software and Technology (UIST)},\n\tpages        = {23--32}\n}\n@inproceedings{lasecki2013conversations,\n\ttitle        = {Conversations in the crowd: Collecting data for task-oriented dialog learning},\n\tauthor       = {Walter Stephen Lasecki and Ece Kamar and Dan Bohus},\n\tyear         = 2013,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{lasecki2013realtime,\n\ttitle        = {Real-time Crowd Labeling for Deployable Activity Recognition},\n\tauthor       = {Walter S Lasecki and Young Chol Song and Henry Kautz and Jeffrey P. Bigham},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Computer Supported Cooperative Work},\n\tpages        = {1203--1212}\n}\n@inproceedings{laskey2017dart,\n\ttitle        = {DART: Noise Injection for Robust Imitation Learning},\n\tauthor       = {Michael Laskey and Jonathan N. Lee and Roy Fox and A. Dragan and Ken Goldberg},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@inproceedings{laskov2014practical,\n\ttitle        = {Practical evasion of a learning-based classifier: A case study},\n\tauthor       = {Pavel Laskov and Nedim \\v{S}rndi{\\`c}},\n\tyear         = 2014,\n\tbooktitle    = {Symposium on Security and Privacy}\n}\n@inproceedings{lasserre06hybrid,\n\ttitle        = {Principled Hybrids of Generative and Discriminative Models},\n\tauthor       = {Julia A. Lasserre and Christopher M. Bishop and Thomas P. 
Minka},\n\tyear         = 2006,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {87--94}\n}\n@article{lasserre2001global,\n\ttitle        = {Global optimization with polynomials and the problem of moments},\n\tauthor       = {Lasserre, Jean B},\n\tyear         = 2001,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 11,\n\tnumber       = 3,\n\tpages        = {796--817}\n}\n@article{lasserre2008semidefinite,\n\ttitle        = {A semidefinite programming approach to the generalized problem of moments},\n\tauthor       = {Jean B Lasserre},\n\tyear         = 2008,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 112,\n\tnumber       = 1,\n\tpages        = {65--92}\n}\n@book{lasserre2011moments,\n\ttitle        = {Moments, Positive Polynomials and Their Applications},\n\tauthor       = {Jean Bernard Lasserre},\n\tyear         = 2011,\n\tpublisher    = {Imperial College Press}\n}\n@article{latala1997estimation,\n\ttitle        = {Estimation of moments of sums of independent real random variables},\n\tauthor       = {Latala, Rafal},\n\tyear         = 1997,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {JSTOR},\n\tpages        = {1502--1513}\n}\n@article{latala2006estimates,\n\ttitle        = {Estimates of moments and tails of {G}aussian chaoses},\n\tauthor       = {Rafa{\\l{}} Lata{\\l{}}a},\n\tyear         = 2006,\n\tjournal      = {The Annals of Probability},\n\tvolume       = 34,\n\tnumber       = 6,\n\tpages        = {2315--2331}\n}\n@article{LatalaBound,\n\ttitle        = {{Estimates of moments and tails of Gaussian chaoses}},\n\tauthor       = {R. Latala},\n\tyear         = 2006,\n\tjournal      = {Ann. 
Prob.},\n\tvolume       = 34,\n\tnumber       = 6,\n\tpages        = {2315--2331}\n}\n@article{latcinnik2020explaining,\n\ttitle        = {Explaining Question Answering Models through Text Generation},\n\tauthor       = {Veronica Latcinnik and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.05569}\n}\n@inproceedings{latentgraph2020raboh,\n\ttitle        = {Differentiable Scene Graphs},\n\tauthor       = {Moshiko Raboh and Roei Herzig and Gal Chechik and Jonathan Berant and Amir Globerson},\n\tyear         = 2020,\n\tbooktitle    = {Winter Conference on Applications of Computer Vision (WACV)}\n}\n@inproceedings{lattimore2012pac,\n\ttitle        = {PAC bounds for discounted MDPs},\n\tauthor       = {Lattimore, Tor and Hutter, Marcus},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {320--334},\n\torganization = {Springer}\n}\n@inproceedings{lattimore2013sample,\n\ttitle        = {The sample-complexity of general reinforcement learning},\n\tauthor       = {Lattimore, Tor and Hutter, Marcus and Sunehag, Peter and others},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{lattimore2019learning,\n\ttitle        = {Learning with Good Feature Representations in Bandits and in RL with a Generative Model},\n\tauthor       = {Lattimore, Tor and Szepesvari, Csaba},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@book{lattimore2020bandit,\n\ttitle        = {Bandit algorithms},\n\tauthor       = {Lattimore, Tor and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2020,\n\tpublisher    = {Cambridge University Press}\n}\n@article{lau03programming,\n\ttitle        = {Programming by demonstration using version space algebra},\n\tauthor       = {T. Lau and S. Wolfman and P. Domingos and D. S. 
Weld},\n\tyear         = 2003,\n\tjournal      = {Machine Learning},\n\tvolume       = 53,\n\tpages        = {111--156}\n}\n@inproceedings{lau03traces,\n\ttitle        = {Learning Programs from Traces using Version Space Algebra},\n\tauthor       = {T. Lau and P. Domingos and D. S. Weld},\n\tyear         = 2003,\n\tbooktitle    = {International Conference On Knowledge Capture},\n\tpages        = {36--43}\n}\n@inproceedings{Lau04bipartiteroots,\n\ttitle        = {Bipartite roots of graphs},\n\tauthor       = {Lap Chi Lau},\n\tyear         = 2004,\n\tbooktitle    = {In Proceedings of the 15th Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {952--961}\n}\n@inproceedings{lau2000version,\n\ttitle        = {Version Space Algebra and its Application to Programming by Demonstration},\n\tauthor       = {Tessa A Lau and Pedro Domingos and Daniel S Weld},\n\tyear         = 2000,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {527--534}\n}\n@article{lau2014social,\n\ttitle        = {Social analytics: Learning fuzzy product ontologies for aspect-oriented sentiment analysis},\n\tauthor       = {Raymond YK Lau and Chunping Li and Stephen SY Liao},\n\tyear         = 2014,\n\tjournal      = {Decision Support Systems},\n\tvolume       = 65,\n\tpages        = {80--94}\n}\n@inproceedings{lau2015unsupervised,\n\ttitle        = {Unsupervised Prediction of Acceptability Judgements},\n\tauthor       = {Jey Han Lau and Alexander Clark and Shalom Lappin},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1618--1628}\n}\n@article{lau2017grammaticality,\n\ttitle        = {Grammaticality, Acceptability, and Probability: A Probabilistic View of Linguistic Knowledge},\n\tauthor       = {Jey Han Lau and Alexander Clark and Shalom Lappin},\n\tyear         = 2017,\n\tjournal      = {Cognitive Science},\n\tvolume       = 41,\n\tpages        = 
{1202--1241}\n}\n@inproceedings{lauer2000algorithm,\n\ttitle        = {An algorithm for distributed reinforcement learning in cooperative multi-agent systems},\n\tauthor       = {Martin Lauer and Martin Riedmiller},\n\tyear         = 2000,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {535--542}\n}\n@article{laurent2000,\n\ttitle        = {Adaptive estimation of a quadratic functional by model selection},\n\tauthor       = {Laurent, B. and Massart, P.},\n\tyear         = 2000,\n\tmonth        = 10,\n\tjournal      = {Ann. Statist.},\n\tpublisher    = {The Institute of Mathematical Statistics},\n\tvolume       = 28,\n\tnumber       = 5,\n\tpages        = {1302--1338},\n\tdoi          = {10.1214/aos/1015957395},\n\turl          = {http://dx.doi.org/10.1214/aos/1015957395},\n\tfjournal     = {The Annals of Statistics}\n}\n@article{laurent2000adaptive,\n\ttitle        = {Adaptive estimation of a quadratic functional by model selection},\n\tauthor       = {B. Laurent and P. 
Massart},\n\tyear         = 2000,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 28,\n\tnumber       = 5,\n\tpages        = {1302--1338}\n}\n@article{laurent2008sparse,\n\ttitle        = {A Sparse Flat Extension Theorem for Moment Matrices},\n\tauthor       = {Monique Laurent},\n\tyear         = 2008,\n\tjournal      = {arXiv preprint arXiv:0812.2563}\n}\n@article{laurent2009generalized,\n\ttitle        = {A generalized flat extension theorem for moment matrices},\n\tauthor       = {Monique Laurent and Bernard Mourrain},\n\tyear         = 2009,\n\tjournal      = {Archiv der Mathematik},\n\tvolume       = 93,\n\tnumber       = 1,\n\tpages        = {87--98}\n}\n@inproceedings{laurent2009sums,\n\ttitle        = {Sums of squares, moment matrices and optimization over polynomials},\n\tauthor       = {Monique Laurent},\n\tyear         = 2009,\n\tbooktitle    = {Emerging applications of algebraic geometry},\n\tpages        = {157--270}\n}\n@article{lavie2009meteor,\n\ttitle        = {The Meteor Metric for Automatic Evaluation of Machine Translation},\n\tauthor       = {Alon Lavie and Michael Denkowski},\n\tyear         = 2009,\n\tjournal      = {Machine Translation},\n\tvolume       = 23\n}\n@misc{law1983sensitive,\n\ttitle        = {Division of consumer and community affairs. 2011-07. 12 cfr supplement \\i to part l02},\n\tauthor       = {official staff interpretations},\n\tyear         = 1983\n}\n@article{lawrence1998searching,\n\ttitle        = {Searching the world wide web},\n\tauthor       = {Steve Lawrence and C. Lee Giles},\n\tyear         = 1998,\n\tjournal      = {Science},\n\tvolume       = 280,\n\tnumber       = 5360,\n\tpages        = {98--100}\n}\n@inproceedings{lawrence2007hierarchical,\n\ttitle        = {Hierarchical Gaussian process latent variable models},\n\tauthor       = {Lawrence, Neil D. 
and Moore, Andrew J.},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tICML '07: Proceedings of the 24th international conference on Machine\n\n\t\tlearning\n\t},\n\tlocation     = {Corvalis, Oregon},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tpages        = {481--488},\n\tdoi          = {http://doi.acm.org/10.1145/1273496.1273557},\n\tisbn         = {978-1-59593-793-3}\n}\n@inproceedings{lazaric2010finite,\n\ttitle        = {Finite-sample analysis of LSTD},\n\tauthor       = {Lazaric, Alessandro and Ghavamzadeh, Mohammad and Munos, R{\\'e}mi},\n\tyear         = 2010,\n\tbooktitle    = {ICML-27th International Conference on Machine Learning},\n\tpages        = {615--622}\n}\n@article{lazaric2012finite,\n\ttitle        = {Finite-sample analysis of least-squares policy iteration},\n\tauthor       = {Lazaric, Alessandro and Ghavamzadeh, Mohammad and Munos, R{\\'e}mi},\n\tyear         = 2012,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 13,\n\tnumber       = 1,\n\tpages        = {3041--3074}\n}\n@inproceedings{lazaridou2017multi,\n\ttitle        = {Multi-agent cooperation and the emergence of (natural) language},\n\tauthor       = {Angeliki Lazaridou and Alexander Peysakhovich and Marco Baroni},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{LB93,\n\ttitle        = {Multivariate normal mixtures: a fast consistent method},\n\tauthor       = {B. G. Lindsay and P. Basak},\n\tyear         = 1993,\n\tjournal      = {Journal of the American Statistical Association},\n\tvolume       = 88,\n\tnumber       = 422,\n\tpages        = {468--476}\n}\n@article{lbnsps17,\n\ttitle        = {Deep Neural Networks as {G}aussian Processes},\n\tauthor       = {Lee, Jaehoon and Bahri, Yasaman and Novak, Roman and Schoenholz, Samuel S. 
and Pennington, Jeffrey and Sohl-Dickstein, Jascha},\n\tyear         = 2017,\n\tjournal      = {arXiv:1711.00165},\n\turl          = {http://arxiv.org/abs/1711.00165}\n}\n@article{LDA,\n\ttitle        = {Latent dirichlet allocation},\n\tauthor       = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},\n\tyear         = 2003,\n\tmonth        = mar,\n\tjournal      = {J. Mach. Learn. Res.},\n\tpublisher    = {JMLR.org},\n\tvolume       = 3,\n\tpages        = {993--1022},\n\tissn         = {1532-4435},\n\turl          = {http://dl.acm.org/citation.cfm?id=944919.944937},\n\tissue_date   = {3/1/2003},\n\tnumpages     = 30,\n\tacmid        = 944937\n}\n@inproceedings{LDAinference,\n\ttitle        = {Efficient Methods for Topic Model Inference on Streaming Document Collections},\n\tauthor       = {Yao, Limin and Mimno, David and McCallum, Andrew},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 15th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},\n\tlocation     = {Paris, France},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '09},\n\tpages        = {937--946},\n\tdoi          = {10.1145/1557019.1557121},\n\tisbn         = {978-1-60558-495-9},\n\turl          = {http://doi.acm.org/10.1145/1557019.1557121},\n\tnumpages     = 10,\n\tacmid        = 1557121,\n\tkeywords     = {inference, topic modeling}\n}\n@inproceedings{le2011ica,\n\ttitle        = {{ICA with Reconstruction Cost for Efficient Overcomplete Feature Learning}},\n\tauthor       = {Q. V. Le and A. Karpenko and J. Ngiam and A. Y. Ng},\n\tyear         = 2011,\n\tbooktitle    = {NIPS},\n\tpages        = {1017--1025}\n}\n@article{le2015concentration,\n\ttitle        = {Concentration and Regularization of Random Graphs},\n\tauthor       = {Can M. 
Le and Elizaveta Levina and Roman Vershynin},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{le2015simple,\n\ttitle        = {A Simple Way to Initialize Recurrent Networks of Rectified Linear Units},\n\tauthor       = {Quoc V Le and Navdeep Jaitly and Geoffrey E Hinton},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1504.00941}\n}\n@article{le2015tiny,\n\ttitle        = {Tiny imagenet visual recognition challenge},\n\tauthor       = {Le, Ya and Yang, Xuan},\n\tyear         = 2015,\n\tjournal      = {CS 231N},\n\tvolume       = 7,\n\tpages        = 7\n}\n@inproceedings{le2018preference,\n\ttitle        = {Preference Elicitation with Interdependency and User Bother Cost},\n\tauthor       = {Tiep Le and Atena M. Tabakhi and Long Tran-Thanh and William Yeoh and Tran Cao Son},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Autonomous Agents and Multiagent Systems (AAMAS)}\n}\n@inproceedings{le2020adversarial,\n\ttitle        = {Adversarial filters of dataset biases},\n\tauthor       = {Ronan Le Bras and Swabha Swayamdipta and Chandra Bhagavatula and Rowan Zellers and Matthew Peters and Ashish Sabharwal and Yejin Choi},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1078--1088}\n}\n@book{LeCam86,\n\ttitle        = {Asymptotic Methods in Statistical Decision Theory},\n\tauthor       = {L. 
{Le Cam}},\n\tyear         = 1986,\n\tpublisher    = {Springer}\n}\n@article{lecun1995convolutional,\n\ttitle        = {Convolutional networks for images, speech, and time series},\n\tauthor       = {LeCun, Yann and Bengio, Yoshua},\n\tyear         = 1995,\n\tjournal      = {The handbook of brain theory and neural networks},\n\tvolume       = 3361,\n\tnumber       = 10,\n\tpages        = 1995\n}\n@article{lecun1998gradient,\n\ttitle        = {Gradient-based learning applied to document recognition},\n\tauthor       = {Yann LeCun and L{\\'e}on Bottou and Yoshua Bengio and Patrick Haffner},\n\tyear         = 1998,\n\tjournal      = {Proceedings of the IEEE},\n\tvolume       = 86,\n\tnumber       = 11,\n\tpages        = {2278--2324}\n}\n@article{lecun1998mnist,\n\ttitle        = {The {MNIST} database of handwritten digits},\n\tauthor       = {Yann LeCun and Corinna Cortes and Christopher JC Burges},\n\tyear         = 1998,\n\tjournal      = {http://yann.lecun.com/exdb/mnist/}\n}\n@inproceedings{lecun2004learning,\n\ttitle        = {Learning methods for generic object recognition with invariance to pose and lighting},\n\tauthor       = {Yann LeCun and Fu Jie Huang and Leon Bottou},\n\tyear         = 2004,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tvolume       = 2\n}\n@incollection{lecun2012efficient,\n\ttitle        = {Efficient backprop},\n\tauthor       = {LeCun, Yann A and Bottou, L{\\'e}on and Orr, Genevieve B and M{\\\"u}ller, Klaus-Robert},\n\tyear         = 2012,\n\tbooktitle    = {Neural networks: Tricks of the trade},\n\tpublisher    = {Springer},\n\tpages        = {9--48}\n}\n@article{lecun2015deep,\n\ttitle        = {Deep Learning},\n\tauthor       = {Yann LeCun and Yoshua Bengio and Geoffrey Hinton},\n\tyear         = 2015,\n\tjournal      = {Nature},\n\tvolume       = 521,\n\tnumber       = 7553,\n\tpages        = {436--444}\n}\n@inproceedings{lecuyer2019certified,\n\ttitle        = {Certified robustness to adversarial examples 
with differential privacy},\n\tauthor       = {Mathias Lecuyer and Vaggelis Atlidakis and Roxana Geambasu and Daniel Hsu and Suman Jana},\n\tyear         = 2019,\n\tbooktitle    = {In IEEE Symposium on Security and Privacy (SP)}\n}\n@book{ledoux1991probability,\n\ttitle        = {Probability in Banach Spaces: Isoperimetry and Processes},\n\tauthor       = {Michel Ledoux and Michel Talagrand},\n\tyear         = 1991,\n\tpublisher    = {Springer Berlin Heidelberg}\n}\n@book{ledoux2013probability,\n\ttitle        = {Probability in Banach Spaces: isoperimetry and processes},\n\tauthor       = {Ledoux, Michel and Talagrand, Michel},\n\tyear         = 2013,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume       = 23\n}\n@inproceedings{lee13coordinate,\n\ttitle        = {Efficient Accelerated Coordinate Descent Methods and Faster Algorithms for Solving Linear Systems},\n\tauthor       = {Yin Tat Lee and Aaron Sidford},\n\tyear         = 2013,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@inproceedings{lee1999hierarchical,\n\ttitle        = {\n\t\tA hierarchical approach to interactive motion editing for human-like\n\n\t\tfigures\n\t},\n\tauthor       = {Lee, Jehee and Shin, Sung Yong},\n\tyear         = 1999,\n\tbooktitle    = {\n\t\tProceedings of the 26th annual conference on Computer graphics and\n\n\t\tinteractive techniques\n\t},\n\tpublisher    = {ACM Press/Addison-Wesley Publishing Co.},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGGRAPH '99},\n\tpages        = {39--48},\n\tdoi          = {http://dx.doi.org/10.1145/311535.311539},\n\tisbn         = {0-201-48560-5},\n\tacmid        = 311539,\n\tkeywords     = {\n\t\thierarchical techniques, inverse kinematics, motion adaptation, motion\n\n\t\tediting, spacetime constraints\n\t},\n\tnumpages     = 10\n}\n@article{lee1999learning,\n\ttitle        = {Learning the parts of objects by non-negative matrix factorization},\n\tauthor       = {Lee, Daniel and Seung, 
Sebastian},\n\tyear         = 1999,\n\tjournal      = {Nature},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 401,\n\tnumber       = 6755,\n\tpages        = {788--791},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{lee2000algorithms,\n\ttitle        = {Algorithms for Non-negative Matrix Factorization},\n\tauthor       = {Lee, Daniel D. and Seung, H. Sebastian},\n\tyear         = 2000,\n\tbooktitle    = {{NIPS}},\n\tpages        = {556--562},\n\turl          = {citeseer.ist.psu.edu/lee01algorithms.html},\n\tbiburl       = {http://www.bibsonomy.org/bibtex/2a54d0f1fa298d6e6a7135fa56b80fb5e/zeno},\n\tinterhash    = {cf8707cab8812be3c21d3e5c10fad477},\n\tintrahash    = {a54d0f1fa298d6e6a7135fa56b80fb5e},\n\tkeywords     = {matrix-factorization nmf},\n\ttimestamp    = {2009-12-17T17:15:39.000+0100}\n}\n@inproceedings{lee2001algorithms,\n\ttitle        = {Algorithms for non-negative matrix factorization},\n\tauthor       = {Daniel D. Lee and Sebastian H. Seung},\n\tyear         = 2001,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {556--562}\n}\n@inproceedings{lee2002interactive,\n\ttitle        = {Interactive control of avatars animated with human motion data},\n\tauthor       = {\n\t\tLee, Jehee and Chai, Jinxiang and Reitsma, Paul S. A. and Hodgins,\n\n\t\tJessica K. 
and Pollard, Nancy S.\n\t},\n\tyear         = 2002,\n\tbooktitle    = {\n\t\tProceedings of the 29th annual conference on Computer graphics and\n\n\t\tinteractive techniques\n\t},\n\tlocation     = {San Antonio, Texas},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGGRAPH '02},\n\tpages        = {491--500},\n\tdoi          = {http://doi.acm.org/10.1145/566570.566607},\n\tisbn         = {1-58113-521-1},\n\tacmid        = 566607,\n\tkeywords     = {\n\t\tavatars, human motion, interactive control, motion capture, virtual\n\n\t\tenvironments\n\t},\n\tnumpages     = 10\n}\n@inproceedings{lee2005spam,\n\ttitle        = {Spam Deobfuscation using a Hidden {M}arkov Model},\n\tauthor       = {Honglak Lee and Andrew Y. Ng},\n\tyear         = 2005,\n\tbooktitle    = {Conference on Email and Anti-Spam (CEAS)}\n}\n@article{lee2008existence,\n\ttitle        = {{Existence of Asymptotic Solutions to Semi-linear Partial Difference Equations}},\n\tauthor       = {{Jason D. Lee} and Neuberger, John},\n\tyear         = 2008,\n\tjournal      = {Joint Mathematics Meetings},\n\tpages        = {}\n}\n@inproceedings{lee2008trajectory,\n\ttitle        = {Trajectory Outlier Detection: A Partition-and-Detect Framework},\n\tauthor       = {Jae-Gil Lee and Jiawei Han and Xiaolei Li},\n\tyear         = 2008,\n\tmonth        = apr,\n\tbooktitle    = {ICDE 2008: IEEE 24th International Conference on Data Engineering},\n\tpages        = {140--149},\n\tdoi          = {10.1109/ICDE.2008.4497422},\n\tkeywords     = {\n\t\tdata mining;partition-and-detect framework;trajectory outlier detection;data\n\n\t\tmining;object detection;\n\t}\n}\n@article{lee2010multiscale,\n\ttitle        = {{Multiscale Estimation of Intrinsic Dimensionality of Point Cloud Data and Multiscale Analysis of Dynamic Graphs}},\n\tauthor       = {{Jason D. 
Lee}},\n\tyear         = 2010,\n\tjournal      = {Senior Thesis, Duke University}\n}\n@article{lee2010practical,\n\ttitle        = {{Practical Large-Scale Optimization for Max-Norm Regularization}},\n\tauthor       = {{Jason D. Lee} and Recht, Ben and Srebro, Nathan and Tropp, Joel and Salakhutdinov, Ruslan},\n\tyear         = 2010,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1297--1305}\n}\n@article{lee2011chebyshev,\n\ttitle        = {Chebyshev center based column generation},\n\tauthor       = {Lee, Chungmok and Park, Sungsoo},\n\tyear         = 2011,\n\tjournal      = {Discrete Applied Mathematics},\n\tpublisher    = {Elsevier},\n\tvolume       = 159,\n\tnumber       = 18,\n\tpages        = {2251--2265}\n}\n@article{lee2011multiscale,\n\ttitle        = {{Multiscale Analysis of Time Series of Graphs}},\n\tauthor       = {{Jason D. Lee} and Maggioni, Mauro},\n\tyear         = 2011,\n\tjournal      = {International Conference on Sampling Theory and Applications (SAMPTA)}\n}\n@article{lee2012convergence,\n\ttitle        = {{Convergence Analysis of Inexact Proximal Newton-Type Methods}},\n\tauthor       = {{Jason D. Lee} and Sun, Yuekai and Saunders, Michael A},\n\tyear         = 2012,\n\tjournal      = {NIPS Workshop on Optimization in Machine Learning},\n\tpages        = {}\n}\n@article{lee2012proximal,\n\ttitle        = {{Proximal Newton-type Methods for Convex Optimization}},\n\tauthor       = {{Jason D. Lee} and Sun, Yuekai and Saunders, Michael},\n\tyear         = 2012,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {836--844}\n}\n@article{lee2013model,\n\ttitle        = {{On Model Selection Consistency of Penalized M-Estimators: a Geometric Theory}},\n\tauthor       = {{Jason D. 
Lee} and Sun, Yuekai and Taylor, Jonathan E.},\n\tyear         = 2013,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {342--350}\n}\n@inproceedings{lee2013path,\n\ttitle        = {Path finding methods for linear programming: solving linear programs in {O}(sqrt(rank)) iterations and faster algorithms for maximum flow},\n\tauthor       = {Yin Tat Lee and Aaron Sidford},\n\tyear         = 2014,\n\tmonth        = oct,\n\tbooktitle    = {2014 IEEE 55th Annual Symposium on Foundations of Computer Science},\n\tpages        = {424--433},\n\tdoi          = {10.1109/FOCS.2014.52},\n\tissn         = {0272-5428}\n}\n@inproceedings{lee2013pseudo,\n\ttitle        = {Pseudo-label: The simple and efficient semi-supervised learning method for deep neural networks},\n\tauthor       = {Dong-Hyun Lee},\n\tyear         = 2013,\n\tbooktitle    = {ICML Workshop on Challenges in Representation Learning}\n}\n@article{lee2013structure,\n\ttitle        = {{Structure Learning of Mixed Graphical Models}},\n\tauthor       = {{Jason D. Lee} and Hastie, Trevor},\n\tyear         = 2013,\n\tjournal      = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {388--396}\n}\n@article{lee2013using,\n\ttitle        = {{Using Multiple Samples to Learn Mixture Models}},\n\tauthor       = {{Jason D. Lee} and Gilad-Bachrach, Ran and Caruana, Rich},\n\tyear         = 2013,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {324--332}\n}\n@article{lee2014exact,\n\ttitle        = {{Exact Post Model Selection Inference for Marginal Screening}},\n\tauthor       = {{Jason D. Lee} and Taylor, Jonathan E.},\n\tyear         = 2014,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {1--9}\n}\n@article{lee2014learning,\n\ttitle        = {{Learning the Structure of Mixed Graphical Models}},\n\tauthor       = {{Jason D. 
Lee} and Hastie, Trevor J},\n\tyear         = 2014,\n\tjournal      = {Journal of Computational and Graphical Statistics},\n\tpublisher    = {Taylor \\& Francis},\n\tnumber       = {},\n\tpages        = {}\n}\n@article{lee2014multiway,\n\ttitle        = {Multiway spectral partitioning and higher-order cheeger inequalities},\n\tauthor       = {Lee, James R and Gharan, Shayan Oveis and Trevisan, Luca},\n\tyear         = 2014,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM New York, NY, USA},\n\tvolume       = 61,\n\tnumber       = 6,\n\tpages        = {1--30}\n}\n@article{lee2014proximal,\n\ttitle        = {{Proximal Newton-Type Methods for Minimizing Composite Functions}},\n\tauthor       = {{Jason D. Lee} and Sun, Yuekai and Saunders, Michael},\n\tyear         = 2014,\n\tjournal      = {SIAM Journal on Optimization}\n}\n@inproceedings{lee2014time,\n\ttitle        = {Context-dependent Semantic Parsing for Time Expressions},\n\tauthor       = {Kenton Lee and Yoav Artzi and Jesse Dodge and Luke Zettlemoyer},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{lee2015conversational,\n\ttitle        = {Conversational knowledge teaching agent that uses a knowledge base},\n\tauthor       = {Kyusong Lee and Paul Hongsuck Seo and Junhwi Choi and Sangjun Koo and Gary Geunbae Lee},\n\tyear         = 2015,\n\tjournal      = {16th Annual Meeting of the Special Interest Group on Discourse and Dialogue},\n\tpages        = {139--143}\n}\n@inproceedings{lee2015efficient,\n\ttitle        = {Efficient inverse maintenance and faster algorithms for linear programming},\n\tauthor       = {Lee, Yin Tat and Sidford, Aaron},\n\tyear         = 2015,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2015 IEEE 56th Annual Symposium on},\n\tpages        = {230--249},\n\torganization = {IEEE}\n}\n@article{lee2015modelJournal,\n\ttitle        = {{On Model Selection Consistency of Regularized 
M-Estimators}},\n\tauthor       = {{Jason D. Lee} and Sun, Yuekai and Taylor, Jonathan E.},\n\tyear         = 2015,\n\tjournal      = {Electronic Journal of Statistics}\n}\n@article{lee2015significance,\n\ttitle        = {{Evaluating the Statistical Significance of Biclusters}},\n\tauthor       = {{Jason D. Lee} and Sun, Yuekai and Taylor, Jonathan E.},\n\tyear         = 2015,\n\tjournal      = {Neural Information Processing Systems (NIPS)},\n\tpages        = {1--9}\n}\n@article{lee2016exact,\n\ttitle        = {{Exact Inference after Model Selection via the Lasso}},\n\tauthor       = {{Jason D. Lee} and Sun, Dennis L. and Sun, Yuekai and Taylor, Jonathan E.},\n\tyear         = 2016,\n\tjournal      = {Annals of Statistics}\n}\n@inproceedings{lee2016global,\n\ttitle        = {Global neural {CCG} parsing with optimality guarantees},\n\tauthor       = {Kenton Lee and Mike Lewis and Luke Zettlemoyer},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{lee2016gradient,\n\ttitle        = {Gradient descent converges to minimizers},\n\tauthor       = {Lee, Jason D and Simchowitz, Max and Jordan, Michael I and Recht, Benjamin},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.04915},\n\tvolume       = 1050,\n\tpages        = 16\n}\n@article{lee2017distributed,\n\ttitle        = {Distributed Stochastic Variance Reduced Gradient Methods},\n\tauthor       = {{Jason D. Lee} and Ma, Tengyu and Lin, Qihang and Yang, Tianbao},\n\tyear         = 2017,\n\tjournal      = {Journal of Machine Learning Research}\n}\n@article{lee2017minimax,\n\ttitle        = {Minimax Statistical Learning and Domain Adaptation with Wasserstein Distances},\n\tauthor       = {Jaeho Lee and Maxim Raginsky},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07815}\n}\n@article{lee2017one,\n\ttitle        = {{Communication-Efficient Distributed Sparse Regression}},\n\tauthor       = {{Jason D. 
Lee} and Liu, Qiang and Sun, Yuekai and Taylor, Jonathan E.},\n\tyear         = 2017,\n\tjournal      = {Journal of Machine Learning Research}\n}\n@article{lee2017rasor,\n\ttitle        = {Learning Recurrent Span Representations for Extractive Question Answering},\n\tauthor       = {Kenton Lee and Shimi Salant and Tom Kwiatkowski and Ankur Parikh and Dipanjan Das and Jonathan Berant},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{lee2018first,\n\ttitle        = {First-order Methods Almost Always Avoid Saddle Points},\n\tauthor       = {Lee, Jason D and Panageas, Ioannis and Piliouras, Georgios and Simchowitz, Max and Jordan, Michael I and Recht, Benjamin},\n\tyear         = 2018,\n\tjournal      = {Accepted at Math Programming}\n}\n@article{lee2018stochastic,\n\ttitle        = {Stochastic Subgradient Converges in Polynomial Time on Nonsmooth Functions},\n\tauthor       = {Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {Unpublished}\n}\n@inproceedings{lee2019autocomplete,\n\ttitle        = {Learning Autocomplete Systems as a Communication Game},\n\tauthor       = {Mina Lee and Tatsunori Hashimoto and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Emergent Communication Workshop at Neural Information Processing Systems (NeurIPS)}\n}\n@article{lee2019efficient,\n\ttitle        = {Efficient Exploration via State Marginal Matching},\n\tauthor       = {Lisa Lee and Benjamin Eysenbach and Emilio Parisotto and Eric Xing and Sergey Levine and Ruslan Salakhutdinov},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@article{lee2019ikea,\n\ttitle        = {{IKEA} Furniture Assembly Environment for Long-Horizon Complex Manipulation Tasks},\n\tauthor       = {Youngwoon Lee and Edward S. 
Hu and Zhengyu Yang and Alex Yin and Joseph J Lim},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.07246}\n}\n@inproceedings{lee2019latent,\n\ttitle        = {Latent Retrieval for Weakly Supervised Open Domain Question Answering},\n\tauthor       = {Kenton Lee and Ming-Wei Chang and Kristina Toutanova},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{lee2020addressing,\n\ttitle        = {Addressing Distribution Shift in Online Reinforcement Learning with Offline Datasets},\n\tauthor       = {Seunghyun Lee and Younggyo Seo and Kimin Lee and Pieter Abbeel and Jinwoo Shin},\n\tyear         = 2020,\n\tbooktitle    = {Offline Reinforcement Learning Workshop @ NeurIPS}\n}\n@article{lee2020bias,\n\ttitle        = {Bias no more: high-probability data-dependent regret bounds for adversarial bandits and MDPs},\n\tauthor       = {Lee, Chung-Wei and Luo, Haipeng and Wei, Chen-Yu and Zhang, Mengxiao},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.08040}\n}\n@article{lee2020biobert,\n\ttitle        = {BioBERT: a pre-trained biomedical language representation model for biomedical text mining},\n\tauthor       = {Jinhyuk Lee and Wonjin Yoon and Sungdong Kim and Donghyeon Kim and Sunkyu Kim and Chan Ho So and Jaewoo Kang},\n\tyear         = 2020,\n\tjournal      = {Bioinformatics},\n\tvolume       = 36,\n\tnumber       = 4,\n\tpages        = {1234--1240}\n}\n@article{lee2020detect,\n\ttitle        = {Detect, Reject, Correct: Crossmodal Compensation of Corrupted Sensors},\n\tauthor       = {Michelle A. 
Lee and Matthew Tan and Yuke Zhu and Jeannette Bohg},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.00201}\n}\n@article{lee2020generalized,\n\ttitle        = {Generalized Leverage Score Sampling for Neural Networks},\n\tauthor       = {Lee, Jason D and Shen, Ruoqi and Song, Zhao and Wang, Mengdi and others},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{lee2020mixout,\n\ttitle        = {Mixout: Effective regularization to finetune large-scale pretrained language models},\n\tauthor       = {Cheolhyoung Lee and Kyunghyun Cho and Wanmo Kang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{lee2020predicting,\n\ttitle        = {Predicting what you already know helps: Provable self-supervised learning},\n\tauthor       = {Lee, Jason D and Lei, Qi and Saunshi, Nikunj and Zhuo, Jiacheng},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.01064}\n}\n@article{lee2021offline,\n\ttitle        = {Offline-to-Online Reinforcement Learning via Balanced Replay and Pessimistic Q-Ensemble},\n\tauthor       = {Lee, Seunghyun and Seo, Younggyo and Lee, Kimin and Abbeel, Pieter and Shin, Jinwoo},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2107.00591}\n}\n@article{leek2010tackling,\n\ttitle        = {Tackling the widespread and critical impact of batch effects in high-throughput data},\n\tauthor       = {Jeffrey T. Leek and Robert B. Scharpf and H{\\'e}ctor Corrada Bravo and David Simcha and Benjamin Langmead and W. Evan Johnson and Donald Geman and Keith Baggerly and Rafael A. Irizarry},\n\tyear         = 2010,\n\tjournal      = {Nature Reviews Genetics},\n\tvolume       = 11,\n\tnumber       = 10\n}\n@article{leemulticlass,\n\ttitle        = {{Multiclass Clustering using a Semidefinite Relaxation}},\n\tauthor       = {{Jason D. 
Lee}},\n\tjournal      = {Tech Report}\n}\n@article{LeeS15,\n\ttitle        = {Efficient Inverse Maintenance and Faster Algorithms for Linear Programming},\n\tauthor       = {Yin Tat Lee and Aaron Sidford},\n\tyear         = 2015,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1503.01752},\n\turl          = {http://arxiv.org/abs/1503.01752},\n\tbdsk-url-1   = {http://arxiv.org/abs/1503.01752},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/LeeS15},\n\ttimestamp    = {Wed, 07 Jun 2017 14:42:08 +0200}\n}\n@inproceedings{LeeSidford2013,\n\ttitle        = {Efficient accelerated coordinate descent methods and faster algorithms for solving linear systems},\n\tauthor       = {Lee, Yin Tat and Sidford, Aaron},\n\tyear         = 2013,\n\tbooktitle    = {FOCS},\n\tpages        = {147--156},\n\torganization = {IEEE}\n}\n@inproceedings{LeeSun2015-bss,\n\ttitle        = {Constructing Linear-Sized Spectral Sparsification in Almost-Linear Time},\n\tauthor       = {Lee, Yin Tat and Sun, He},\n\tyear         = 2015,\n\tbooktitle    = {FOCS},\n\tpages        = {250--269},\n\torganization = {IEEE}\n}\n@inproceedings{lehnert1977conceptual,\n\ttitle        = {A conceptual theory of question answering},\n\tauthor       = {Wendy G Lehnert},\n\tyear         = 1977,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@phdthesis{lehnert1977process,\n\ttitle        = {The Process of Question Answering},\n\tauthor       = {Wendy Lehnert},\n\tyear         = 1977,\n\tschool       = {Yale University}\n}\n@article{lehr2017playing,\n\ttitle        = {Playing with the data: what legal scholars should learn about machine learning},\n\tauthor       = {David Lehr and Paul Ohm},\n\tyear         = 2017,\n\tjournal      = {UCDL Rev.},\n\tvolume       = 51,\n\tpages        = {653--717}\n}\n@inproceedings{lei2013natural,\n\ttitle        = {From Natural Language Specifications 
to Program Input Parsers},\n\tauthor       = {Tao Lei and Fan Long and Regina Barzilay and Martin Rinard},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{lei2015consistency,\n\ttitle        = {Consistency of spectral clustering in stochastic block models},\n\tauthor       = {Lei, Jing and Rinaldo, Alessandro and others},\n\tyear         = 2015,\n\tjournal      = {Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 43,\n\tnumber       = 1,\n\tpages        = {215--237}\n}\n@article{lei2016distribution,\n\ttitle        = {Distribution-Free Predictive Inference For Regression},\n\tauthor       = {Jing Lei and Max G'Sell and Alessandro Rinaldo and Ryan J. Tibshirani and Larry Wasserman},\n\tyear         = 2016,\n\tjournal      = {Journal of the American Statistical Association},\n\tvolume       = 113,\n\tpages        = {1094--1111}\n}\n@inproceedings{lei2016rationalizing,\n\ttitle        = {Rationalizing Neural Predictions},\n\tauthor       = {Tao Lei and Regina Barzilay and Tommi Jaakkola},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lei2016semisupervised,\n\ttitle        = {Semi-supervised Question Retrieval with Gated Convolutions},\n\tauthor       = {Tao Lei and Hrishikesh Joshi and Regina Barzilay and Tommi Jaakkola and Kateryna Tymoshenko and Alessandro Moschitti and Lluis Marquez},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {1279--1289}\n}\n@article{lei2019sgd,\n\ttitle        = {{SGD} Learns One-Layer Networks in WGANs},\n\tauthor       = {Lei, Qi and Lee, Jason D and Dimakis, Alexandros G and Daskalakis, Constantinos},\n\tyear         = 2020,\n\tjournal      = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{leibfried2019unified,\n\ttitle        = {A unified bellman optimality 
principle combining reward maximization and empowerment},\n\tauthor       = {Felix Leibfried and Sergio Pascual-Diaz and Jordi Grau-Moya},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {7869--7880}\n}\n@article{leighton1999multicommodity,\n\ttitle        = {Multicommodity max-flow min-cut theorems and their use in designing approximation algorithms},\n\tauthor       = {Leighton, Tom and Rao, Satish},\n\tyear         = 1999,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM New York, NY, USA},\n\tvolume       = 46,\n\tnumber       = 6,\n\tpages        = {787--832}\n}\n@article{LeightonRao99,\n\ttitle        = {Multicommodity max-flow min-cut theorems and their use in designing approximation algorithms},\n\tauthor       = {Frank Thomson Leighton and Satish Rao},\n\tyear         = 1999,\n\tjournal      = {Journal of the ACM},\n\tvolume       = 46,\n\tnumber       = 6,\n\tpages        = {787--832},\n\tee           = {http://doi.acm.org/10.1145/331524.331526},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{leike2017gridworlds,\n\ttitle        = {{AI} Safety Gridworlds},\n\tauthor       = {Jan Leike and Miljan Martic and Victoria Krakovna and Pedro A. Ortega and Tom Everitt and Andrew Lefrancq and Laurent Orseau and S. Legg},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.09883}\n}\n@article{LeL10,\n\ttitle        = {Randomized methods for linear constraints: convergence rates and conditioning},\n\tauthor       = {Leventhal, Dennis and Lewis, Adrian S.},\n\tyear         = 2010,\n\tjournal      = {Math. Oper. 
Res.},\n\tpublisher    = {INFORMS},\n\tvolume       = 35,\n\tnumber       = 3,\n\tpages        = {641--654},\n\tdoi          = {10.1287/moor.1100.0456},\n\tissn         = {0364-765X},\n\turl          = {http://dx.doi.org/10.1287/moor.1100.0456},\n\tfjournal     = {Mathematics of Operations Research},\n\tmrclass      = {65F10 (15A39 65K05 90C25)},\n\tmrnumber     = 2724068,\n\tmrreviewer   = {Raimundo J. B. de Sampaio}\n}\n@article{LeM08,\n\ttitle        = {Alternating projections on manifolds},\n\tauthor       = {Lewis, Adrian S. and Malick, J\\'er\\^ome},\n\tyear         = 2008,\n\tjournal      = {Math. Oper. Res.},\n\tvolume       = 33,\n\tnumber       = 1,\n\tpages        = {216--234},\n\tdoi          = {10.1287/moor.1070.0291},\n\tissn         = {0364-765X},\n\turl          = {http://dx.doi.org/10.1287/moor.1070.0291},\n\tfjournal     = {Mathematics of Operations Research},\n\tmrclass      = {90C30 (49J53 65K10)},\n\tmrnumber     = 2393548\n}\n@article{lemaignan2012grounding,\n\ttitle        = {Grounding the interaction: Anchoring situated discourse in everyday human-robot interaction},\n\tauthor       = {S. Lemaignan and R. Ros and E. A. Sisbot and R. Alami and M. Beetz},\n\tyear         = 2012,\n\tjournal      = {International Journal of Social Robotics (IJSR)},\n\tvolume       = 4,\n\tnumber       = 2,\n\tpages        = {181--199}\n}\n@article{lenat1985cyc,\n\ttitle        = {CYC: Using Common Sense Knowledge to Overcome Brittleness and Knowledge Acquisition Bottlenecks},\n\tauthor       = {Doug Lenat and Mayank Prakash and Mary Shepherd},\n\tyear         = 1985,\n\tjournal      = {{AI} Magazine},\n\tvolume       = 6,\n\tnumber       = 4\n}\n@article{lenet,\n\ttitle        = {Gradient-based learning applied to document recognition},\n\tauthor       = {Y. Lecun and L. Bottou and Y. Bengio and P. 
Haffner},\n\tyear         = 1998,\n\tmonth        = nov,\n\tjournal      = {Proceedings of the IEEE},\n\tvolume       = 86,\n\tnumber       = 11,\n\tpages        = {2278--2324},\n\tdoi          = {10.1109/5.726791},\n\tissn         = {0018-9219},\n\tkeywords     = {backpropagation;convolution;multilayer perceptrons;optical character recognition;2D shape variability;GTN;back-propagation;cheque reading;complex decision surface synthesis;convolutional neural network character recognizers;document recognition;document recognition systems;field extraction;gradient based learning technique;gradient-based learning;graph transformer networks;handwritten character recognition;handwritten digit recognition task;high-dimensional patterns;language modeling;multilayer neural networks;multimodule systems;performance measure minimization;segmentation recognition;Character recognition;Feature extraction;Hidden Markov models;Machine learning;Multi-layer neural network;Neural networks;Optical character recognition software;Optical computing;Pattern recognition;Principal component analysis}\n}\n@article{lennart1999system,\n\ttitle        = {System identification: theory for the user},\n\tauthor       = {Lennart, Ljung},\n\tyear         = 1999,\n\tjournal      = {PTR Prentice Hall, Upper Saddle River, NJ},\n\tpages        = {1--14}\n}\n@inproceedings{lenz2013deep,\n\ttitle        = {Deep Learning for Detecting Robotic Grasps},\n\tauthor       = {I. Lenz and H. Lee and A. 
Saxena},\n\tyear         = 2013,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{lenz2015deepmpc,\n\ttitle        = {DeepMPC: Learning Deep Latent Features for Model Predictive Control},\n\tauthor       = {Ian Lenz and Ross Knepper and Ashutosh Saxena},\n\tyear         = 2015,\n\tbooktitle    = {Robotics Science and Systems (RSS)}\n}\n@article{lepski1997optimal,\n\ttitle        = {Optimal pointwise adaptive methods in nonparametric estimation},\n\tauthor       = {Lepski, Oleg V and Spokoiny, VG},\n\tyear         = 1997,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {JSTOR},\n\tpages        = {2512--2546}\n}\n@article{lepskii1991problem,\n\ttitle        = {On a problem of adaptive estimation in Gaussian white noise},\n\tauthor       = {Lepskii, OV},\n\tyear         = 1991,\n\tjournal      = {Theory of Probability \\& Its Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 35,\n\tnumber       = 3,\n\tpages        = {454--466}\n}\n@article{lepskii1992asymptotically,\n\ttitle        = {Asymptotically minimax adaptive estimation. I: Upper bounds. Optimally adaptive estimates},\n\tauthor       = {Lepskii, OV},\n\tyear         = 1992,\n\tjournal      = {Theory of Probability \\& Its Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 36,\n\tnumber       = 4,\n\tpages        = {682--697}\n}\n@article{lepskii1993asymptotically,\n\ttitle        = {Asymptotically minimax adaptive estimation. II. Schemes without optimal adaptation: Adaptive estimators},\n\tauthor       = {Lepskii, OV},\n\tyear         = 1993,\n\tjournal      = {Theory of Probability \\& Its Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 37,\n\tnumber       = 3,\n\tpages        = {433--448}\n}\n@inproceedings{leroux2012sag,\n\ttitle        = {A Stochastic Gradient Method with an Exponential Convergence Rate for Finite Training Sets},\n\tauthor       = {N. Le Roux and M. Schmidt and F. 
Bach},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{leshno1993multilayer,\n\ttitle        = {Multilayer feedforward networks with a nonpolynomial activation function can approximate any function},\n\tauthor       = {Leshno, Moshe and Lin, Vladimir Ya and Pinkus, Allan and Schocken, Shimon},\n\tyear         = 1993,\n\tjournal      = {Neural networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 6,\n\tnumber       = 6,\n\tpages        = {861--867}\n}\n@article{leskovec,\n\ttitle        = {Latent Multi-group Membership Graph Model},\n\tauthor       = {Kim, Myunghwan and Leskovec, Jure},\n\tyear         = 2012,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1205.4546}\n}\n@inproceedings{leskovec2007cost,\n\ttitle        = {Cost-effective outbreak detection in networks},\n\tauthor       = {\n\t\tJure Leskovec and Andreas Krause and Carlos Guestrin and Christos\n\n\t\tFaloutsos and Jeanne VanBriesen and Natalie Glance\n\t},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 13th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tpublisher    = {ACM},\n\taddress      = {San Jose, California, USA},\n\tpages        = {420--429},\n\tnote         = {http://doi.acm.org/10.1145/1281192.1281239},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{lessard2016analysis,\n\ttitle        = {Analysis and design of optimization algorithms via integral quadratic constraints},\n\tauthor       = {Lessard, Laurent and Recht, Benjamin and Packard, Andrew},\n\tyear         = 2016,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 26,\n\tnumber       = 1,\n\tpages        = {57--95}\n}\n@article{LessardRP14,\n\ttitle        = {Analysis and Design of Optimization Algorithms via Integral Quadratic Constraints},\n\tauthor       = {Laurent Lessard and Benjamin Recht and Andrew Packard},\n\tyear         
= 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1408.3595}\n}\n@article{lester2021power,\n\ttitle        = {The Power of Scale for Parameter-Efficient Prompt Tuning},\n\tauthor       = {Lester, Brian and Al-Rfou, Rami and Constant, Noah},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.08691}\n}\n@inproceedings{leung96repeat,\n\ttitle        = {Detecting, Localizing and Grouping Repeated Scene Elements from an Image},\n\tauthor       = {Thomas Leung and Jitendra Malik},\n\tyear         = 1996,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@article{leurgans1993decomposition,\n\ttitle        = {A decomposition for three-way arrays},\n\tauthor       = {Leurgans, SE and Ross, RT and Abel, RB},\n\tyear         = 1993,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 14,\n\tnumber       = 4,\n\tpages        = {1064--1083}\n}\n@inproceedings{lev04logic,\n\ttitle        = {Solving Logic Puzzles: From Robust Processing to Precise Semantics},\n\tauthor       = {Iddo Lev and Bill MacCartney and Christopher D. Manning and Roger Levy},\n\tyear         = 2004,\n\tbooktitle    = {ACL Workshop on Text Meaning and Interpretation}\n}\n@inproceedings{levesque2013best,\n\ttitle        = {On our best behaviour},\n\tauthor       = {Hector J. Levesque},\n\tyear         = 2013,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{levi2019evaluating,\n\ttitle        = {Evaluating and calibrating uncertainty prediction in regression tasks},\n\tauthor       = {Levi, Dan and Gispan, Liran and Giladi, Niv and Fetaya, Ethan},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.11659}\n}\n@book{levin2008markov,\n\ttitle        = {{M}arkov Chains and Mixing Times},\n\tauthor       = {D. Levin and Y. Peres and E. 
Wilmer},\n\tyear         = 2008,\n\tpublisher    = {American Mathematical Society}\n}\n@article{levine2012modeling,\n\ttitle        = {Modeling the rate of senescence: can estimated biological age predict mortality more accurately than chronological age?},\n\tauthor       = {Morgan E Levine},\n\tyear         = 2012,\n\tjournal      = {Journals of Gerontology Series A: Biomedical Sciences and Medical Sciences},\n\tvolume       = 68,\n\tnumber       = 6,\n\tpages        = {667--674}\n}\n@inproceedings{levine2013guided,\n\ttitle        = {Guided policy search},\n\tauthor       = {Levine, Sergey and Koltun, Vladlen},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of The 30th International Conference on Machine Learning},\n\tpages        = {1--9},\n\tdate-added   = {2016-04-04 17:32:28 +0000},\n\tdate-modified = {2016-04-04 17:32:28 +0000}\n}\n@phdthesis{levine2014motor,\n\ttitle        = {Motor Skill Learning with Local Trajectory Methods},\n\tauthor       = {Sergey Levine},\n\tyear         = 2014,\n\tschool       = {Stanford University}\n}\n@article{levine2016end,\n\ttitle        = {End-to-end training of deep visuomotor policies},\n\tauthor       = {Levine, Sergey and Finn, Chelsea and Darrell, Trevor and Abbeel, Pieter},\n\tyear         = 2016,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {1334--1373}\n}\n@article{levine2020pmi,\n\ttitle        = {PMI-Masking: Principled masking of correlated spans},\n\tauthor       = {Levine, Yoav and Lenz, Barak and Lieber, Opher and Abend, Omri and Leyton-Brown, Kevin and Tennenholtz, Moshe and Shoham, Yoav},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.01825}\n}\n@article{levit85second,\n\ttitle        = {Second-order asymptotic optimality and positive solutions of the Schrödinger equation},\n\tauthor       = {B. Ya. 
Levit},\n\tyear         = 1985,\n\tjournal      = {Theory of Probability and its Applications},\n\tvolume       = 30,\n\tpages        = {333--363}\n}\n@inproceedings{levy2007speakers,\n\ttitle        = {Speakers optimize information density through syntactic reduction},\n\tauthor       = {Roger Levy and T. Florian Jaeger},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {849--856}\n}\n@incollection{levy2013memory,\n\ttitle        = {Memory and Surprisal in Human Sentence Comprehension},\n\tauthor       = {Roger Levy},\n\tyear         = 2013,\n\tbooktitle    = {Sentence Processing}\n}\n@article{levy2013surprisal,\n\ttitle        = {Surprisal, the {PDC}, and the Primary Locus of Processing Difficulty in Relative Clauses},\n\tauthor       = {Roger Levy and Edward Gibson},\n\tyear         = 2013,\n\tjournal      = {Frontiers in Psychology},\n\tvolume       = 4\n}\n@inproceedings{levy2014linguistic,\n\ttitle        = {Linguistic Regularities in Sparse and Explicit Word Representations},\n\tauthor       = {Levy, Omer and Goldberg, Yoav},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the Eighteenth Conference on Computational Natural Language Learning}\n}\n@inproceedings{levy2014neural,\n\ttitle        = {Neural word embedding as implicit matrix factorization},\n\tauthor       = {Levy, Omer and Goldberg, Yoav},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{levy2016power,\n\ttitle        = {The Power of Normalization: Faster Evasion of Saddle Points},\n\tauthor       = {Levy, Kfir Y},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.04831}\n}\n@inproceedings{levy2017zero,\n\ttitle        = {Zero-Shot Relation Extraction via Reading Comprehension},\n\tauthor       = {Omer Levy and Minjoon Seo and Eunsol Choi and Luke Zettlemoyer},\n\tyear         = 2017,\n\tbooktitle    = {Computational Natural Language 
Learning (CoNLL)}\n}\n@inproceedings{levy2018communicative,\n\ttitle        = {Communicative Efficiency, Uniform Information Density, and the Rational Speech Act Theory},\n\tauthor       = {Roger Levy},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 40th Annual Meeting of the Cognitive Science Society},\n\tpages        = {684--689}\n}\n@article{levy2020large,\n\ttitle        = {Large-Scale Methods for Distributionally Robust Optimization},\n\tauthor       = {Daniel Levy and Yair Carmon and John C Duchi and Aaron Sidford},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.05893}\n}\n@article{lewicki2000learning,\n\ttitle        = {Learning overcomplete representations},\n\tauthor       = {M. S. Lewicki and T. J. Sejnowski},\n\tyear         = 2000,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 12,\n\tnumber       = 2,\n\tpages        = {337--365}\n}\n@inproceedings{lewis1992feature,\n\ttitle        = {Feature selection and feature extraction for text categorization},\n\tauthor       = {David D Lewis},\n\tyear         = 1992,\n\tbooktitle    = {Speech and Natural Language: Proceedings of a Workshop Held at Harriman, New York, February 23-26, 1992}\n}\n@inproceedings{lewis1994heterogeneous,\n\ttitle        = {Heterogeneous uncertainty sampling for supervised learning},\n\tauthor       = {David D Lewis and Jason Catlett},\n\tyear         = 1994,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {148--156}\n}\n@inproceedings{lewis1994sequential,\n\ttitle        = {A sequential algorithm for training text classifiers},\n\tauthor       = {David D Lewis and William A Gale},\n\tyear         = 1994,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@inproceedings{lewis1995evaluating,\n\ttitle        = {Evaluating and Optimizing Autonomous Text Classification Systems},\n\tauthor       = {David D. 
Lewis},\n\tyear         = 1995,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@article{lewis2004rcv1,\n\ttitle        = {RCV1: A New Benchmark Collection for Text Categorization Research},\n\tauthor       = {David D. Lewis and Yiming Yang and Tony G. Rose and Fan Li},\n\tyear         = 2004,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 5\n}\n@book{lewis2008convention,\n\ttitle        = {Convention: A philosophical study},\n\tauthor       = {David Lewis},\n\tyear         = 2008,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@article{lewis2013combining,\n\ttitle        = {Combining distributional and logical semantics},\n\tauthor       = {Mike Lewis and Mark Steedman},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1\n}\n@inproceedings{lewis2014ccg,\n\ttitle        = {A* {CCG} Parsing with a Supertag-factored Model},\n\tauthor       = {Mike Lewis and Mark Steedman},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lewis2017deal,\n\ttitle        = {Deal or No Deal? End-to-End Learning for Negotiation Dialogues},\n\tauthor       = {Mike Lewis and Denis Yarats and Yann N. Dauphin and Devi Parikh and Dhruv Batra},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lewis2020bart,\n\ttitle        = {BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},\n\tauthor       = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Ves Stoyanov and Luke Zettlemoyer},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{li1997relativeiii,\n\ttitle        = {Relative perturbation theory. III. 
More bounds on eigenvalue variation},\n\tauthor       = {Li, Ren-Cang},\n\tyear         = 1997,\n\tjournal      = {Linear algebra and its applications},\n\tpublisher    = {Elsevier},\n\tvolume       = 266,\n\tpages        = {337--345}\n}\n@article{li1998relative,\n\ttitle        = {Relative perturbation theory: I. Eigenvalue and singular value variations},\n\tauthor       = {Li, Ren-Cang},\n\tyear         = 1998,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 19,\n\tnumber       = 4,\n\tpages        = {956--982}\n}\n@article{li1998relativeII,\n\ttitle        = {Relative perturbation theory: II. Eigenspace and singular subspace variations},\n\tauthor       = {Li, Ren-Cang},\n\tyear         = 1998,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = {471--492}\n}\n@inproceedings{li2002motion,\n\ttitle        = {\n\t\tMotion texture: a two-level statistical model for character motion\n\n\t\tsynthesis\n\t},\n\tauthor       = {Li, Yan and Wang, Tianshu and Shum, Heung-Yeung},\n\tyear         = 2002,\n\tbooktitle    = {\n\t\tProceedings of the 29th annual conference on Computer graphics and\n\n\t\tinteractive techniques\n\t},\n\tlocation     = {San Antonio, Texas},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGGRAPH '02},\n\tpages        = {465--472},\n\tdoi          = {http://doi.acm.org/10.1145/566570.566604},\n\tisbn         = {1-58113-521-1},\n\tacmid        = 566604,\n\tkeywords     = {\n\t\tlinear dynamic systems, motion editing, motion synthesis, motion texture,\n\n\t\ttexture synthesis\n\t},\n\tnumpages     = 8\n}\n@inproceedings{li2006towards,\n\ttitle        = {Towards a unified theory of state abstraction for MDPs},\n\tauthor       = {Li, Lihong and Walsh, Thomas J and Littman, Michael L},\n\tyear         = 2006,\n\tbooktitle    = 
{ISAIM}\n}\n@article{li2007estimating,\n\ttitle        = {Estimating crop yield from multi-temporal satellite data using multivariate regression and neural network techniques},\n\tauthor       = {Ainong Li and Shunlin Liang and Angsheng Wang and Jun Qin},\n\tyear         = 2007,\n\tjournal      = {Photogrammetric Engineering \\& Remote Sensing},\n\tvolume       = 73,\n\tnumber       = 10,\n\tpages        = {1149--1157}\n}\n@inproceedings{li2008laziness,\n\ttitle        = {Laziness is a virtue: Motion stitching using effort minimization},\n\tauthor       = {Lei Li and James McCann and Christos Faloutsos and Nancy Pollard},\n\tyear         = 2008,\n\tbooktitle    = {Short Papers Proceedings of EUROGRAPHICS},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{li2009dynammo,\n\ttitle        = {\n\t\tDynaMMo: Mining and Summarization of Coevolving Sequences with Missing\n\n\t\tValues\n\t},\n\tauthor       = {Lei Li and James McCann and Nancy Pollard and Christos Faloutsos},\n\tyear         = 2009,\n\tbooktitle    = {\n\t\tKDD '09: Proceeding of the 15th ACM SIGKDD international conference\n\n\t\ton Knowledge discovery and data mining\n\t},\n\tlocation     = {Paris, France},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tisbn         = {978-1-60558-193-4},\n\tfile         = {:http\\://www.cs.cmu.edu/~leili/pubs/li-kdd09.pdf:PDF},\n\towner        = {leili},\n\ttimestamp    = {2009.05.01}\n}\n@article{li2010parsimonious,\n\ttitle        = {Parsimonious linear fingerprinting for time series},\n\tauthor       = {Li, Lei and Prakash, B. Aditya and Faloutsos, Christos},\n\tyear         = 2010,\n\tmonth        = sep,\n\tjournal      = {Proc. 
VLDB Endow.},\n\tpublisher    = {VLDB Endowment},\n\tvolume       = 3,\n\tpages        = {385--396},\n\tissn         = {2150-8097},\n\tacmid        = 1920893,\n\tissue        = {1-2},\n\tissue_date   = {September 2010},\n\tnumpages     = 12\n}\n@article{li2011knows,\n\ttitle        = {Knows what it knows: a framework for self-aware learning},\n\tauthor       = {Li, Lihong and Littman, Michael L and Walsh, Thomas J and Strehl, Alexander L},\n\tyear         = 2011,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 82,\n\tnumber       = 3,\n\tpages        = {399--443}\n}\n@inproceedings{li2011semi,\n\ttitle        = {Semi-supervised learning for imbalanced sentiment classification},\n\tauthor       = {Shoushan Li and Zhongqing Wang and Guodong Zhou and Sophia Yat Mei Lee},\n\tyear         = 2011,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{li2012twiner,\n\ttitle        = {Twiner: named entity recognition in targeted twitter stream},\n\tauthor       = {Chenliang Li and Jianshu Weng and Qi He and Yuxia Yao and Anwitaman Datta and Aixin Sun and Bu-Sung Lee},\n\tyear         = 2012,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)},\n\tpages        = {721--730}\n}\n@inproceedings{li2013joint,\n\ttitle        = {Joint event extraction via structured prediction with global features},\n\tauthor       = {Qi Li and Heng Ji and Liang Huang},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{li2014mean,\n\ttitle        = {Mean-Field Networks},\n\tauthor       = {Yujia Li and Richard Zemel},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1410.5884}\n}\n@article{li2015convergence,\n\ttitle        = {Convergence of the block Lanczos method for eigenvalue clusters},\n\tauthor       = {Li, Ren-Cang and Zhang, Lei-Hong},\n\tyear         = 2015,\n\tjournal      = {Numerische 
Mathematik},\n\tpublisher    = {Springer},\n\tvolume       = 131,\n\tnumber       = 1,\n\tpages        = {83--113}\n}\n@inproceedings{li2015gmmn,\n\ttitle        = {Generative Moment Matching Networks},\n\tauthor       = {Yujia Li and Kevin Swersky and Richard Zemel},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{li2015towards,\n\ttitle        = {Towards making unlabeled data never hurt},\n\tauthor       = {Yu-Feng Li and Zhi-Hua Zhou},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tvolume       = 37,\n\tnumber       = 1,\n\tpages        = {175--188}\n}\n@article{li2016adversarial,\n\ttitle        = {Adversarial examples detection in deep networks with convolutional filter statistics},\n\tauthor       = {Xin Li and Fuxin Li},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.07767}\n}\n@inproceedings{li2016data,\n\ttitle        = {Data Poisoning Attacks on Factorization-Based Collaborative Filtering},\n\tauthor       = {Bo Li and Yining Wang and Aarti Singh and Yevgeniy Vorobeychik},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{li2016diversity,\n\ttitle        = {A Diversity-Promoting Objective Function for Neural Conversation Models},\n\tauthor       = {Jiwei Li and Michel Galley and Chris Brockett and Jianfeng Gao and William B. 
Dolan},\n\tyear         = 2016,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {110--119}\n}\n@inproceedings{li2016gated,\n\ttitle        = {Gated graph sequence neural networks},\n\tauthor       = {Yujia Li and Daniel Tarlow and Marc Brockschmidt and Richard Zemel},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{li2016learning,\n\ttitle        = {Learning Through Dialogue Interactions},\n\tauthor       = {Jiwei Li and Alexander H Miller and Sumit Chopra and Marc'Aurelio Ranzato and Jason Weston},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.04936}\n}\n@inproceedings{li2016persona,\n\ttitle        = {A Persona-Based Neural Conversation Model},\n\tauthor       = {Jiwei Li and Michel Galley and Chris Brockett and Jianfeng Gao and Bill Dolan},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{li2016recovery,\n\ttitle        = {Recovery guarantee of weighted low-rank approximation via alternating minimization},\n\tauthor       = {Li, Yuanzhi and Liang, Yingyu and Risteski, Andrej},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.02262}\n}\n@inproceedings{li2016rl,\n\ttitle        = {Deep Reinforcement Learning for Dialogue Generation},\n\tauthor       = {Jiwei Li and Will Monroe and Alan Ritter and Daniel Jurafsky and Michel Galley and Jianfeng Gao},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{li2016symmetry,\n\ttitle        = {Symmetry, Saddle Points, and Global Geometry of Nonconvex Matrix Factorization},\n\tauthor       = {Li, Xingguo and Wang, Zhaoran and Lu, Junwei and Arora, Raman and Haupt, Jarvis and Liu, Han and Zhao, Tuo},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 
arXiv:1612.09296}\n}\n@article{li2016understanding,\n\ttitle        = {Understanding Neural Networks through Representation Erasure},\n\tauthor       = {Jiwei Li and Will Monroe and Dan Jurafsky},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.08220}\n}\n@article{li2016user,\n\ttitle        = {A User Simulator for Task-Completion Dialogues},\n\tauthor       = {Xiujun Li and Zachary C. Lipton and Bhuwan Dhingra and Lihong Li and Jianfeng Gao and Yun-Nung Chen},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{li2017adversarial,\n\ttitle        = {Adversarial Learning for Neural Dialogue Generation},\n\tauthor       = {Jiwei Li and Will Monroe and Tianlin Shi and Alan Ritter and Dan Jurafsky},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{li2017algorithmic,\n\ttitle        = {Algorithmic regularization in over-parameterized matrix sensing and neural networks with quadratic activations},\n\tauthor       = {Li, Yuanzhi and Ma, Tengyu and Zhang, Hongyang},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.09203},\n\tbooktitle    = {Conference On Learning Theory},\n\tpages        = {2--47},\n\torganization = {PMLR}\n}\n@article{li2017convergence,\n\ttitle        = {Convergence Analysis of Two-layer Neural Networks with ReLU Activation},\n\tauthor       = {Li, Yuanzhi and Yuan, Yang},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.09886}\n}\n@inproceedings{li2017deeper,\n\ttitle        = {Deeper, broader and artier domain generalization},\n\tauthor       = {Da Li and Yongxin Yang and Yi-Zhe Song and Timothy M Hospedales},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the IEEE international conference on computer vision},\n\tpages        = {5542--5550}\n}\n@inproceedings{li2017demystifying,\n\ttitle        = {Demystifying Neural Style Transfer},\n\tauthor       = {Yanghao Li and Naiyan Wang and Jiaying Liu and 
Xiaodi Hou},\n\tyear         = 2017,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{li2017dialogue,\n\ttitle        = {Dialogue learning with human-in-the-loop},\n\tauthor       = {Jiwei Li and Alexander H Miller and Sumit Chopra and Marc'Aurelio Ranzato and Jason Weston},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{li2017learning,\n\ttitle        = {Learning through dialogue interactions by asking questions},\n\tauthor       = {Jiwei Li and Alexander H Miller and Sumit Chopra and Marc'Aurelio Ranzato and Jason Weston},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{li2017provably,\n\ttitle        = {Provably optimal algorithms for generalized linear contextual bandits},\n\tauthor       = {Li, Lihong and Lu, Yu and Zhou, Dengyong},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2071--2080},\n\torganization = {PMLR}\n}\n@inproceedings{li2017reinforcement,\n\ttitle        = {Reinforcement learning with temporal logic rewards},\n\tauthor       = {Li, Xiao and Vasile, Cristian-Ioan and Belta, Calin},\n\tyear         = 2017,\n\tbooktitle    = {2017 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},\n\tpages        = {3834--3839},\n\torganization = {IEEE}\n}\n@inproceedings{li2017robust,\n\ttitle        = {Robust and Proper Learning for Mixtures of {G}aussians via Systems of Polynomial Inequalities},\n\tauthor       = {Jerry Li and Ludwig Schmidt},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {1302--1382}\n}\n@article{li2017sparse,\n\ttitle        = {Robust Sparse Estimation Tasks in High Dimensions},\n\tauthor       = {Jerry Li},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@misc{li2017webvision,\n\ttitle    
    = {WebVision Database: Visual Learning and Understanding from Web Data},\n\tauthor       = {Wen Li and Limin Wang and Wei Li and Eirikur Agustsson and Luc Van Gool},\n\tyear         = 2017,\n\teprint       = {1708.02862},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.CV}\n}\n@inproceedings{li2018deep,\n\ttitle        = {Deep domain generalization via conditional invariant adversarial networks},\n\tauthor       = {Ya Li and Xinmei Tian and Mingming Gong and Yajing Liu and Tongliang Liu and Kun Zhang and Dacheng Tao},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {624--639}\n}\n@inproceedings{li2018domain,\n\ttitle        = {Domain generalization with adversarial feature learning},\n\tauthor       = {Li, Haoliang and Pan, Sinno Jialin and Wang, Shiqi and Kot, Alex C},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},\n\tpages        = {5400--5409}\n}\n@article{li2018estimation,\n\ttitle        = {Estimation of {Markov} chain via rank-constrained likelihood},\n\tauthor       = {Li, Xudong and Wang, Mengdi and Zhang, Anru},\n\tyear         = 2018,\n\tjournal      = {Proceedings of the 35th international conference on Machine learning}\n}\n@inproceedings{li2018learning,\n\ttitle        = {Learning overparameterized neural networks via stochastic gradient descent on structured data},\n\tauthor       = {Li, Yuanzhi and Liang, Yingyu},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {8157--8166},\n\turl          = {https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16067},\n\tcdate        = 1514764800000\n}\n@article{li2018second,\n\ttitle        = {Second-Order Adversarial Attack and Certifiable Robustness},\n\tauthor       = {Bai Li and Changyou Chen and Wenlin Wang and Lawrence Carin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1809.03113}\n}\n@inproceedings{li2018style,\n\ttitle        = {Delete, Retrieve, Generate: A Simple Approach to Sentiment and Style Transfer},\n\tauthor       = {Juncen Li and Robin Jia and He He and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@phdthesis{li2018thesis,\n\ttitle        = {Principled Approaches to Robust Machine Learning and Beyond},\n\tauthor       = {Jerry Li},\n\tyear         = 2018,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@article{li2019anchor,\n\ttitle        = {Anchor: trans-cell type prediction of transcription factor binding sites},\n\tauthor       = {Hongyang Li and Daniel Quang and Yuanfang Guan},\n\tyear         = 2019,\n\tjournal      = {Genome research},\n\tvolume       = 29,\n\tnumber       = 2,\n\tpages        = {281--292}\n}\n@article{li2019enhanced,\n\ttitle        = {Enhanced convolutional neural tangent kernels},\n\tauthor       = {Li, Zhiyuan and Wang, Ruosong and Yu, Dingli and Du, Simon S and Hu, Wei and Salakhutdinov, Ruslan and Arora, Sanjeev},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.00809}\n}\n@article{li2019exponential,\n\ttitle        = {An exponential learning rate schedule for deep learning},\n\tauthor       = {Li, Zhiyuan and Arora, Sanjeev},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.07454}\n}\n@article{li2019fair,\n\ttitle        = {Fair resource allocation in federated learning},\n\tauthor       = {Tian Li and Maziar Sanjabi and Ahmad Beirami and Virginia Smith},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.10497}\n}\n@article{li2019generalization,\n\ttitle        = {On generalization error bounds of noisy gradient methods for non-convex learning},\n\tauthor       = {Li, Jian and Luo, Xuanyuan and Qiao, Mingda},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.00621}\n}\n@article{li2019incremental,\n\ttitle       
 = {Incremental (Sub)-Gradient Descent for Weakly Convex Optimization},\n\tauthor       = {Li, Xiao and Zhu, Zhihui and So, Anthony Man-Cho and Lee, Jason D.},\n\tyear         = 2019,\n\tjournal      = {Submitted to SIOPT}\n}\n@article{li2019leopard,\n\ttitle        = {Leopard: fast decoding cell type-specific transcription factor binding landscape at single-nucleotide resolution},\n\tauthor       = {Hongyang Li and Yuanfang Guan},\n\tyear         = 2019,\n\tjournal      = {bioRxiv}\n}\n@article{li2019reducing,\n\ttitle        = {Reducing Over-confident Errors outside the Known Distribution},\n\tauthor       = {Zhizhong Li and Derek Hoiem},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1804.03166}\n}\n@inproceedings{li2019repair,\n\ttitle        = {Repair: Removing representation bias by dataset resampling},\n\tauthor       = {Yi Li and Nuno Vasconcelos},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {9572--9581}\n}\n@inproceedings{li2019towards,\n\ttitle        = {Towards explaining the regularization effect of initial large learning rate in training neural networks},\n\tauthor       = {Li, Yuanzhi and Wei, Colin and Ma, Tengyu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.04595},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {11669--11680}\n}\n@inproceedings{li2020breaking,\n\ttitle        = {Breaking the sample size barrier in model-based reinforcement learning with a generative model},\n\tauthor       = {Li, Gen and Wei, Yuting and Chi, Yuejie and Gu, Yuantao and Chen, Yuxin},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@inproceedings{li2020greedy,\n\ttitle        = {A Tight Analysis of Greedy Yields Subexponential Time Approximation for Uniform Decision Tree},\n\tauthor       = {Ray Li and Percy Liang and Stephen Mussmann},\n\tyear         = 2020,\n\tbooktitle 
   = {Symposium on Discrete Algorithms (SODA)}\n}\n@inproceedings{li2020intuitive,\n\ttitle        = {Learning User-Preferred Mappings for Intuitive Robot Control},\n\tauthor       = {Mengxi Li and Dylan P. Losey and Jeannette Bohg and Dorsa Sadigh},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@inproceedings{li2020learning,\n\ttitle        = {Learning over-parametrized two-layer neural networks beyond ntk},\n\tauthor       = {Li, Yuanzhi and Ma, Tengyu and Zhang, Hongyang R},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.04596},\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {2613--2682},\n\torganization = {PMLR}\n}\n@article{li2020towards,\n\ttitle        = {Towards Resolving the Implicit Bias of Gradient Descent for Matrix Factorization: Greedy Low-Rank Learning},\n\tauthor       = {Li, Zhiyuan and Luo, Yuping and Lyu, Kaifeng},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.09839}\n}\n@misc{li2021eluder,\n\ttitle        = {Eluder Dimension and Generalized Rank},\n\tauthor       = {Gene Li and Pritish Kamath and Dylan J. 
Foster and Nathan Srebro},\n\tyear         = 2021,\n\teprint       = {2104.06970},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{li2021prefix,\n\ttitle        = {Prefix-Tuning: Optimizing Continuous Prompts for Generation},\n\tauthor       = {Li, Xiang Lisa and Liang, Percy},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.00190},\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@techreport{liang03maxwmfarm,\n\ttitle        = {How Much Of A Hypertree Can Be Captured By Windmills?},\n\tauthor       = {Percy Liang and Nathan Srebro},\n\tyear         = 2003,\n\tinstitution  = {Massachusetts Institute of Technology}\n}\n@techreport{liang04markov,\n\ttitle        = {Methods and Experiments With Bounded Tree-width {M}arkov Networks},\n\tauthor       = {Percy Liang and Nathan Srebro},\n\tyear         = 2004,\n\tinstitution  = {Massachusetts Institute of Technology}\n}\n@inproceedings{liang05geometric,\n\ttitle        = {Efficient Geometric Algorithms for Parsing in Two Dimensions},\n\tauthor       = {Percy Liang and Mukund Narasimhan and Michael Shilman and Paul Viola},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Document Analysis and Recognition (ICDAR)}\n}\n@techreport{liang05hypercycle,\n\ttitle        = {A Data Structure for Maintaining Acyclicity in Hypergraphs},\n\tauthor       = {Percy Liang and Nathan Srebro},\n\tyear         = 2005,\n\tinstitution  = {Massachusetts Institute of Technology}\n}\n@inproceedings{liang05mcmaster,\n\ttitle        = {Linear Programming in Bounded Tree-width {M}arkov Networks},\n\tauthor       = {Percy Liang and Nathan Srebro},\n\tyear         = 2005,\n\tbooktitle    = {Mathematical Programming for Data Mining and Machine Learning Workshop at McMaster University}\n}\n@mastersthesis{liang05meng,\n\ttitle        = {Semi-Supervised Learning for Natural Language},\n\tauthor       = {Percy Liang},\n\tyear         = 2005,\n\tschool       = {Massachusetts 
Institute of Technology}\n}\n@inproceedings{liang06alignment,\n\ttitle        = {Alignment by Agreement},\n\tauthor       = {Percy Liang and Ben Taskar and Dan Klein},\n\tyear         = 2006,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {104--111}\n}\n@inproceedings{liang06discrimative,\n\ttitle        = {An End-to-End Discriminative Approach to Machine Translation},\n\tauthor       = {Percy Liang and Alexandre Bouchard-C\\^ot\\'e and Dan Klein and Ben Taskar},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{liang07infpcfg,\n\ttitle        = {The Infinite {PCFG} using Hierarchical {D}irichlet Processes},\n\tauthor       = {Percy Liang and Slav Petrov and Michael I. Jordan and Dan Klein},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)}\n}\n@inproceedings{liang07permdp,\n\ttitle        = {A permutation-augmented sampler for {D}irichlet process mixture models},\n\tauthor       = {Percy Liang and Michael I. Jordan and Ben Taskar},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{liang07tutorial,\n\ttitle        = {Structured {B}ayesian Nonparametric Models with Variational Inference (tutorial)},\n\tauthor       = {Percy Liang and Dan Klein},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{liang08agreement,\n\ttitle        = {Agreement-Based Learning},\n\tauthor       = {Percy Liang and Dan Klein and Michael I. 
Jordan},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{liang08asymptotics,\n\ttitle        = {An Asymptotic Analysis of Generative, Discriminative, and Pseudolikelihood Estimators},\n\tauthor       = {Percy Liang and Michael I. Jordan},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {584--591}\n}\n@inproceedings{liang08errors,\n\ttitle        = {Analyzing the Errors of Unsupervised Learning},\n\tauthor       = {Percy Liang and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)}\n}\n@inproceedings{liang08structure,\n\ttitle        = {Structure Compilation: Trading Structure for Features},\n\tauthor       = {Percy Liang and Hal {Daum{\\'e} III} and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@incollection{liang09hdppcfg,\n\ttitle        = {Probabilistic grammars and hierarchical {D}irichlet processes},\n\tauthor       = {Percy Liang and Michael I. Jordan and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {The Oxford Handbook of Applied Bayesian Analysis}\n}\n@inproceedings{liang09measurements,\n\ttitle        = {Learning from Measurements in Exponential Families},\n\tauthor       = {Percy Liang and Michael I. Jordan and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{liang09online,\n\ttitle        = {Online {EM} for Unsupervised Models},\n\tauthor       = {Percy Liang and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {611--619}\n}\n@inproceedings{liang09semantics,\n\ttitle        = {Learning Semantic Correspondences with Less Supervision},\n\tauthor       = {Percy Liang and Michael I. 
Jordan and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)},\n\tpages        = {91--99}\n}\n@inproceedings{liang10abstraction,\n\ttitle        = {A Dynamic Evaluation of Static Heap Abstractions},\n\tauthor       = {Percy Liang and Omer Tripp and Mayur Naik and Mooly Sagiv},\n\tyear         = 2010,\n\tbooktitle    = {Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA)}\n}\n@inproceedings{liang10programs,\n\ttitle        = {Learning Programs: A Hierarchical {B}ayesian Approach},\n\tauthor       = {Percy Liang and Michael I. Jordan and Dan Klein},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {639--646}\n}\n@inproceedings{liang10regimes,\n\ttitle        = {On the Interaction between Norm and Dimensionality: Multiple Regimes in Learning},\n\tauthor       = {Percy Liang and Nati Srebro},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{liang10regularizationTR,\n\ttitle        = {Asymptotically Optimal Regularization in Smooth Parametric Models},\n\tauthor       = {Percy Liang and Francis Bach and Guillaume Bouchard and Michael I. Jordan},\n\tyear         = 2010,\n\tjournal      = {arXiv},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{liang10type,\n\ttitle        = {Type-Based {MCMC}},\n\tauthor       = {Percy Liang and Michael I. 
Jordan and Dan Klein},\n\tyear         = 2010,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{liang11minimal,\n\ttitle        = {Learning Minimal Abstractions},\n\tauthor       = {Percy Liang and Omer Tripp and Mayur Naik},\n\tyear         = 2011,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@inproceedings{liang11pruning,\n\ttitle        = {Scaling up Abstraction Refinement via Pruning},\n\tauthor       = {Percy Liang and Mayur Naik},\n\tyear         = 2011,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@article{liang13cl,\n\ttitle        = {Learning Dependency-Based Compositional Semantics},\n\tauthor       = {Percy Liang and Michael Jordan and Dan Klein},\n\tyear         = 2013,\n\tjournal      = {Computational Linguistics},\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tvolume       = 39,\n\tpages        = {389--446},\n\tschool       = {University of California at Berkeley}\n}\n@inproceedings{liang2009racnet,\n\ttitle        = {RACNet: a high-fidelity data center sensing network},\n\tauthor       = {\n\t\tLiang, Chieh-Jan Mike and Liu, Jie and Luo, Liqian and Terzis, Andreas\n\n\t\tand Zhao, Feng\n\t},\n\tyear         = 2009,\n\tbooktitle    = {Sensys},\n\tlocation     = {Berkeley, California},\n\tpages        = {15--28},\n\tdoi          = {http://doi.acm.org/10.1145/1644038.1644041},\n\tisbn         = {978-1-60558-519-2},\n\tbdsk-url-1   = {http://doi.acm.org/10.1145/1644038.1644041},\n\tdate-added   = {2010-03-14 14:15:56 -0400},\n\tdate-modified = {2011-01-30 23:12:34 -0500},\n\tkeywords     = {terzisbib}\n}\n@article{liang2013lambdadcs,\n\ttitle        = {Lambda Dependency-Based Compositional Semantics},\n\tauthor       = {Percy Liang},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1309.4408}\n}\n@article{liang2014talking,\n\ttitle        = {Talking to computers in natural language},\n\tauthor      
 = {Percy Liang},\n\tyear         = 2014,\n\tjournal      = {XRDS: Crossroads, The ACM Magazine for Students},\n\tvolume       = 21,\n\tnumber       = 1,\n\tpages        = {18--21}\n}\n@article{liang2015semantics,\n\ttitle        = {Bringing machine learning and compositional semantics together},\n\tauthor       = {Percy Liang and Christopher Potts},\n\tyear         = 2015,\n\tjournal      = {Annual Reviews of Linguistics},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {355--376}\n}\n@article{liang2016deep,\n\ttitle        = {Why Deep Neural Networks for Function Approximation?},\n\tauthor       = {Liang, Shiyu and Srikant, R},\n\tyear         = 2016\n}\n@article{liang2016executable,\n\ttitle        = {Learning Executable Semantic Parsers for Natural Language Understanding},\n\tauthor       = {Percy Liang},\n\tyear         = 2016,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 59\n}\n@inproceedings{liang2017nsm,\n\ttitle        = {Neural Symbolic Machines: Learning Semantic Parsers on {F}reebase with Weak Supervision},\n\tauthor       = {Chen Liang and Jonathan Berant and Quoc Le and Kenneth D. Forbus and Ni Lao},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{liang2018adding,\n\ttitle        = {Adding One Neuron Can Eliminate All Bad Local Minima},\n\tauthor       = {Liang, Shiyu and Sun, Ruoyu and Lee, Jason D and Srikant, R},\n\tyear         = 2018,\n\tjournal      = {Neural Information Processing Systems (NIPS)}\n}\n@inproceedings{liang2018enhancing,\n\ttitle        = {Enhancing The Reliability of Out-of-distribution Image Detection in Neural Networks},\n\tauthor       = {Shiyu Liang and Yixuan Li and R. 
Srikant},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{liang2018just,\n\ttitle        = {Just interpolate: Kernel ``ridgeless'' regression can generalize},\n\tauthor       = {Tengyuan Liang and Alexander Rakhlin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.00387}\n}\n@inproceedings{liang2018mapo,\n\ttitle        = {Memory Augmented Policy Optimization for Program Synthesis with Generalization},\n\tauthor       = {Chen Liang and Mohammad Norouzi and Jonathan Berant and Quoc Le and Ni Lao},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{liao2005logistic,\n\ttitle        = {Logistic regression with an auxiliary data source},\n\tauthor       = {Liao, Xuejun and Xue, Ya and Carin, Lawrence},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {505--512},\n\torganization = {ACM}\n}\n@article{libbrecht2015machine,\n\ttitle        = {Machine learning applications in genetics and genomics},\n\tauthor       = {Maxwell W Libbrecht and William Stafford Noble},\n\tyear         = 2015,\n\tjournal      = {Nature Reviews Genetics},\n\tvolume       = 16,\n\tnumber       = 6,\n\tpages        = {321--332}\n}\n@misc{LibSVMdata,\n\ttitle        = {{LIBSVM Data: Classification, Regression and Multi-label}},\n\tauthor       = {Fan, Rong-En and Lin, Chih-Jen},\n\turl          = {http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets},\n\tnote         = {Accessed: 2015-06}\n}\n@book{lichtenstein1982calibration,\n\ttitle        = {Judgement under Uncertainty: Heuristics and Biases},\n\tauthor       = {Sarah Lichtenstein and Baruch Fischhoff and Lawrence D. 
Phillips},\n\tyear         = 1982,\n\tpublisher    = {Cambridge University Press}\n}\n@article{Lieb1973convex,\n\ttitle        = {Convex trace functions and the Wigner-Yanase-Dyson conjecture},\n\tauthor       = {Lieb, Elliott H.},\n\tyear         = 1973,\n\tjournal      = {Advances in Mathematics},\n\tpublisher    = {Elsevier},\n\tvolume       = 11,\n\tnumber       = 3,\n\tpages        = {267--288}\n}\n@article{lieb2006elementary,\n\ttitle        = {Elementary proof of a theorem of Jean Ville},\n\tauthor       = {Lieb, Elliott H and Osherson, Daniel and Weinstein, Scott},\n\tyear         = 2006,\n\tjournal      = {arXiv preprint cs/0607054}\n}\n@misc{liebert2007liebert,\n\ttitle        = {Liebert Deluxe System/3 - Chilled Water - System Design Manual},\n\tauthor       = {Liebert},\n\tyear         = 2007,\n\thowpublished = {Available at \\url{http://shared.liebert.com/SharedDocuments/Manuals/sl_18110826.pdf}}\n}\n@misc{liebert2007lieberta,\n\ttitle        = {Liebert Deluxe System/3 Precision Cooling System},\n\tauthor       = {Liebert},\n\tyear         = 2007,\n\thowpublished = {Available at \\url{http://www.liebert.com/product_pages/ProductDocumentation.aspx?id=13&hz=60}}\n}\n@misc{liebert2008technical,\n\ttitle        = {\n\t\tTechnical Note: Using EC Plug Fans to Improve Energy Efficiency of\n\n\t\tChilled Water Cooling Systems in Large Data Centers\n\t},\n\tauthor       = {Liebert},\n\tyear         = 2008,\n\thowpublished = {Available at \\url{http://shared.liebert.com/SharedDocuments/White\\%20Papers/PlugFan_Low060608.pdf}},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{liesen2008nonsymmetric,\n\ttitle        = {On nonsymmetric saddle point matrices that allow conjugate gradient iterations},\n\tauthor       = {Liesen, J{\\\"o}rg and Parlett, Beresford N},\n\tyear         = 2008,\n\tjournal      = {Numerische Mathematik},\n\tpublisher    = {Springer},\n\tvolume       = 108,\n\tnumber       = 4,\n\tpages        = 
{605--624}\n}\n@inproceedings{lihong06towardsa,\n\ttitle        = {Towards a Unified Theory of State Abstraction for MDPs},\n\tauthor       = {Lihong Li and Thomas J. Walsh and Michael L. Littman},\n\tyear         = 2006,\n\tbooktitle    = {In Proceedings of the Ninth International Symposium on Artificial Intelligence and Mathematics},\n\tpages        = {531--539}\n}\n@phdthesis{lihong2009disaggregation,\n\ttitle        = {A Unifying Framework for Computational Reinforcement Learning Theory},\n\tauthor       = {Li, Lihong},\n\tyear         = 2009,\n\tpublisher    = {Rutgers University},\n\taddress      = {USA},\n\tisbn         = 9781109524970,\n\tnote         = {AAI3386797},\n\tadvisor      = {Littman, Michael L.},\n\tabstract     = {Computational learning theory studies mathematical models that allow one to formally analyze and compare the performance of supervised-learning algorithms such as their sample complexity. While existing models such as PAC ( Probably Approximately Correct ) have played an influential role in understanding the nature of supervised learning, they have not been as successful in reinforcement learning (RL). Here, the fundamental barrier is the need for active exploration in sequential decision problems. An RL agent tries to maximize long-term utility by exploiting its knowledge about the problem, but this knowledge has to be acquired by the agent itself through exploring the problem that may reduce short-term utility. The need for active exploration is common in many problems in daily life, engineering, and sciences. For example, a Backgammon program strives to take good moves to maximize the probability of winning a game, but sometimes it may try novel and possibly harmful moves to discover how the opponent reacts in the hope of discovering a better game-playing strategy. It has been known since the early days of RL that a good tradeoff between exploration and exploitation is critical for the agent to learn fast ( i.e. 
, to reach near-optimal strategies with a small sample complexity ), but a general theoretical analysis of this tradeoff remained open until recently. In this dissertation, we introduce a novel computational learning model called KWIK ( Knows What It Knows ) that is designed particularly for its utility in analyzing learning problems like RL where active exploration can impact the training data the learner is exposed to. My thesis is that the KWIK learning model provides a flexible, modularized, and unifying way for creating and analyzing reinforcement-learning algorithms with provably efficient exploration. In particular, we show how the KWIK perspective can be used to unify the analysis of existing RL algorithms with polynomial sample complexity. It also facilitates the development of new algorithms with smaller sample complexity, which have demonstrated empirically faster learning speed in real-world problems. Furthermore, we provide an improved, matching sample complexity lower bound, which suggests the optimality (in a sense) of one of the KWIK-based algorithms known as delayed Q-learning .}\n}\n@inproceedings{LiLin2015,\n\ttitle        = {{Accelerated Proximal Gradient Methods for Nonconvex Programming}},\n\tauthor       = {Li, Huan and Lin, Zhouchen},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems - NIPS '15},\n\tpages        = {379--387},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Li, Lin - Unknown - Accelerated Proximal Gradient Methods for Nonconvex Programming.pdf:pdf},\n\tmendeley-groups = {Optimization/Non-Convex}\n}\n@inproceedings{lillicrap2016continuous,\n\ttitle        = {Continuous control with deep reinforcement learning.},\n\tauthor       = {Lillicrap, Timothy P and Hunt, Jonathan J and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1509.02971},\n\tbooktitle    = 
{ICLR (Poster)}\n}\n@article{lim_tensoreig,\n\ttitle        = {Singular values and eigenvalues of tensors: a variational approach},\n\tauthor       = {L.-H. Lim},\n\tyear         = 2005,\n\tjournal      = {Proceedings of the IEEE International Workshop on Computational Advances in Multi-Sensor Adaptive Processing (CAMSAP '05)},\n\tvolume       = 1,\n\tpages        = {129--132}\n}\n@article{lim2018actor,\n\ttitle        = {Actor-Expert: A Framework for using Q-learning in Continuous Action Spaces},\n\tauthor       = {Lim, Sungsu and Joseph, Ajin and Le, Lei and Pan, Yangchen and White, Martha},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.09103}\n}\n@inproceedings{limaye2010annotating,\n\ttitle        = {Annotating and searching web tables using entities, types and relationships},\n\tauthor       = {Girija Limaye and Sunita Sarawagi and Soumen Chakrabarti},\n\tyear         = 2010,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tvolume       = 3,\n\tpages        = {1338--1347}\n}\n@article{lin2001discovery,\n\ttitle        = {Discovery of inference rules for question-answering},\n\tauthor       = {Dekang Lin and Patrick Pantel},\n\tyear         = 2001,\n\tjournal      = {Natural Language Engineering},\n\tvolume       = 7,\n\tpages        = {343--360}\n}\n@inproceedings{lin2003symbolic,\n\ttitle        = {\n\t\tA symbolic representation of time series, with implications for streaming\n\n\t\talgorithms\n\t},\n\tauthor       = {Lin, Jessica and Keogh, Eamonn and Lonardi, Stefano and Chiu, Bill},\n\tyear         = 2003,\n\tbooktitle    = {\n\t\tDMKD '03: Proceedings of the 8th ACM SIGMOD workshop on Research\n\n\t\tissues in data mining and knowledge discovery\n\t},\n\tlocation     = {San Diego, California},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tpages        = {2--11},\n\tdoi          = {http://doi.acm.org/10.1145/882082.882086},\n\towner        = {leili},\n\ttimestamp    = 
{2010.02.01}\n}\n@inproceedings{lin2004rouge,\n\ttitle        = {Looking for a Few Good Metrics: {ROUGE} and its Evaluation},\n\tauthor       = {Chin-yew Lin and Marina Rey},\n\tyear         = 2004,\n\tbooktitle    = {NTCIR Workshop}\n}\n@inproceedings{lin2010energy,\n\ttitle        = {Energy-accuracy trade-off for continuous mobile device location},\n\tauthor       = {Kaisen Lin and Aman Kansal and Dimitrios Lymberopoulos and Feng Zhao},\n\tyear         = 2010,\n\tbooktitle    = {International conference on Mobile systems, applications, and services},\n\tpages        = {285--298}\n}\n@inproceedings{lin2012linking,\n\ttitle        = {Entity linking at web scale},\n\tauthor       = {Thomas Lin and Mausam and Oren Etzioni},\n\tyear         = 2012,\n\tbooktitle    = {Knowledge Extraction Workshop (AKBC-WEKEX)}\n}\n@article{lin2013lookahead,\n\ttitle        = {Lookahead strategies for sequential {M}onte {C}arlo},\n\tauthor       = {Ming Lin and Rong Chen and Jun S. Liu},\n\tyear         = 2013,\n\tjournal      = {Statistical Science},\n\tvolume       = 28,\n\tnumber       = 1,\n\tpages        = {69--94}\n}\n@article{lin2013network,\n\ttitle        = {Network in network},\n\tauthor       = {Lin, Min and Chen, Qiang and Yan, Shuicheng},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.4400}\n}\n@inproceedings{lin2014accelerated,\n\ttitle        = {An Accelerated Proximal Coordinate Gradient Method},\n\tauthor       = {Qihang Lin and Zhaosong Lu and Lin Xiao},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{lin2014adaptive,\n\ttitle        = {An adaptive accelerated proximal gradient method and its homotopy continuation for sparse optimization},\n\tauthor       = {Lin, Qihang and Xiao, Lin},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {73--81}\n}\n@inproceedings{lin2014microsoft,\n\ttitle        = {Microsoft {COCO}: 
Common objects in context},\n\tauthor       = {Tsung-Yi Lin and Michael Maire and Serge Belongie and James Hays and Pietro Perona and  Deva Ramanan and Piotr Doll{\\'a}r and C. Lawrence Zitnick},\n\tyear         = 2014,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {740--755}\n}\n@article{lin2014pdtb,\n\ttitle        = {A {PDTB}-styled end-to-end discourse parser},\n\tauthor       = {Ziheng Lin and Hwee Tou Ng and Min-Yen Kan},\n\tyear         = 2014,\n\tjournal      = {Natural Language Engineering},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = {151--184}\n}\n@misc{Lin2016-email,\n\tauthor       = {Lin, Hongzhou},\n\tyear         = 2016,\n\thowpublished = {private communication}\n}\n@article{lin2017active,\n\ttitle        = {Active Learning for Visual Question Answering: An Empirical Study},\n\tauthor       = {Xiao Lin and Devi Parikh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.01732}\n}\n@techreport{lin2017program,\n\ttitle        = {Program synthesis from natural language using recurrent neural networks},\n\tauthor       = {Xi Victoria Lin and Chenglong Wang and Deric Pang and Kevin Vu and Luke Zettlemoyer and Michael D. 
Ernst},\n\tyear         = 2017,\n\tnumber       = {0},\n\tinstitution  = {University of Washington}\n}\n@article{lin2017tactics,\n\ttitle        = {Tactics of Adversarial Attack on Deep Reinforcement Learning Agents},\n\tauthor       = {Yen-Chen Lin and Zhang-Wei Hong and Yuan-Hong Liao and Meng-Li Shih and Ming-Yu Liu and Min Sun},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{lin2018denoising,\n\ttitle        = {Denoising distantly supervised open-domain question answering},\n\tauthor       = {Yankai Lin and Haozhe Ji and Zhiyuan Liu and Maosong Sun},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tvolume       = 1,\n\tpages        = {1736--1745}\n}\n@inproceedings{lin2018nl2bash,\n\ttitle        = {NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System},\n\tauthor       = {Xi Victoria Lin and Chenglong Wang and Luke S. Zettlemoyer and Michael D. Ernst},\n\tyear         = 2018,\n\tbooktitle    = {Language Resources and Evaluation Conference (LREC)}\n}\n@article{lin2018resnet,\n\ttitle        = {Resnet with one-neuron hidden layers is a universal approximator},\n\tauthor       = {Lin, Hongzhou and Jegelka, Stefanie},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.10909}\n}\n@article{lin2019grammar,\n\ttitle        = {Grammar-based Neural Text-to-{SQL} Generation},\n\tauthor       = {Kevin Lin and Ben Bogin and Mark Neumann and Jonathan Berant and Matt Gardner},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.13326}\n}\n@inproceedings{lin2020model,\n\ttitle        = {Model-based Adversarial Meta-Reinforcement Learning},\n\tauthor       = {Lin, Zichuan and Thomas, Garrett and Yang, Guangwen and Ma, Tengyu},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = 
{10161--10173},\n\turl          = {https://proceedings.neurips.cc/paper/2020/file/73634c1dcbe056c1f7dcf5969da406c8-Paper.pdf},\n\teditor       = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}\n}\n@inproceedings{LinCohen10,\n\ttitle        = {Power Iteration Clustering},\n\tauthor       = {Frank Lin and William W. Cohen},\n\tyear         = 2010,\n\tbooktitle    = {ICML '10},\n\tpages        = {655--662}\n}\n@article{lindley1956measure,\n\ttitle        = {On a measure of the information provided by an experiment},\n\tauthor       = {Dennis V Lindley},\n\tyear         = 1956,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tpages        = {986--1005}\n}\n@article{lindsay88composite,\n\ttitle        = {Composite likelihood methods},\n\tauthor       = {B. Lindsay},\n\tyear         = 1988,\n\tjournal      = {Contemporary Mathematics},\n\tvolume       = 80,\n\tpages        = {221--239}\n}\n@article{Lindsay89,\n\ttitle        = {Moment matrices: applications in mixtures},\n\tauthor       = {B. G. Lindsay},\n\tyear         = 1989,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 17,\n\tnumber       = 2,\n\tpages        = {722--740}\n}\n@book{Lindsay95,\n\ttitle        = {Mixture models: theory, geometry and applications},\n\tauthor       = {B. G. 
Lindsay},\n\tyear         = 1995,\n\tpublisher    = {American Statistical Association}\n}\n@inproceedings{ling2016latent,\n\ttitle        = {Latent Predictor Networks for Code Generation},\n\tauthor       = {Wang Ling and Edward Grefenstette and Karl Moritz Hermann and Tomáš Kočiský and Andrew Senior and Fumin Wang and Phil Blunsom},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {599--609}\n}\n@inproceedings{ling2017teaching,\n\ttitle        = {Teaching Machines to Describe Images via Natural Language Feedback},\n\tauthor       = {Huan Ling and Sanja Fidler},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{LinMH2015-Catalyst,\n\ttitle        = {{A Universal Catalyst for First-Order Optimization}},\n\tauthor       = {Lin, Hongzhou and Mairal, Julien and Harchaoui, Zaid},\n\tyear         = 2015,\n\tbooktitle    = {NIPS},\n\turl          = {http://arxiv.org/pdf/1506.02186v1.pdf},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1506.02186},\n\teprint       = {1506.02186},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Lin, Mairal, Harchaoui - 2015 - A Universal Catalyst for First-Order Optimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@article{linzen2016assessing,\n\ttitle        = {Assessing the ability of {LSTMs} to learn syntax-sensitive dependencies},\n\tauthor       = {Tal Linzen and Emmanuel Dupoux and Yoav Goldberg},\n\tyear         = 2016,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 4\n}\n@article{lipton2016sisyphean,\n\ttitle        = {Combating Reinforcement Learning's Sisyphean Curse with Intrinsic Fear},\n\tauthor       = {Zachary C. 
Lipton and Kamyar Azizzadenesheli and Abhishek Kumar and Lihong Li and Jianfeng Gao and Li Deng},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{lipton2018does,\n\ttitle        = {Does mitigating ML's impact disparity require treatment disparity?},\n\tauthor       = {Zachary Lipton and Julian McAuley and Alexandra Chouldechova},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {8125--8135}\n}\n@inproceedings{lipton2018labelshift,\n\ttitle        = {Detecting and Correcting for Label Shift with Black Box Predictors},\n\tauthor       = {Z. C. Lipton and Y. Wang and A. J. Smola},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{LiSSA2016,\n\ttitle        = {Second Order Stochastic Optimization for Machine Learning in Linear Time},\n\tauthor       = {Naman Agarwal and Brian Bullins and Elad Hazan},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.03943}\n}\n@article{litjens2018camelyon,\n\ttitle        = {1399 H\\&E-stained sentinel lymph node sections of breast cancer patients: the {CAMELYON} dataset},\n\tauthor       = {Geert Litjens and Peter Bandi and Babak Ehteshami Bejnordi and Oscar Geessink and Maschenka Balkenhol and Peter Bult and Altuna Halilovic and Meyke Hermsen and Rob van de Loo and Rob Vogels and others},\n\tyear         = 2018,\n\tjournal      = {Science},\n\tvolume       = 7,\n\tnumber       = 6\n}\n@inproceedings{little2007keyword,\n\ttitle        = {Keyword Programming in Java},\n\tauthor       = {Greg Little and Robert C. Miller},\n\tyear         = 2007,\n\tbooktitle    = {Automated Software Engineering (ASE)}\n}\n@article{little2009estimation,\n\ttitle        = {{Estimation of Intrinsic Dimensionality of Samples from Noisy Low-Dimensional Manifolds in High Dimensions with Multiscale SVD}},\n\tauthor       = {Little, Anna V. and {Jason D. 
Lee} and Jung, Yoon-Mo and Maggioni, Mauro},\n\tyear         = 2009,\n\tjournal      = {IEEE Workshop on Statistical Signal Processing (SSP)},\n\tpages        = {85--88}\n}\n@article{littlestone1994weighted,\n\ttitle        = {The weighted majority algorithm},\n\tauthor       = {Nick Littlestone and Manfred K Warmuth},\n\tyear         = 1994,\n\tjournal      = {Information and computation},\n\tvolume       = 108,\n\tnumber       = 2,\n\tpages        = {212--261}\n}\n@incollection{littman1994markov,\n\ttitle        = {Markov games as a framework for multi-agent reinforcement learning},\n\tauthor       = {Littman, Michael L},\n\tyear         = 1994,\n\tbooktitle    = {Machine learning proceedings 1994},\n\tlocation     = {New Brunswick, NJ, USA},\n\tpublisher    = {Elsevier},\n\taddress      = {San Francisco, CA, USA},\n\tseries       = {ICML'94},\n\tpages        = {157--163},\n\tisbn         = {1-55860-335-2},\n\turl          = {http://dl.acm.org/citation.cfm?id=3091574.3091594},\n\tacmid        = 3091594,\n\tnumpages     = 7\n}\n@inproceedings{littman1995complexity,\n\ttitle        = {On the complexity of solving {M}arkov decision problems},\n\tauthor       = {Littman, Michael L and Dean, Thomas L and Kaelbling, Leslie Pack},\n\tyear         = 1995,\n\tbooktitle    = {Proceedings of the Eleventh conference on Uncertainty in artificial intelligence},\n\tpages        = {394--402},\n\torganization = {Morgan Kaufmann Publishers Inc.},\n\tdate-added   = {2017-05-19 01:00:50 +0000},\n\tdate-modified = {2017-05-19 01:00:50 +0000}\n}\n@article{littman1996algorithms,\n\ttitle        = {Algorithms for sequential decision making},\n\tauthor       = {Littman, Michael Lederman},\n\tyear         = 1996,\n\tpublisher    = {Brown University Providence, RI}\n}\n@inproceedings{littman2001predictive,\n\ttitle        = {Predictive representations of state.},\n\tauthor       = {Littman, Michael L and Sutton, Richard S and Singh, Satinder P},\n\tyear         = 2001,\n\tbooktitle    = 
{NIPS},\n\tvolume       = 14,\n\tnumber       = 1555,\n\tpages        = 30\n}\n@article{littman2001value,\n\ttitle        = {Value-function reinforcement learning in {Markov} games},\n\tauthor       = {Michael L Littman},\n\tyear         = 2001,\n\tjournal      = {Cognitive Systems Research},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {55--66}\n}\n@article{littman2017environment,\n\ttitle        = {Environment-independent task specifications via GLTL},\n\tauthor       = {Littman, Michael L and Topcu, Ufuk and Fu, Jie and Isbell, Charles and Wen, Min and MacGlashan, James},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.04341}\n}\n@article{liu1989limited,\n\ttitle        = {On the limited memory {BFGS} method for large scale optimization},\n\tauthor       = {Dong C Liu and Jorge Nocedal},\n\tyear         = 1989,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 45,\n\tnumber       = 1,\n\tpages        = {503--528}\n}\n@inproceedings{liu1995keyframe,\n\ttitle        = {Keyframe Motion Optimization By Relaxing Speed and Timing},\n\tauthor       = {Zicheng Liu and Michael F. Cohen},\n\tyear         = 1995,\n\tbooktitle    = {Computer Animation and Simulation '95},\n\tpublisher    = {Springer-Verlag},\n\tpages        = {144--153},\n\teditor       = {Dimitri Terzopoulos and Daniel Thalmann}\n}\n@inproceedings{liu2000xwrap,\n\ttitle        = {{XWRAP}: An {XML}-enabled wrapper construction system for web information sources},\n\tauthor       = {Ling Liu and Calton Pu and Wei Han},\n\tyear         = 2000,\n\tbooktitle    = {Data Engineering, 2000. Proceedings. 
16th International Conference on},\n\tpages        = {611--621}\n}\n@incollection{liu2001combined,\n\ttitle        = {Combined parameter and state estimation in simulation-based filtering},\n\tauthor       = {Jane Liu and Mike West},\n\tyear         = 2001,\n\tbooktitle    = {Sequential {M}onte {C}arlo Methods in Practice}\n}\n@inproceedings{liu2003mining,\n\ttitle        = {Mining data records in Web pages},\n\tauthor       = {Bing Liu and Robert Grossman and Yanhong Zhai},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining},\n\tpages        = {601--606}\n}\n@inproceedings{liu2005vector,\n\ttitle        = {Text representation: from vector to tensor},\n\tauthor       = {Ning Liu and Benyu Zhang and Jun Yan and Zheng Chen and Wenyin Liu and Fengshan Bai and Leefeng Chien},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Data Mining}\n}\n@article{liu2006estimation,\n\ttitle        = {Estimation of missing markers in human motion capture},\n\tauthor       = {Guodong Liu and Leonard McMillan},\n\tyear         = 2006,\n\tjournal      = {Vis. 
Comput.},\n\tpublisher    = {Springer-Verlag New York, Inc.},\n\taddress      = {Secaucus, NJ, USA},\n\tvolume       = 22,\n\tnumber       = 9,\n\tpages        = {721--728},\n\tdoi          = {http://dx.doi.org/10.1007/s00371-006-0080-9},\n\tissn         = {0178-2789},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{liu2007oversampling,\n\ttitle        = {Generative Oversampling for Mining Imbalanced Datasets},\n\tauthor       = {Alexander Liu and Joydeep Ghosh and Cheryl Martin},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Data Mining (DMIN)}\n}\n@book{liu2008monte,\n\ttitle        = {{M}onte {C}arlo strategies in scientific computing},\n\tauthor       = {Jun S Liu},\n\tyear         = 2008,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{liu2008towards,\n\ttitle        = {Towards Discovering Data Center Genome Using Sensor Net},\n\tauthor       = {\n\t\tJie Liu and Bodhi Priyantha and Feng Zhao and Chieh-Jan Mike Liang\n\n\t\tand Qiang Wang and Sean James\n\t},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 5th Workshop on Embedded Networked Sensors (HotEmNets)},\n\tlocation     = {Charlottesville, VA}\n}\n@inproceedings{liu2009bbm,\n\ttitle        = {BBM: bayesian browsing model from petabyte-scale data},\n\tauthor       = {Liu, Chao and Guo, Fan and Faloutsos, Christos},\n\tyear         = 2009,\n\tbooktitle    = {\n\t\tProceedings of the 15th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tlocation     = {Paris, France},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '09},\n\tpages        = {537--546},\n\tdoi          = {http://doi.acm.org/10.1145/1557019.1557081},\n\tisbn         = {978-1-60558-495-9},\n\tacmid        = 1557081,\n\tkeywords     = {bayesian models, click log analysis, web search},\n\tnumpages     = 
10\n}\n@inproceedings{liu2010distributed,\n\ttitle        = {Distributed nonnegative matrix factorization for web-scale dyadic data analysis on mapreduce},\n\tauthor       = {Liu, Chao and Yang, Hung-chih and Fan, Jinliang and He, Li-Wei and Wang, Yi-Min},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 19th international conference on World wide web},\n\tpages        = {681--690},\n\torganization = {ACM}\n}\n@inproceedings{liu2012regularized,\n\ttitle        = {Regularized off-policy TD-learning},\n\tauthor       = {Liu, Bo and Mahadevan, Sridhar and Liu, Ji},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {836--844}\n}\n@article{liu2013change,\n\ttitle        = {Change-point detection in time-series data by relative density-ratio estimation},\n\tauthor       = {Song Liu and Makoto Yamada and Nigel Collier and Masashi Sugiyama},\n\tyear         = 2013,\n\tjournal      = {Neural Networks},\n\tvolume       = 43,\n\tpages        = {72--83}\n}\n@inproceedings{liu2014control,\n\ttitle        = {Control in a safe set: Addressing safety in human-robot interactions},\n\tauthor       = {Liu, Changliu and Tomizuka, Masayoshi},\n\tyear         = 2014,\n\tbooktitle    = {ASME 2014 Dynamic Systems and Control Conference},\n\torganization = {American Society of Mechanical Engineers Digital Collection}\n}\n@inproceedings{liu2014efficient,\n\ttitle        = {Efficient Approximation of Cross-Validation for Kernel Methods using {B}ouligand Influence Function},\n\tauthor       = {Yong Liu and Shali Jiang and Shizhong Liao},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {324--332}\n}\n@inproceedings{liu2015addressing,\n\ttitle        = {Addressing Covariate Shift in Active Learning with Adversarial Prediction},\n\tauthor       = {Anqi Liu and Kaiser Asif},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine 
Learning (ICML)}\n}\n@article{liu2015classification,\n\ttitle        = {Classification with noisy labels by importance reweighting},\n\tauthor       = {Liu, Tongliang and Tao, Dacheng},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on pattern analysis and machine intelligence},\n\tpublisher    = {IEEE},\n\tvolume       = 38,\n\tnumber       = 3,\n\tpages        = {447--461}\n}\n@inproceedings{liu2015deep,\n\ttitle        = {Deep learning face attributes in the wild},\n\tauthor       = {Ziwei Liu and Ping Luo and Xiaogang Wang and Xiaoou Tang},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the IEEE International Conference on Computer Vision},\n\tpages        = {3730--3738}\n}\n@inproceedings{liu2015finite,\n\ttitle        = {Finite-Sample Analysis of Proximal Gradient TD Algorithms},\n\tauthor       = {Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek},\n\tyear         = 2015,\n\tbooktitle    = {Proc. The 31st Conf. Uncertainty in Artificial Intelligence, Amsterdam, Netherlands}\n}\n@inproceedings{liu2016effective,\n\ttitle        = {Effective Crowd Annotation for Relation Extraction},\n\tauthor       = {Angli Liu and Stephen Soderland and Jonathan Bragg and Christopher H Lin and Xiao Ling and Daniel S Weld},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {897--906}\n}\n@inproceedings{liu2016evaluate,\n\ttitle        = {How {NOT} To Evaluate Your Dialogue System: An Empirical Study of Unsupervised Evaluation Metrics for Dialogue Response Generation},\n\tauthor       = {Chia-Wei Liu and Ryan Lowe and Iulian V. 
Serban and Michael Noseworthy and Laurent Charlin and Joelle Pineau},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{liu2016kernelized,\n\ttitle        = {A Kernelized Stein Discrepancy for Goodness-of-fit Tests and Model Evaluation},\n\tauthor       = {Liu, Qiang and {Jason D. Lee} and Jordan, Michael I},\n\tyear         = 2016,\n\tjournal      = {International Conference on Machine Learning (ICML)}\n}\n@article{liu2016teaching,\n\ttitle        = {The Teaching Dimension of Linear Learners},\n\tauthor       = {Ji Liu and Xiaojin Zhu},\n\tyear         = 2016,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 17,\n\tnumber       = 162\n}\n@article{liu2017black,\n\ttitle        = {Black-box importance sampling},\n\tauthor       = {Liu, Qiang and Lee, Jason D},\n\tyear         = 2017,\n\tjournal      = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{liu2017detecting,\n\ttitle        = {Detecting cancer metastases on gigapixel pathology images},\n\tauthor       = {Yun Liu and Krishna Gadepalli and Mohammad Norouzi and George E Dahl and Timo Kohlberger and Aleksey Boyko and Subhashini Venugopalan and Aleksei Timofeev and Philip Q Nelson and Greg S Corrado and others},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.02442}\n}\n@inproceedings{liu2017imagetoimage,\n\ttitle        = {Unsupervised Image-to-Image Translation Networks},\n\tauthor       = {Ming-Yu Liu and Thomas Breuel and Jan Kautz},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{liu2017sect,\n\ttitle        = {Structural Embedding of Syntactic Trees for Machine Comprehension},\n\tauthor       = {Rui Liu and Junjie Hu and Wei Wei and Zi Yang and Eric Nyberg},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{liu2018delayed,\n\ttitle  
      = {Delayed Impact of Fair Machine Learning},\n\tauthor       = {Lydia T Liu and Sarah Dean and Esther Rolf and Max Simchowitz and Moritz Hardt},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{liu2018end,\n\ttitle        = {End-to-End Learning of Task-Oriented Dialogs},\n\tauthor       = {Bing Liu and Ian Lane},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {67--73}\n}\n@article{liu2018implicit,\n\ttitle        = {The implicit fairness criterion of unconstrained learning},\n\tauthor       = {Liu, Lydia T and Simchowitz, Max and Hardt, Moritz},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.10013}\n}\n@article{liu2018inexact,\n\ttitle        = {An inexact subsampled proximal Newton-type method for large-scale machine learning},\n\tauthor       = {Liu, Xuanqing and Hsieh, Cho-Jui and Lee, Jason D and Sun, Yuekai},\n\tyear         = {},\n\tjournal      = {Submitted to Journal of Machine Learning Research}\n}\n@inproceedings{liu2018stochastic,\n\ttitle        = {Stochastic Answer Networks for Machine Reading Comprehension},\n\tauthor       = {Xiaodong Liu and Yelong Shen and Kevin Duh and Jianfeng Gao},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{liu2018workflow,\n\ttitle        = {Reinforcement Learning on Web Interfaces using Workflow-Guided Exploration},\n\tauthor       = {Evan Zheran Liu and Kelvin Guu and Panupong Pasupat and Tianlin Shi and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{liu2019bad,\n\ttitle        = {Bad global minima exist and sgd can reach them},\n\tauthor       = {Liu, Shengchao and Papailiopoulos, Dimitris and Achlioptas, Dimitris},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint 
arXiv:1906.02613}\n}\n@inproceedings{liu2019implicit,\n\ttitle        = {The Implicit Fairness Criterion of Unconstrained Learning},\n\tauthor       = {Lydia T. Liu and Max Simchowitz and Moritz Hardt},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{liu2019inoculation,\n\ttitle        = {Inoculation by Fine-Tuning: A Method for Analyzing Challenge Datasets},\n\tauthor       = {Nelson F. Liu and Roy Schwartz and Noah A. Smith},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{liu2019large,\n\ttitle        = {Large-Scale Long-Tailed Recognition in an Open World},\n\tauthor       = {Liu, Ziwei and Miao, Zhongqi and Zhan, Xiaohang and Wang, Jiayun and Gong, Boqing and Yu, Stella X.},\n\tyear         = 2019,\n\tmonth        = {June},\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{liu2019multi,\n\ttitle        = {Multi-Task Deep Neural Networks for Natural Language Understanding},\n\tauthor       = {Xiaodong Liu and Pengcheng He and Weizhu Chen and Jianfeng Gao},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.11504}\n}\n@article{liu2019roberta,\n\ttitle        = {{R}o{BERT}a: A Robustly Optimized {BERT} Pretraining Approach},\n\tauthor       = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.11692}\n}\n@article{liu2019tigs,\n\ttitle        = {{TIGS}: An Inference Algorithm for Text Infilling with Gradient Search},\n\tauthor       = {Dayiheng Liu and Jie Fu and Pengfei Liu and Jiancheng Lv},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.10752}\n}\n@inproceedings{liu2019towards,\n\ttitle        = {Towards Understanding the Importance 
of Shortcut Connections in Residual Networks},\n\tauthor       = {Liu, Tianyi and Chen, Minshuo and Zhou, Mo and Du, Simon S and Zhou, Enlu and Zhao, Tuo},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 32,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2019/file/7716d0fc31636914783865d34f6cdfd5-Paper.pdf},\n\teditor       = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett}\n}\n@article{liu2020explore,\n\ttitle        = {Explore then Execute: Adapting without Rewards via Factorized Meta-Reinforcement Learning},\n\tauthor       = {Evan Zheran Liu and Aditi Raghunathan and Percy Liang and Chelsea Finn},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.02790}\n}\n@article{liu2020imitation,\n\ttitle        = {An Imitation Learning Approach for Cache Replacement},\n\tauthor       = {Evan Zheran Liu and Milad Hashemi and Kevin Swersky and Parthasarathy Ranganathan and Junwhan Ahn},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.16239}\n}\n@article{liu2020learning,\n\ttitle        = {Learning Abstract Models for Strategic Exploration and Fast Reward Transfer},\n\tauthor       = {Evan Zheran Liu and Ramtin Keramati and Sudarshan Seshadri and Kelvin Guu and Panupong Pasupat and Emma Brunskill and Percy Liang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.05896}\n}\n@misc{liu2020metalearning,\n\ttitle        = {Meta-learning Transferable Representations with a Single Target Domain},\n\tauthor       = {Hong Liu and Jeff Z. 
HaoChen and Colin Wei and Tengyu Ma},\n\tyear         = 2020,\n\teprint       = {2011.01418},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{liu2020primer,\n\ttitle        = {A Primer on Zeroth-Order Optimization in Signal Processing and Machine Learning: Principals, Recent Advances, and Applications},\n\tauthor       = {Liu, Sijia and Chen, Pin-Yu and Kailkhura, Bhavya and Zhang, Gaoyuan and Hero III, Alfred O and Varshney, Pramod K},\n\tyear         = 2020,\n\tjournal      = {IEEE Signal Processing Magazine},\n\tpublisher    = {IEEE},\n\tvolume       = 37,\n\tnumber       = 5,\n\tpages        = {43--54}\n}\n@article{liu2021concur,\n\ttitle        = {Can Small and Synthetic Benchmarks Drive Modeling Innovation?  A Retrospective Study of Question Answering Modeling Approaches},\n\tauthor       = {Nelson F. Liu and Tony Lee and Robin Jia and Percy Liang},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@inproceedings{liu2021dream,\n\ttitle        = {Decoupling Exploration and Exploitation for Meta-Reinforcement Learning without Sacrifices},\n\tauthor       = {Evan Zheran Liu and Aditi Raghunathan and Percy Liang and Chelsea Finn},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{liu2021jtt,\n\ttitle        = {Just Train Twice: Improving Group Robustness without Training Group Information},\n\tauthor       = {Evan Zheran Liu and Behzad Haghgoo and Annie S. Chen and Aditi Raghunathan and Pang Wei Koh and Shiori Sagawa and Percy Liang and Chelsea Finn},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{liu89lbfgs,\n\ttitle        = {On the Limited Memory Method for Large Scale Optimization},\n\tauthor       = {D. C. Liu and J. 
Nocedal},\n\tyear         = 1989,\n\tjournal      = {Mathematical Programming B},\n\tvolume       = 45,\n\tnumber       = 3,\n\tpages        = {503--528}\n}\n@article{liu98pxem,\n\ttitle        = {Parameter expansion to accelerate {EM}: the {PX-EM} algorithm},\n\tauthor       = {C. Liu and D. Rubin and Y. N. Wu},\n\tyear         = 1998,\n\tjournal      = {Biometrika},\n\tvolume       = 85,\n\tpages        = {755--770}\n}\n@article{liu99pxda,\n\ttitle        = {Parameter expansion for data augmentation},\n\tauthor       = {J. Liu and Y. Wu},\n\tyear         = 1999,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 94,\n\tpages        = {1264--1274}\n}\n@inproceedings{LiuWRBS2014asynchronous,\n\ttitle        = {An Asynchronous Parallel Stochastic Coordinate Descent Algorithm},\n\tauthor       = {Liu, Ji and Wright, Steve and Re, Christopher and Bittorf, Victor and Sridhar, Srikrishna},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 31st International Conference on Machine Learning (ICML-14)},\n\tpages        = {469--477}\n}\n@article{livni2013algorithm,\n\ttitle        = {An Algorithm for Training Polynomial Networks},\n\tauthor       = {Roi Livni and Shai Shalev-Shwartz and Ohad Shamir},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1304.7045}\n}\n@inproceedings{livni2013vanishing,\n\ttitle        = {Vanishing Component Analysis.},\n\tauthor       = {Livni, Roi and Lehavi, David and Schein, Sagi and Nachlieli, Hila and Shalev-Shwartz, Shai and Globerson, Amir},\n\tyear         = 2013,\n\tbooktitle    = {ICML (1)},\n\tpages        = {597--605}\n}\n@inproceedings{livni2014computational,\n\ttitle        = {On the computational efficiency of training neural networks},\n\tauthor       = {Livni, Roi and Shalev-Shwartz, Shai and Shamir, Ohad},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = 
{855--863}\n}\n@article{LiWLZ2016-online1SVD,\n\ttitle        = {{Near-Optimal Stochastic Approximation for Online Principal Component Estimation}},\n\tauthor       = {Chris J. Li and Mengdi Wang and Han Liu and Tong Zhang},\n\tyear         = 2016,\n\tmonth        = mar,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1603.05305}\n}\n@inproceedings{LiYSH18,\n\ttitle        = {Learning to Generalize: Meta-Learning for Domain Generalization},\n\tauthor       = {Da Li and Yongxin Yang and Yi-Zhe Song and Timothy M. Hospedales},\n\tyear         = 2018,\n\tbooktitle    = {AAAI},\n\tpages        = {3490--3497},\n\turl          = {https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16067},\n\tcdate        = 1514764800000\n}\n@book{LjungBook,\n\ttitle        = {System Identification. Theory for the user},\n\tauthor       = {Lennart Ljung},\n\tyear         = 1998,\n\tpublisher    = {Prentice Hall},\n\taddress      = {Upper Saddle River, NJ},\n\tdate-added   = {2016-04-02 18:41:10 +0000},\n\tdate-modified = {2016-04-02 18:41:10 +0000},\n\tedition      = {2nd}\n}\n@article{LLDM09,\n\ttitle        = {Community Structure in Large Networks: Natural Cluster Sizes and the Absence of Large Well-Defined Clusters},\n\tauthor       = {Jure Leskovec and Kevin J. Lang and Anirban Dasgupta and Michael W. Mahoney},\n\tyear         = 2009,\n\tjournal      = {Internet Mathematics},\n\tvolume       = 6,\n\tnumber       = 1,\n\tpages        = {29--123}\n}\n@inproceedings{LLM10WWW,\n\ttitle        = {Empirical comparison of algorithms for network community detection},\n\tauthor       = {Leskovec, Jure and Lang, Kevin J. 
and Mahoney, Michael},\n\tyear         = 2010,\n\tseries       = {WWW},\n\tpages        = {631--640}\n}\n@inproceedings{LLX2014-ProxSDCA-APCG,\n\ttitle        = {{An Accelerated Proximal Coordinate Gradient Method and its Application to Regularized Empirical Risk Minimization}},\n\tauthor       = {Lin, Qihang and Lu, Zhaosong and Xiao, Lin},\n\tyear         = 2014,\n\tbooktitle    = {NIPS},\n\tpages        = {3059--3067},\n\turl          = {http://arxiv.org/abs/1407.1296 http://papers.nips.cc/paper/5356-an-accelerated-proximal-coordinate-gradient-method.pdf},\n\tannote       = {A short version has appeared in NIPS 2014 with its first 3 sections.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1407.1296},\n\teprint       = {1407.1296}\n}\n@inproceedings{LM,\n\ttitle        = {Pachinko Allocation: DAG-structured mixture models of topic correlations},\n\tauthor       = {W. Li and A. McCallum},\n\tyear         = 2007,\n\tbooktitle    = {ICML},\n\tpages        = {633--640}\n}\n@article{lmz17,\n\ttitle        = {Algorithmic Regularization in Over-parameterized Matrix Recovery},\n\tauthor       = {Yuanzhi Li and Tengyu Ma and Hongyang Zhang},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1712.09203},\n\turl          = {http://arxiv.org/abs/1712.09203},\n\tarchiveprefix = {arXiv},\n\teprint       = {1712.09203},\n\ttimestamp    = {Mon, 13 Aug 2018 16:48:32 +0200},\n\tbiburl       = {https://dblp.org/rec/bib/journals/corr/abs-1712-09203},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n@inproceedings{locascio2016regex,\n\ttitle        = {Neural Generation of Regular Expressions from Natural Language with Minimal Domain Knowledge},\n\tauthor       = {Nicholas Locascio and Kumaravelu Narasimhan and Eduardo DeLeon and Nate Kushman and Regina Barzilay},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{lofberg2004,\n\ttitle        = {{YALMIP}: A Toolbox 
for Modeling and Optimization in {MATLAB}},\n\tauthor       = {Johan L{\\\"{o}}fberg},\n\tyear         = 2004,\n\tbooktitle    = {CACSD}\n}\n@article{loftus2018causal,\n\ttitle        = {Causal reasoning for algorithmic fairness},\n\tauthor       = {Joshua R Loftus and Chris Russell and Matt J Kusner and Ricardo Silva},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.05859}\n}\n@article{loh2014support,\n\ttitle        = {Support recovery without incoherence: A case for nonconvex regularization},\n\tauthor       = {Loh, Po-Ling and Wainwright, Martin J},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.5632}\n}\n@article{lohstroh1983worst,\n\ttitle        = {Worst-case static noise margin criteria for logic circuits and their mathematical equivalence},\n\tauthor       = {Jan Lohstroh and Evert Seevinck and Jan De Groot},\n\tyear         = 1983,\n\tjournal      = {IEEE Journal of Solid-State Circuits},\n\tvolume       = 18,\n\tnumber       = 6,\n\tpages        = {803--807}\n}\n@article{lomuscio2017approach,\n\ttitle        = {An approach to reachability analysis for feed-forward ReLU neural networks},\n\tauthor       = {Alessio Lomuscio and Lalit Maganti},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.07351}\n}\n@inproceedings{long2010restricted,\n\ttitle        = {Restricted {B}oltzmann machines are hard to approximately evaluate or simulate},\n\tauthor       = {P. Long and R. 
Servedio},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {703--710}\n}\n@inproceedings{long2013transfer,\n\ttitle        = {Transfer feature learning with joint distribution adaptation},\n\tauthor       = {Mingsheng Long and Jianmin Wang and Guiguang Ding and Jiaguang Sun and Philip S Yu},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the IEEE international conference on computer vision},\n\tpages        = {2200--2207}\n}\n@inproceedings{long2015learning,\n\ttitle        = {Learning transferable features with deep adaptation networks},\n\tauthor       = {Mingsheng Long and Yue Cao and Jianmin Wang and Michael Jordan},\n\tyear         = 2015,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {97--105}\n}\n@inproceedings{long2016projections,\n\ttitle        = {Simpler Context-Dependent Logical Forms via Model Projections},\n\tauthor       = {Reginald Long and Panupong Pasupat and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{long2017deep,\n\ttitle        = {Deep transfer learning with joint adaptation networks},\n\tauthor       = {Mingsheng Long and Han Zhu and Jianmin Wang and Michael I Jordan},\n\tyear         = 2017,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {2208--2217}\n}\n@inproceedings{long2018conditional,\n\ttitle        = {Conditional Adversarial Domain Adaptation},\n\tauthor       = {Mingsheng Long and Zhangjie Cao and Jianmin Wang and Michael I Jordan},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 31,\n\tpages        = {1640--1650},\n\turl          = {https://proceedings.neurips.cc/paper/2018/file/ab88b15733f543179858600245108dd8-Paper.pdf},\n\teditor       = {S. Bengio and H. Wallach and H. 
Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett}\n}\n@inproceedings{lopez2013evaluating,\n\ttitle        = {Evaluating question answering over linked data},\n\tauthor       = {Vanessa Lopez and Christina Unger and Philipp Cimiano and Enrico Motta},\n\tyear         = 2013,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@article{lorentz1966metric,\n\ttitle        = {Metric entropy and approximation},\n\tauthor       = {G. G. Lorentz},\n\tyear         = 1966,\n\tjournal      = {Bulletin of the American Mathematical Society},\n\tvolume       = 72,\n\tnumber       = 6,\n\tpages        = {903--937}\n}\n@article{losch2019interpretability,\n\ttitle        = {Interpretability beyond classification output: Semantic bottleneck networks},\n\tauthor       = {Max Losch and Mario Fritz and Bernt Schiele},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.10882}\n}\n@article{losey2018review,\n\ttitle        = {A review of intent detection, arbitration, and communication aspects of shared control for physical human-robot interaction},\n\tauthor       = {Dylan P Losey and Craig G McDonald and Edoardo Battaglia and Marcia K O'Malley},\n\tyear         = 2018,\n\tjournal      = {Applied Mechanics Reviews},\n\tvolume       = 70\n}\n@inproceedings{losey2020latent,\n\ttitle        = {Controlling Assistive Robots with Learned Latent Actions},\n\tauthor       = {Dylan P. 
Losey and Krishnan Srinivasan and Ajay Mandlekar and Animesh Garg and Dorsa Sadigh},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {378--384}\n}\n@article{loshchilov2016sgdr,\n\ttitle        = {Sgdr: Stochastic gradient descent with warm restarts},\n\tauthor       = {Loshchilov, Ilya and Hutter, Frank},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1608.03983}\n}\n@inproceedings{loshchilov2017sgdr,\n\ttitle        = {Sgdr: Stochastic gradient descent with warm restarts},\n\tauthor       = {Ilya Loshchilov and Frank Hutter},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{loshchilov2019decoupled,\n\ttitle        = {Decoupled Weight Decay Regularization},\n\tauthor       = {Ilya Loshchilov and Frank Hutter},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{louis1986explaining,\n\ttitle        = {Explaining discrepancies between longitudinal and cross-sectional models},\n\tauthor       = {Thomas A Louis and James Robins and Douglas W Dockery and Avron Spiro and James H Ware},\n\tyear         = 1986,\n\tjournal      = {Journal of Clinical Epidemiology},\n\tvolume       = 39,\n\tnumber       = 10,\n\tpages        = {831--839}\n}\n@incollection{louis2011algorithmic,\n\ttitle        = {Algorithmic extensions of Cheeger’s inequality to higher eigenvalues and partitions},\n\tauthor       = {Louis, Anand and Raghavendra, Prasad and Tetali, Prasad and Vempala, Santosh},\n\tyear         = 2011,\n\tbooktitle    = {Approximation, Randomization, and Combinatorial Optimization. 
Algorithms and Techniques},\n\tpublisher    = {Springer},\n\tpages        = {315--326}\n}\n@inproceedings{louis2014approximation,\n\ttitle        = {Approximation algorithm for sparsest k-partitioning},\n\tauthor       = {Louis, Anand and Makarychev, Konstantin},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the twenty-fifth annual ACM-SIAM symposium on Discrete algorithms},\n\tpages        = {1244--1255},\n\torganization = {SIAM}\n}\n@article{louizos2015variational,\n\ttitle        = {The variational fair autoencoder},\n\tauthor       = {Louizos, Christos and Swersky, Kevin and Li, Yujia and Welling, Max and Zemel, Richard},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.00830}\n}\n@inproceedings{louvan2015semantic,\n\ttitle        = {Semantic Role Labeling for Process Recognition Questions},\n\tauthor       = {Louvan, Samuel and Naik, Chetan and Lynn, Veronica and Arun, Ankit and Balasubramanian, Niranjan and Clark, Peter},\n\tyear         = 2015,\n\tbooktitle    = {K-CAP Scientific Knowledge Workshop}\n}\n@inproceedings{louvan2016cross,\n\ttitle        = {Cross-Sentence Inference for Process Knowledge},\n\tauthor       = {Louvan, Samuel and Chetan Naik and Sadhana Kumaravel and Heeyoung Kwon and Niranjan Balasubramanian and Peter Clark},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{lovasz1975ratio,\n\ttitle        = {On the ratio of optimal integral and fractional covers},\n\tauthor       = {L{\\'a}szl{\\'o} Lov{\\'a}sz},\n\tyear         = 1975,\n\tjournal      = {Discrete Mathematics},\n\tvolume       = 13,\n\tnumber       = 4,\n\tpages        = {383--390}\n}\n@article{lovasz2006simulated,\n\ttitle        = {Simulated annealing in convex bodies and an O*(n4) volume algorithm},\n\tauthor       = {Lov{\\'a}sz, L{\\'a}szl{\\'o} and Vempala, Santosh},\n\tyear         = 2006,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = 
{Elsevier},\n\tvolume       = 72,\n\tnumber       = 2,\n\tpages        = {392--417}\n}\n@inproceedings{LovaszSimonovits90,\n\ttitle        = {The mixing rate of Markov chains, an isoperimetric inequality, and computing the volume},\n\tauthor       = {L{\\'a}szl{\\'o} Lov{\\'a}sz and Mikl{\\'o}s Simonovits},\n\tyear         = 1990,\n\tseries       = {FOCS},\n\tpages        = {346--354}\n}\n@article{LovaszSimonovits93,\n\ttitle        = {Random Walks in a Convex Body and an Improved Volume Algorithm},\n\tauthor       = {L{\\'a}szl{\\'o} Lov{\\'a}sz and Mikl{\\'o}s Simonovits},\n\tyear         = 1993,\n\tjournal      = {Random Struct. Algorithms},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = {359--412}\n}\n@inproceedings{lowd2005adversarial,\n\ttitle        = {Adversarial learning},\n\tauthor       = {Daniel Lowd and Christopher Meek},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@inproceedings{lowe1999sift,\n\ttitle        = {Object recognition from local scale-invariant features},\n\tauthor       = {David G Lowe},\n\tyear         = 1999,\n\tbooktitle    = {Proceedings of the Seventh IEEE International Conference on Computer Vision (ICCV)},\n\tvolume       = 2,\n\tpages        = {1150--1157}\n}\n@article{lowe2015ubuntu,\n\ttitle        = {The {U}buntu dialogue corpus: A large dataset for research in unstructured multi-turn dialogue systems},\n\tauthor       = {Ryan Lowe and Nissan Pow and Iulian Serban and Joelle Pineau},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.08909}\n}\n@article{lowe2017multi,\n\ttitle        = {Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},\n\tauthor       = {Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Pieter Abbeel, OpenAI and Mordatch, Igor},\n\tyear         = 2017,\n\tjournal      = {Advances in Neural Information Processing 
Systems},\n\tvolume       = 30,\n\tpages        = {6379--6390}\n}\n@inproceedings{lowe2017towards,\n\ttitle        = {Towards an Automatic Turing Test: Learning to Evaluate Dialogue Responses},\n\tauthor       = {Ryan Lowe and Michael Noseworthy and Iulian V. Serban and Nicolas Angelard-Gontier and Yoshua Bengio and Joelle Pineau},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{lowe2017ubuntu,\n\ttitle        = {Training End-to-End Dialogue Systems with the Ubuntu Dialogue Corpus},\n\tauthor       = {Ryan Thomas Lowe and Nissan Pow and Iulian Serban and Laurent Charlin and Chia-Wei Liu and Joelle Pineau},\n\tyear         = 2017,\n\tjournal      = {Dialogue and Discourse},\n\tvolume       = 8\n}\n@inproceedings{lowe2020selfplay,\n\ttitle        = {On the interaction between supervision and self-play in emergent communication},\n\tauthor       = {Ryan Lowe and Abhinav Gupta and Jakob N. Foerster and Douwe Kiela and Joelle Pineau},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{lowell2019practical,\n\ttitle        = {Practical Obstacles to Deploying Active Learning},\n\tauthor       = {David Lowell and Zachary C. Lipton and Byron C. 
Wallace},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{lowner1934monotone,\n\ttitle        = {{\\\"U}ber monotone matrixfunktionen},\n\tauthor       = {Karl L{\\\"o}wner},\n\tyear         = 1934,\n\tjournal      = {Mathematische Zeitschrift},\n\tvolume       = 38,\n\tnumber       = 1,\n\tpages        = {177--216}\n}\n@article{lowrey2018plan,\n\ttitle        = {Plan online, learn offline: Efficient learning and exploration via model-based control},\n\tauthor       = {Lowrey, Kendall and Rajeswaran, Aravind and Kakade, Sham and Todorov, Emanuel and Mordatch, Igor},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.01848}\n}\n@inproceedings{LQBC12,\n\ttitle        = {Spectral Learning for Non-Deterministic Dependency Parsing},\n\tauthor       = {F. M. Luque and A. Quattoni and B. Balle and X. Carreras},\n\tyear         = 2012,\n\tbooktitle    = {Conference of the European Chapter of the Association for Computational Linguistics}\n}\n@inproceedings{LRS2013,\n\ttitle        = {{A new approach to computing maximum flows using electrical flows}},\n\tauthor       = {Lee, Yin Tat and Rao, Satish and Srivastava, Nikhil},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 45th annual ACM symposium on Symposium on theory of computing - STOC '13},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 755,\n\tdoi          = {10.1145/2488608.2488704},\n\tisbn         = 9781450320290,\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Lee, Rao, Srivastava - 2013 - A new approach to computing maximum flows using electrical flows.pdf:pdf},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@inproceedings{LS,\n\ttitle        = {Learning overcomplete representations},\n\tauthor       = {M. Lewicki and T. 
Sejnowski},\n\tyear         = 2000,\n\tbooktitle    = {Neural Computation},\n\tpages        = {337--365}\n}\n@article{LS99,\n\ttitle        = {Learning the parts of objects by non-negative matrix factorization},\n\tauthor       = {D. Lee and H. Seung},\n\tyear         = 1999,\n\tjournal      = {Nature},\n\tpages        = {788--791}\n}\n@article{LSI,\n\ttitle        = {Indexing by latent semantic analysis},\n\tauthor       = {S. Deerwester and S. Dumais and T. Landauer and G. Furnas and R. Harshman},\n\tyear         = 1990,\n\tjournal      = {JASIS},\n\tpages        = {391--407}\n}\n@inproceedings{LSS01,\n\ttitle        = {Predictive Representations of State},\n\tauthor       = {M. Littman and R. Sutton and S. Singh},\n\tyear         = 2001,\n\tbooktitle    = {Advances in Neural Information Processing Systems 14},\n\tpages        = {1555--1561}\n}\n@article{lstm,\n\ttitle        = {Long Short-Term Memory},\n\tauthor       = {Sepp Hochreiter and J{\\\"{u}}rgen Schmidhuber},\n\tyear         = 1997,\n\tjournal      = {Neural Computation},\n\tvolume       = 9,\n\tnumber       = 8,\n\tpages        = {1735--1780},\n\tdoi          = {10.1162/neco.1997.9.8.1735},\n\turl          = {http://dx.doi.org/10.1162/neco.1997.9.8.1735},\n\ttimestamp    = {Thu, 17 Nov 2011 16:24:23 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/neco/HochreiterS97},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{lu08generative,\n\ttitle        = {A Generative Model for Parsing Natural Language to Meaning Representations},\n\tauthor       = {Wei Lu and Hwee Tou Ng and Wee Sun Lee and Luke S. 
Zettlemoyer},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {783--792}\n}\n@inproceedings{lu09generation,\n\ttitle        = {Natural Language Generation with Tree Conditional Random Fields},\n\tauthor       = {Wei Lu and Hwee Tou Ng and Wee Sun Lee},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {400--409}\n}\n@inproceedings{lu12probabilistic,\n\ttitle        = {A Probabilistic Forest-to-String Model for Language Generation from Typed Lambda Calculus Expressions},\n\tauthor       = {Wei Lu and Hwee Tou Ng},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1611--1622}\n}\n@inproceedings{lu2014large,\n\ttitle        = {Large scale canonical correlation analysis with iterative least squares},\n\tauthor       = {Lu, Yichao and Foster, Dean P},\n\tyear         = 2014,\n\tbooktitle    = {NIPS},\n\tpages        = {91--99}\n}\n@inproceedings{lu2016hierarchical,\n\ttitle        = {Hierarchical question-image co-attention for visual question answering},\n\tauthor       = {Jiasen Lu and Jianwei Yang and Dhruv Batra and Devi Parikh},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{lu2017best,\n\ttitle        = {Best of Both Worlds: Transferring Knowledge from Discriminative Learning to a Generative Visual Dialog Model},\n\tauthor       = {Jiasen Lu and Anitha Kannan and Jianwei Yang and Devi Parikh and Dhruv Batra},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{lu2017no,\n\ttitle        = {No need to worry about adversarial examples in object detection in autonomous vehicles},\n\tauthor       = {Jiajun Lu and Hussein Sibai and Evan Fabry and David Forsyth},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint 
arXiv:1707.03501}\n}\n@article{lu2019semi,\n\ttitle        = {Semi-supervised histology classification using deep multiple instance learning and contrastive predictive coding},\n\tauthor       = {Ming Y Lu and Richard J Chen and Jingwen Wang and Debora Dillon and Faisal Mahmood},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.10825}\n}\n@inproceedings{luan2017multi,\n\ttitle        = {Multi-Task Learning for Speaker-Role Adaptation in Neural Conversation Models},\n\tauthor       = {Yi Luan and Chris Brockett and Bill Dolan and Jianfeng Gao and Michel Galley},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)},\n\tvolume       = 1,\n\tpages        = {605--614}\n}\n@inproceedings{LubyNisan1993,\n\ttitle        = {{A parallel approximation algorithm for positive linear programming}},\n\tauthor       = {Luby, Michael and Nisan, Noam},\n\tyear         = 1993,\n\tbooktitle    = {Proceedings of the twenty-fifth annual ACM symposium on Theory of computing - STOC '93},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = {448--457},\n\tdoi          = {10.1145/167088.167211},\n\tisbn         = {0897915917},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Luby, Nisan - 1993 - A parallel approximation algorithm for positive linear programming.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@article{lucio2020molecule,\n\ttitle        = {De novo generation of hit-like molecules from gene expression signatures using artificial intelligence},\n\tauthor       = {Oscar Méndez-Lucio and Benoit Baillif and Djork-Arné Clevert and David Rouquié and Joerg Wichard},\n\tyear         = 2020,\n\tjournal      = {Nature Communications},\n\tvolume       = 11\n}\n@article{luedtke2008sample,\n\ttitle        = {A sample approximation approach for optimization with 
probabilistic constraints},\n\tauthor       = {Luedtke, James and Ahmed, Shabbir},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {674--699}\n}\n@article{luhn1958automatic,\n\ttitle        = {The Automatic Creation of Literature Abstracts},\n\tauthor       = {Henry P. Luhn},\n\tyear         = 1958,\n\tjournal      = {{IBM} Journal of Research and Development},\n\tvolume       = 2,\n\tpages        = {159--165}\n}\n@inproceedings{luketina2019survey,\n\ttitle        = {A Survey of Reinforcement Learning Informed by Natural Language},\n\tauthor       = {Jelena Luketina and Nantas Nardelli and Gregory Farquhar and Jakob Foerster and Jacob Andreas and Edward Grefenstette and Shimon Whiteson and Tim Rockt{\\"{a}}schel},\n\tyear         = 2019,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{lukovsevivcius2009reservoir,\n\ttitle        = {Reservoir computing approaches to recurrent neural network training},\n\tauthor       = {Mantas Luko{\\v{s}}evi{\\v{c}}ius and Herbert Jaeger},\n\tyear         = 2009,\n\tjournal      = {Computer Science Review},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {127--149}\n}\n@article{lundberg2016unexpected,\n\ttitle        = {An unexpected unity among methods for interpreting model predictions},\n\tauthor       = {Scott Lundberg and Su-In Lee},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.07478}\n}\n@article{lunetta2006land,\n\ttitle        = {Land-cover change detection using multi-temporal {MODIS} {NDVI} data},\n\tauthor       = {Ross Lunetta and Joseph F Knight and Jayantha Ediriwickrema and John G Lyon and L Dorsey Worthy},\n\tyear         = 2006,\n\tjournal      = {Remote sensing of environment},\n\tvolume       = 105,\n\tnumber       = 2,\n\tpages        = {142--154}\n}\n@book{luo2012regularity,\n\ttitle        = {Regularity and 
complexity in dynamical systems},\n\tauthor       = {Albert CJ Luo},\n\tyear         = 2012,\n\tpublisher    = {Springer}\n}\n@article{luo2013compact,\n\ttitle        = {Compact Model for Carbon Nanotube Field-Effect Transistors Including Nonidealities and Calibrated with Experimental Data Down to 9-nm Gate Length},\n\tauthor       = {Jieying Luo and Lan Wei and Chi-Shuen Lee and Aaron D. Franklin and Ximeng Guan and Eric Pop and Dimitri Antoniadis and Hon Sun Philip Wong},\n\tyear         = 2013,\n\tjournal      = {IEEE Transactions on Electron Devices},\n\tvolume       = 60,\n\tnumber       = 6,\n\tpages        = {1834--1843}\n}\n@article{luo2018algorithmic,\n\ttitle        = {Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees},\n\tauthor       = {Luo, Yuping and Xu, Huazhe and Li, Yuanzhi and Tian, Yuandong and Darrell, Trevor and Ma, Tengyu},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.03858}\n}\n@inproceedings{luo2018efficient,\n\ttitle        = {Efficient contextual bandits in non-stationary worlds},\n\tauthor       = {Haipeng Luo and Chen-Yu Wei and Alekh Agarwal and John Langford},\n\tyear         = 2018,\n\tbooktitle    = {Conference On Learning Theory},\n\tpages        = {1739--1776}\n}\n@article{luo2018learning,\n\ttitle        = {Learning Personalized End-to-End Goal-Oriented Dialog},\n\tauthor       = {Liangchen Luo and Wenhao Huang and Qi Zeng and Zaiqing Nie and Xu Sun},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.04604}\n}\n@article{luo2019adaptive,\n\ttitle        = {Adaptive gradient methods with dynamic bound of learning rate},\n\tauthor       = {Luo, Liangchen and Xiong, Yuanhao and Liu, Yan and Sun, Xu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.09843}\n}\n@inproceedings{luo2019algorithmic,\n\ttitle        = {Algorithmic Framework for Model-based Deep Reinforcement Learning with Theoretical Guarantees},\n\tauthor       = 
{Yuping Luo and Huazhe Xu and Yuanzhi Li and Yuandong Tian and Trevor Darrell and Tengyu Ma},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=BJe1E2R5KX}\n}\n@article{luo2019learning,\n\ttitle        = {Learning self-correctable policies and value functions from demonstrations with negative sampling},\n\tauthor       = {Luo, Yuping and Xu, Huazhe and Ma, Tengyu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.05634}\n}\n@inproceedings{luo2019towards,\n\ttitle        = {Towards Understanding Regularization in Batch Normalization},\n\tauthor       = {Ping Luo and Xinjiang Wang and Wenqi Shao and Zhanglin Peng},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{luo2020learning,\n\ttitle        = {Learning Self-Correctable Policies and Value Functions from Demonstrations with Negative Sampling},\n\tauthor       = {Yuping Luo and Huazhe Xu and Tengyu Ma},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@inproceedings{luong2015rare,\n\ttitle        = {Addressing the Rare Word Problem in Neural Machine Translation},\n\tauthor       = {Minh-Thang Luong and Ilya Sutskever and Quoc V. Le and Oriol Vinyals and Wojciech Zaremba},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {11--19}\n}\n@inproceedings{luong2015translation,\n\ttitle        = {Effective Approaches to Attention-based Neural Machine Translation},\n\tauthor       = {Minh-Thang Luong and Hieu Pham and Christopher D. Manning},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1412--1421}\n}\n@inproceedings{luong2016iclr_multi,\n\ttitle        = {Multi-task Sequence to Sequence Learning},\n\tauthor       = {Minh-Thang Luong and Quoc V.   
Le and Ilya   Sutskever and Oriol  Vinyals and Lukasz  Kaiser},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{LuXiao2013,\n\ttitle        = {On the complexity analysis of randomized block-coordinate descent methods},\n\tauthor       = {Lu, Zhaosong and Xiao, Lin},\n\tyear         = 2013,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tpages        = {1--28}\n}\n@inproceedings{LV06,\n\ttitle        = {Fast Algorithms for Logconcave Functions: Sampling, Rounding, Integration and Optimization},\n\tauthor       = {Lovasz, Laszlo and Vempala, Santosh},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 47th Annual IEEE Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tseries       = {FOCS '06},\n\tpages        = {57--68},\n\tdoi          = {10.1109/FOCS.2006.28},\n\tisbn         = {0-7695-2720-5},\n\turl          = {http://dx.doi.org/10.1109/FOCS.2006.28},\n\tnumpages     = 12,\n\tacmid        = 1170488\n}\n@inproceedings{ly17,\n\ttitle        = {Convergence Analysis of Two-layer Neural Networks with {R}e{LU} Activation},\n\tauthor       = {Yuanzhi Li and Yang Yuan},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpublisher    = {http://arxiv.org/abs/1705.09886}\n}\n@phdthesis{lyapunov1892general,\n\ttitle        = {The general problem of the stability of motion (in Russian)},\n\tauthor       = {Aleksandr Mikhailovich Lyapunov},\n\tyear         = 1892,\n\tschool       = {Kharkov Mathematical Society}\n}\n@article{lyapunov1992general,\n\ttitle        = {The general problem of the stability of motion},\n\tauthor       = {Aleksandr Mikhailovich Lyapunov},\n\tyear         = 1992,\n\tjournal      = {International Journal of Control},\n\tvolume       = 55,\n\tnumber       = 3,\n\tpages        = 
{531--534}\n}\n@article{lygeros1999controllers,\n\ttitle        = {Controllers for reachability specifications for hybrid systems},\n\tauthor       = {John Lygeros and Claire Tomlin and Shankar Sastry},\n\tyear         = 1999,\n\tjournal      = {Automatica},\n\tvolume       = 35,\n\tnumber       = 3,\n\tpages        = {349--370}\n}\n@article{lygeros2015lecture,\n\ttitle        = {Lecture notes on linear system theory},\n\tauthor       = {Lygeros, John and Ramponi, Federico},\n\tyear         = 2015,\n\thowpublished = {\\url{http://home.mit.bme.hu/~virosztek/docs/mt_literature/LectureNotes.pdf}}\n}\n@article{lykouris2019corruption,\n\ttitle        = {Corruption robust exploration in episodic reinforcement learning},\n\tauthor       = {Lykouris, Thodoris and Simchowitz, Max and Slivkins, Aleksandrs and Sun, Wen},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.08689}\n}\n@article{lynch2019learning,\n\ttitle        = {Learning Latent Plans from Play},\n\tauthor       = {Corey Lynch and Mohi Khansari and Ted Xiao and Vikash Kumar and Jonathan Tompson and Sergey Levine and Pierre Sermanet},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.01973}\n}\n@article{lynch2020grounding,\n\ttitle        = {Grounding Language in Play},\n\tauthor       = {Corey Lynch and Pierre Sermanet},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.07648}\n}\n@article{lyu2019gradient,\n\ttitle        = {Gradient descent maximizes the margin of homogeneous neural networks},\n\tauthor       = {Lyu, Kaifeng and Li, Jian},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.05890}\n}\n@article{lyu2019ultra,\n\ttitle        = {Ultra-large library docking for discovering new chemotypes},\n\tauthor       = {Jiankun Lyu and Sheng Wang and Trent E Balius and Isha Singh and Anat Levit and Yurii S Moroz and Matthew J O’Meara and Tao Che and Enkhjargal Algaa and Kateryna Tolmachova and others},\n\tyear         = 2019,\n\tjournal  
    = {Nature},\n\tvolume       = 566,\n\tnumber       = 7743,\n\tpages        = {224--229}\n}\n@inproceedings{lyu2021contrasting,\n\ttitle        = {Contrasting Centralized and Decentralized Critics in Multi-Agent Reinforcement Learning},\n\tauthor       = {Lyu, Xueguang and Xiao, Yuchen and Daley, Brett and Amato, Christopher},\n\tyear         = 2021,\n\tbooktitle    = {Proceedings of the 20th International Conference on Autonomous Agents and MultiAgent Systems},\n\tpages        = {844--852}\n}\n@inproceedings{M,\n\ttitle        = {A Wavelet Tour of Signal Processing},\n\tauthor       = {S. Mallat},\n\tyear         = 1998,\n\tbooktitle    = {Academic-Press}\n}\n@inproceedings{ma2015finding,\n\ttitle        = {Finding Linear Structure in Large Datasets with Scalable Canonical Correlation Analysis},\n\tauthor       = {Ma, Zhuang and Lu, Yichao and Foster, Dean},\n\tyear         = 2015,\n\tbooktitle    = {ICML},\n\tpages        = {169--178}\n}\n@article{ma2016poly,\n\ttitle        = {Polynomial-time Tensor Decompositions with Sum-of-Squares},\n\tauthor       = {Tengyu Ma and Jonathan Shi and David Steurer},\n\tyear         = 2016,\n\tjournal      = {IEEE Symposium on Foundations of Computer Science (FOCS)},\n\turl          = {http://arxiv.org/abs/1610.01980},\n\ttimestamp    = {Wed, 02 Nov 2016 09:51:26 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/MaSS16},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{ma2016polynomial,\n\ttitle        = {Polynomial-time tensor decompositions with sum-of-squares},\n\tauthor       = {Ma, Tengyu and Shi, Jonathan and Steurer, David},\n\tyear         = 2016,\n\tmonth        = oct,\n\tjournal      = {ArXiv e-prints},\n\tbooktitle    = {2016 IEEE 57th Annual Symposium on Foundations of Computer Science (FOCS)},\n\tpages        = {438--446},\n\torganization = {IEEE},\n\tarchiveprefix = {arXiv},\n\teprint       = {1610.01980},\n\tprimaryclass = {cs.DS},\n\tkeywords    
 = {Computer Science - Data Structures and Algorithms, Computer Science - Learning},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2016arXiv161001980M},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@inproceedings{ma2017self,\n\ttitle        = {Self-paced co-training},\n\tauthor       = {Fan Ma and Deyu Meng and Qi Xie and Zina Li and Xuanyi Dong},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{ma2018implicit,\n\ttitle        = {Implicit regularization in nonconvex statistical estimation: Gradient descent converges linearly for phase retrieval and matrix completion},\n\tauthor       = {Ma, Cong and Wang, Kaizheng and Chi, Yuejie and Chen, Yuxin},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {3345--3354},\n\torganization = {PMLR}\n}\n@inproceedings{ma2018power,\n\ttitle        = {The power of interpolation: Understanding the effectiveness of {SGD} in modern over-parametrized learning},\n\tauthor       = {Siyuan Ma and Raef Bassily and Mikhail Belkin},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{ma2019implicit,\n\ttitle        = {Implicit regularization in nonconvex statistical estimation: Gradient descent converges linearly for phase retrieval, matrix completion, and blind deconvolution},\n\tauthor       = {Ma, Cong and Wang, Kaizheng and Chi, Yuejie and Chen, Yuxin},\n\tyear         = 2019,\n\tjournal      = {Foundations of Computational Mathematics},\n\tpublisher    = {Springer},\n\tpages        = {1--182}\n}\n@article{ma2020local,\n\ttitle        = {Why Do Local Methods Solve Nonconvex Problems?},\n\tauthor       = {Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {Beyond the Worst-Case Analysis of Algorithms},\n\tpublisher    = {Cambridge University Press},\n\tpages        = 465\n}\n@misc{ma2021local,\n\ttitle        = {Why Do Local 
Methods Solve Nonconvex Problems?},\n\tauthor       = {Tengyu Ma},\n\tyear         = 2021,\n\teprint       = {2103.13462},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{maas2009one,\n\ttitle        = {One-Shot Learning with {B}ayesian Networks},\n\tauthor       = {Andrew L. Maas and Charles Kemp},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of The 31st Annual Meeting of The Cognitive Science Society}\n}\n@inproceedings{maas2013rectifier,\n\ttitle        = {Rectifier nonlinearities improve neural network acoustic models},\n\tauthor       = {Andrew L Maas and Awni Y Hannun and Andrew Y Ng},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{maas2014first,\n\ttitle        = {First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent {DNN}s},\n\tauthor       = {Andrew L Maas and Awni Y Hannun and Daniel Jurafsky and Andrew Y Ng},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1408.2873}\n}\n@article{maaten2008visualizing,\n\ttitle        = {Visualizing data using {t}-{SNE}},\n\tauthor       = {Laurens van der Maaten and Geoffrey Hinton},\n\tyear         = 2008,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 9,\n\tnumber       = {0},\n\tpages        = {2579--2605}\n}\n@inproceedings{maaten2013learning,\n\ttitle        = {Learning with marginalized corrupted features},\n\tauthor       = {Laurens van der Maaten and Minmin Chen and Stephen Tyree and Kilian Q Weinberger},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {410--418}\n}\n@inproceedings{maccartney2007natural,\n\ttitle        = {Natural Logic for Textual Inference},\n\tauthor       = {Bill MacCartney and Christopher D. 
Manning},\n\tyear         = 2007,\n\tbooktitle    = {ACL-PASCAL Workshop on Textual Entailment and Paraphrasing},\n\tpages        = {193--200}\n}\n@inproceedings{maccartney2008nli,\n\ttitle        = {Modeling Semantic Containment and Exclusion in Natural Language Inference},\n\tauthor       = {Bill MacCartney and Christopher D. Manning},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@manual{maccartney2015sippy,\n\ttitle        = {SippyCup},\n\tauthor       = {Bill MacCartney},\n\tyear         = 2015,\n\thowpublished = {\\url{https://github.com/wcmac/sippycup}}\n}\n@article{macdonald2004biological,\n\ttitle        = {Biological age and 12-year cognitive change in older adults: findings from the Victoria Longitudinal Study},\n\tauthor       = {Stuart WS MacDonald and Roger A Dixon and Anna-Lisa Cohen and Janine E Hazlitt},\n\tyear         = 2004,\n\tjournal      = {Gerontology},\n\tvolume       = 50,\n\tnumber       = 2,\n\tpages        = {64--81}\n}\n@inproceedings{macenko2009method,\n\ttitle        = {A method for normalizing histology slides for quantitative analysis},\n\tauthor       = {Marc Macenko and Marc Niethammer and James S Marron and David Borland and John T Woosley and Xiaojun Guan and Charles Schmitt and Nancy E Thomas},\n\tyear         = 2009,\n\tbooktitle    = {2009 IEEE International Symposium on Biomedical Imaging: From Nano to Macro},\n\tpages        = {1107--1110}\n}\n@inproceedings{macglashan2015grounding,\n\ttitle        = {Grounding {E}nglish Commands to Reward Functions},\n\tauthor       = {J. MacGlashan and Monica Babes-Vroman and M. desJardins and M. Littman and S. Muresan and S. 
Squire and Stefanie Tellex and Dilip Arumugam and Lei Yang},\n\tyear         = 2015,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{machado2017revisiting,\n\ttitle        = {Revisiting the arcade learning environment: Evaluation protocols and open problems for general agents},\n\tauthor       = {M. C. Machado and M. G. Bellemare and E. Talvitie and J. Veness and M. Hausknecht and M. Bowling},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.06009}\n}\n@article{maciel2011surgical,\n\ttitle        = {Surgical model-view-controller simulation software framework for local and collaborative applications},\n\tauthor       = {A. Maciel and G. Sankaranarayanan and T. Halic and V. Arikatla and Z. Lu and S. De},\n\tyear         = 2011,\n\tjournal      = {International Journal of Computer Assisted Radiology and Surgery},\n\tvolume       = 6,\n\tnumber       = 4,\n\tpages        = {457--471}\n}\n@techreport{mackay97hmm,\n\ttitle        = {Ensemble Learning for Hidden {M}arkov Models},\n\tauthor       = {David MacKay},\n\tyear         = 1997,\n\tinstitution  = {University of Cambridge}\n}\n@inproceedings{mackey2011divide,\n\ttitle        = {Divide-and-conquer matrix factorization},\n\tauthor       = {Mackey, Lester W and Jordan, Michael I and Talwalkar, Ameet},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1134--1142}\n}\n@article{mackey2014matrix,\n\ttitle        = {Matrix concentration inequalities via the method of exchangeable pairs},\n\tauthor       = {Lester Mackey and Michael I. Jordan and Richard Y. Chen and Brendan Farrell and Joel A. 
Tropp},\n\tyear         = 2014,\n\tjournal      = {The Annals of Probability},\n\tvolume       = 42,\n\tnumber       = 3,\n\tpages        = {906--945}\n}\n@inproceedings{macmahon2006walk,\n\ttitle        = {Walk the talk: Connecting language, knowledge, and action in route instructions},\n\tauthor       = {Matt MacMahon and Brian Stankiewicz and Benjamin Kuipers},\n\tyear         = 2006,\n\tbooktitle    = {National Conference on Artificial Intelligence}\n}\n@article{macua15distributed,\n\ttitle        = {Distributed Policy Evaluation Under Multiple Behavior Strategies},\n\tauthor       = {Sergio Valcarcel Macua and Jianshu Chen and Santiago Zazo and Ali H. Sayed},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tvolume       = 60,\n\tnumber       = 5,\n\tpages        = {1260--1274}\n}\n@inproceedings{madaan2016numerical,\n\ttitle        = {Numerical relation extraction with minimal supervision},\n\tauthor       = {Aman Madaan and Ashish Mittal and Ganesh Ramakrishnan and Sunita Sarawagi},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{maddison2014structured,\n\ttitle        = {Structured generative models of natural source code},\n\tauthor       = {Chris Maddison and Daniel Tarlow},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {649--657}\n}\n@article{maddison2016concrete,\n\ttitle        = {The concrete distribution: A continuous relaxation of discrete random variables},\n\tauthor       = {Chris J Maddison and Andriy Mnih and Yee Whye Teh},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.00712}\n}\n@article{madnani2010generating,\n\ttitle        = {Generating phrasal and sentential paraphrases: A survey of data-driven methods},\n\tauthor       = {Nitin Madnani and Bonnie J Dorr},\n\tyear         = 2010,\n\tjournal      = {Computational 
Linguistics},\n\tvolume       = 36,\n\tnumber       = 3,\n\tpages        = {341--387}\n}\n@article{madras2018learning,\n\ttitle        = {Learning adversarially fair and transferable representations},\n\tauthor       = {David Madras and Elliot Creager and Toniann Pitassi and Richard Zemel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.06309}\n}\n@inproceedings{madras2019fairness,\n\ttitle        = {Fairness through causal awareness: Learning causal latent-variable models for biased data},\n\tauthor       = {David Madras and Elliot Creager and Toniann Pitassi and Richard Zemel},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the Conference on Fairness, Accountability, and Transparency},\n\tpages        = {349--358}\n}\n@inproceedings{Madry2010,\n\ttitle        = {{Faster approximation schemes for fractional multicommodity flow problems via dynamic graph algorithms}},\n\tauthor       = {Madry, Aleksander},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 42nd ACM symposium on Theory of computing - STOC '10},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 121,\n\tdoi          = {10.1145/1806689.1806708},\n\tisbn         = 9781450300506,\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:1003.5907v2},\n\teprint       = {arXiv:1003.5907v2},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Madry - 2010 - Faster approximation schemes for fractional multicommodity flow problems via dynamic graph algorithms.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/Flow}\n}\n@inproceedings{Madry2013,\n\ttitle        = {{Navigating Central Path with Electrical Flows: From Flows to Matchings, and Back}},\n\tauthor       = {Madry, Aleksander},\n\tyear         = 2013,\n\tmonth        = oct,\n\tbooktitle    = {2013 IEEE 54th Annual Symposium on Foundations of Computer Science},\n\tpublisher    = {IEEE},\n\tpages        = {253--262},\n\tdoi          = 
{10.1109/FOCS.2013.35},\n\tisbn         = {978-0-7695-5135-7},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@article{madry2017towards,\n\ttitle        = {Towards deep learning models resistant to adversarial attacks (published at {ICLR} 2018)},\n\tauthor       = {Aleksander Madry and Aleksandar Makelov and Ludwig Schmidt and Dimitris Tsipras and Adrian Vladu},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{madry2018towards,\n\ttitle        = {Towards deep learning models resistant to adversarial attacks},\n\tauthor       = {Aleksander Madry and Aleksandar Makelov and Ludwig Schmidt and Dimitris Tsipras and Adrian Vladu},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@misc{madrylab2019robustnesslib,\n\ttitle        = {Robustness (Python Library)},\n\tauthor       = {Logan Engstrom and Andrew Ilyas and Hadi Salman and Shibani Santurkar and Dimitris Tsipras},\n\tyear         = 2019,\n\thowpublished = {\\url{https://github.com/MadryLab/robustness}}\n}\n@inproceedings{maei10toward,\n\ttitle        = {Toward Off-Policy Learning Control with Function Approximation},\n\tauthor       = {Hamid Reza Maei and Csaba Szepesvári and Shalabh Bhatnagar and Richard S. Sutton},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 27th International Conference on Machine Learning (ICML)},\n\tpages        = {719--726}\n}\n@inproceedings{maei2010toward,\n\ttitle        = {Toward off-policy learning control with function approximation.},\n\tauthor       = {Maei, Hamid Reza and Szepesv{\\'a}ri, Csaba and Bhatnagar, Shalabh and Sutton, Richard S},\n\tyear         = 2010,\n\tbooktitle    = {ICML},\n\tpages        = {719--726}\n}\n@inproceedings{magnini2002right,\n\ttitle        = {Is It the Right Answer? 
Exploiting Web Redundancy for Answer Validation},\n\tauthor       = {Bernardo Magnini and Matteo Negri and Roberto Prevete and Hristo Tanev},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{magwene2003reconstructing,\n\ttitle        = {Reconstructing the temporal ordering of biological samples using microarray data},\n\tauthor       = {Paul M Magwene and Paul Lizardi and Junhyong Kim},\n\tyear         = 2003,\n\tjournal      = {Bioinformatics},\n\tvolume       = 19,\n\tnumber       = 7,\n\tpages        = {842--850}\n}\n@misc{mahadevan14Proximal,\n\ttitle        = {Proximal Reinforcement Learning: A New Theory of Sequential Decision Making in Primal-Dual Spaces},\n\tauthor       = {Sridhar Mahadevan and Bo Liu and Philip S. Thomas and William Dabney and Stephen Giguere and Nicholas Jacek and Ian Gemp and Ji Liu},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1405.6757},\n\tnote         = {CoRR abs/1405.6757}\n}\n@inproceedings{mahadevan2005proto,\n\ttitle        = {Proto-value functions: Developmental reinforcement learning},\n\tauthor       = {Mahadevan, Sridhar},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {553--560},\n\torganization = {ACM}\n}\n@article{mahadevan2009learning,\n\ttitle        = {Learning representation and control in {Markov} decision processes: {New} frontiers},\n\tauthor       = {Mahadevan, Sridhar},\n\tyear         = 2009,\n\tjournal      = {Foundations and Trends{\\textregistered} in Machine Learning},\n\tpublisher    = {Now Publishers, Inc.},\n\tvolume       = 1,\n\tnumber       = 4,\n\tpages        = {403--565}\n}\n@article{mahadevan2012sparse,\n\ttitle        = {Sparse q-learning with mirror descent},\n\tauthor       = {Mahadevan, Sridhar and Liu, Bo},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1210.4893}\n}\n@article{mahadevan2014proximal,\n\ttitle        = 
{Proximal reinforcement learning: A new theory of sequential decision making in primal-dual spaces},\n\tauthor       = {Mahadevan, Sridhar and Liu, Bo and Thomas, Philip and Dabney, Will and Giguere, Steve and Jacek, Nicholas and Gemp, Ian and Liu, Ji},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1405.6757}\n}\n@article{mahajan2009planar,\n\ttitle        = {The planar {k}-means problem is {NP}-hard},\n\tauthor       = {Meena Mahajan and Prajakta Nimbhorkar and Kasturi Varadarajan},\n\tyear         = 2009,\n\tjournal      = {International Workshop on Algorithms and Computation},\n\tpages        = {274--285}\n}\n@article{mahajan2020domain,\n\ttitle        = {Domain generalization using causal matching},\n\tauthor       = {Mahajan, Divyat and Tople, Shruti and Sharma, Amit},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.07500}\n}\n@inproceedings{mahdavi,\n\ttitle        = {Linear convergence with condition number independent access of full gradients},\n\tauthor       = {Zhang, Lijun and Mahdavi, Mehrdad and Jin, Rong},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {980--988}\n}\n@inproceedings{MahdaviZhangJin2013-nonsc,\n\ttitle        = {Mixed optimization for smooth functions},\n\tauthor       = {Mahdavi, Mehrdad and Zhang, Lijun and Jin, Rong},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {674--682}\n}\n@inproceedings{mahendran2015understanding,\n\ttitle        = {Understanding deep image representations by inverting them},\n\tauthor       = {Aravindh Mahendran and Andrea Vedaldi},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{mahmud2018applications,\n\ttitle        = {Applications of deep learning and reinforcement learning to biological data},\n\tauthor       = {Mahmud, Mufti and Kaiser, Mohammed Shamim and Hussain, Amir and 
Vassanelli, Stefano},\n\tyear         = 2018,\n\tjournal      = {IEEE transactions on neural networks and learning systems},\n\tpublisher    = {IEEE},\n\tvolume       = 29,\n\tnumber       = 6,\n\tpages        = {2063--2079}\n}\n@article{mahoney2008tensor,\n\ttitle        = {Tensor-{CUR} decompositions for tensor-based data},\n\tauthor       = {Mahoney, Michael W and Maggioni, Mauro and Drineas, Petros},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 30,\n\tnumber       = 3,\n\tpages        = {957--987}\n}\n@article{mahoney2009cur,\n\ttitle        = {{CUR} matrix decompositions for improved data analysis},\n\tauthor       = {Mahoney, Michael W and Drineas, Petros},\n\tyear         = 2009,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 106,\n\tnumber       = 3,\n\tpages        = {697--702}\n}\n@article{mahoney2011randomized,\n\ttitle        = {Randomized algorithms for matrices and data},\n\tauthor       = {Mahoney, Michael W},\n\tyear         = 2011,\n\tjournal      = {Foundations and Trends{\\textregistered} in Machine Learning},\n\tpublisher    = {Now Publishers Inc.},\n\tvolume       = 3,\n\tnumber       = 2,\n\tpages        = {123--224}\n}\n@incollection{mairal2008discriminative,\n\ttitle        = {Discriminative sparse image models for class-specific edge detection and image interpretation},\n\tauthor       = {Mairal, Julien and Leordeanu, Marius and Bach, Francis and Hebert, Martial and Ponce, Jean},\n\tyear         = 2008,\n\tbooktitle    = {Computer Vision--ECCV 2008},\n\tpublisher    = {Springer},\n\tpages        = {43--56},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@article{Mairal2015-MISO,\n\ttitle        = {{Incremental Majorization-Minimization Optimization with Application to Large-Scale Machine Learning}},\n\tauthor       = {Mairal, Julien},\n\tyear       
  = 2015,\n\tmonth        = apr,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 25,\n\tnumber       = 2,\n\tpages        = {829--855},\n\tdoi          = {10.1137/140957639},\n\tissn         = {1052-6234},\n\turl          = {http://epubs.siam.org/doi/10.1137/140957639},\n\tnote         = {Preliminary version appeared in ICML 2013},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Mairal - 2015 - Incremental Majorization-Minimization Optimization with Application to Large-Scale Machine Learning.pdf:pdf},\n\tkeywords     = {1,10,1137,140957639,90c06,90c25,90c26,ams subject classifications,convex optimization,doi,introduction,majorization-minimization,minimizing upper bounds of,nonconvex optimization,the,the principle of successively},\n\tmendeley-groups = {Optimization/Variance Reduction}\n}\n@inproceedings{mairesse2010phrase,\n\ttitle        = {Phrase-based statistical language generation using graphical models and active learning},\n\tauthor       = {François Mairesse and Milica Gašić and Filip Jurčíček and Simon Keizer and Blaise Thomson and Kai Yu and Steve Young},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1552--1561}\n}\n@article{maji2013fine,\n\ttitle        = {Fine-grained visual classification of aircraft},\n\tauthor       = {Maji, Subhransu and Rahtu, Esa and Kannala, Juho and Blaschko, Matthew and Vedaldi, Andrea},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1306.5151}\n}\n@inproceedings{makarychev2014bilu,\n\ttitle        = {Bilu-linial stable instances of max cut and minimum multiway cut},\n\tauthor       = {Makarychev, Konstantin and Makarychev, Yury and Vijayaraghavan, Aravindan},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the Twenty-Fifth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {890--906},\n\torganization = {Society for Industrial and Applied 
Mathematics}\n}\n@inproceedings{makarychev2014constant,\n\ttitle        = {Constant factor approximation for balanced cut in the pie model},\n\tauthor       = {Makarychev, Konstantin and Makarychev, Yury and Vijayaraghavan, Aravindan},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 46th Annual ACM Symposium on Theory of Computing},\n\tpages        = {41--49},\n\torganization = {ACM}\n}\n@article{makarychev2015learning,\n\ttitle        = {Learning Communities in the Presence of Errors},\n\tauthor       = {Konstantin Makarychev and Yury Makarychev and Aravindan Vijayaraghavan},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{makkuva2019optimal,\n\ttitle        = {Optimal transport mapping via input convex neural networks},\n\tauthor       = {Makkuva, Ashok Vardhan and Taghvaei, Amirhossein and Oh, Sewoong and Lee, Jason D},\n\tyear         = 2020,\n\tjournal      = {International Conference on Machine Learning (ICML)}\n}\n@article{malach2021connection,\n\ttitle        = {The connection between approximation, depth separation and learnability in neural networks},\n\tauthor       = {Malach, Eran and Yehudai, Gilad and Shalev-Shwartz, Shai and Shamir, Ohad},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.00434}\n}\n@inproceedings{malakasiotis2011generate,\n\ttitle        = {A generate and rank approach to sentence paraphrasing},\n\tauthor       = {Prodromos Malakasiotis and Ion Androutsopoulos},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {96--106}\n}\n@inproceedings{malek14linear,\n\ttitle        = {Linear Programming for Large-Scale {Markov} Decision Problems},\n\tauthor       = {Yasin Abbasi-Yadkori and Peter L. 
Bartlett and Alan Malek},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 31st International Conference on Machine Learning},\n\tpages        = {496--504}\n}\n@article{malik06thegap,\n\ttitle        = {On the gap between the quadratic integer programming problem and its semidefinite relaxation},\n\tauthor       = {U. Malik and Imad M. Jaimoukha and G. D. Halikias and S. K. Gungah},\n\tyear         = 2006,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 107,\n\tnumber       = 3,\n\tpages        = {505--515}\n}\n@article{malik2019calibrated,\n\ttitle        = {Calibrated model-based deep reinforcement learning},\n\tauthor       = {Malik, Ali and Kuleshov, Volodymyr and Song, Jiaming and Nemer, Danny and Seymour, Harlan and Ermon, Stefano},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.08312},\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{malinowski2015ask,\n\ttitle        = {Ask your neurons: A neural-based approach to answering questions about images},\n\tauthor       = {Mateusz Malinowski and Marcus Rohrbach and Mario Fritz},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {1--9}\n}\n@misc{MALLET,\n\ttitle        = {MALLET: A Machine Learning for Language Toolkit},\n\tauthor       = {McCallum, Andrew Kachites},\n\tyear         = 2002,\n\turl          = {http://mallet.cs.umass.edu}\n}\n@article{malliavin1995gaussian,\n\ttitle        = {Gaussian Sobolev Spaces and Stochastic Calculus of Variations},\n\tauthor       = {Malliavin, Paul},\n\tyear         = 1995\n}\n@inproceedings{malmaud2014cooking,\n\ttitle        = {Cooking with Semantics},\n\tauthor       = {J. Malmaud and E. Wagner and N. Chang and K. 
Murphy},\n\tyear         = 2014,\n\tbooktitle    = {ACL Workshop on Semantic Parsing},\n\tpages        = {33--38}\n}\n@article{maluuba2016frames,\n\ttitle        = {Frames: A Corpus for Adding Memory to Goal-Oriented Dialogue Systems},\n\tauthor       = {Layla El Asri and Hannes Schulz and Shikhar Sharma and Jeremie Zumer and Justin Harris and Emery Fine and Rahul Mehrotra and Kaheer Suleman},\n\tyear         = 2016,\n\tjournal      = {Maluuba Technical Report}\n}\n@inproceedings{mandelkar2019scaling,\n\ttitle        = {Scaling Robot Supervision to Hundreds of Hours with RoboTurk: Robotic Manipulation Dataset through Human Reasoning and Dexterity},\n\tauthor       = {Ajay Mandlekar and Jonathan Booher and Max Spero and Albert Tung and Anchit Gupta and Yuke Zhu and Animesh Garg and Silvio Savarese and Li Fei-Fei},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@book{mandic2001recurrent,\n\ttitle        = {Recurrent neural networks for prediction: learning algorithms, architectures and stability},\n\tauthor       = {Mandic, Danilo P and Chambers, Jonathon},\n\tyear         = 2001,\n\tpublisher    = {John Wiley \\& Sons, Inc.}\n}\n@article{mangalam2019do,\n\ttitle        = {Do deep neural networks learn shallow learnable examples first?},\n\tauthor       = {Mangalam, Karttikeya and Prabhu, Vinay},\n\tyear         = 2019,\n\tmonth        = jun\n}\n@inproceedings{mani1999tipster,\n\ttitle        = {The {TIPSTER} {SUMMAC} text summarization evaluation},\n\tauthor       = {Inderjeet Mani and Gary Klein and Lynette Hirschman and Therese Firmin and David House and Beth Sundheim},\n\tyear         = 1999,\n\tbooktitle    = {European Association for Computational Linguistics (EACL)}\n}\n@article{mania2018simple,\n\ttitle        = {Simple random search provides a competitive approach to reinforcement learning},\n\tauthor       = {Mania, Horia and Guy, Aurelia and Recht, Benjamin},\n\tyear         = 
2018,\n\tjournal      = {arXiv preprint arXiv:1803.07055}\n}\n@inproceedings{mann07expectation,\n\ttitle        = {Simple, Robust, Scalable Semi-supervised Learning via Expectation Regularization},\n\tauthor       = {Gideon Mann and Andrew McCallum},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {593--600}\n}\n@inproceedings{mann08ge,\n\ttitle        = {Generalized Expectation Criteria for Semi-Supervised Learning of Conditional Random Fields},\n\tauthor       = {Gideon Mann and Andrew McCallum},\n\tyear         = 2008,\n\tbooktitle    = {Human Language Technology and Association for Computational Linguistics (HLT/ACL)},\n\tpages        = {870--878}\n}\n@article{mann2010generalized,\n\ttitle        = {Generalized expectation criteria for semi-supervised learning with weakly labeled data},\n\tauthor       = {Gideon S Mann and Andrew McCallum},\n\tyear         = 2010,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 11,\n\tnumber       = {0},\n\tpages        = {955--984}\n}\n@inproceedings{mannem2010question,\n\ttitle        = {Question generation from paragraphs at UPenn: {QGSTEC} system description},\n\tauthor       = {Prashanth Mannem and Rashmi Prasad and Aravind Joshi},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of QG2010: The Third Workshop on Question Generation},\n\tpages        = {84--91}\n}\n@book{manning2008ir,\n\ttitle        = {Introduction to information retrieval},\n\tauthor       = {Christopher Manning and Prabhakar Raghavan and Hinrich Sch{\\\"u}tze},\n\tyear         = 2008,\n\tpublisher    = {Cambridge University Press},\n\tvolume       = 1\n}\n@inproceedings{manning2011pos,\n\ttitle        = {Part-of-Speech Tagging from 97% to 100%: Is It Time for Some Linguistics?},\n\tauthor       = {Christopher D. 
Manning},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Computational Linguistics and Intelligent Text Processing (CICLing)}\n}\n@inproceedings{manning2014stanford,\n\ttitle        = {The {S}tanford {C}ore{NLP} natural language processing toolkit},\n\tauthor       = {Christopher D. Manning and Mihai Surdeanu and John Bauer and Jenny Finkel and Steven J. Bethard and Davic McClosky},\n\tyear         = 2014,\n\tbooktitle    = {ACL system demonstrations}\n}\n@book{manning99nlp,\n\ttitle        = {Foundations of Statistical Natural Language Processing},\n\tauthor       = {Chris Manning and Hinrich Schütze},\n\tyear         = 1999,\n\tpublisher    = {MIT Press}\n}\n@article{mannor2004sample,\n\ttitle        = {The sample complexity of exploration in the multi-armed bandit problem},\n\tauthor       = {Mannor, Shie and Tsitsiklis, John N},\n\tyear         = 2004,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 5,\n\tnumber       = {Jun},\n\tpages        = {623--648}\n}\n@article{mannor2013algorithmic,\n\ttitle        = {Algorithmic aspects of mean--variance optimization in Markov decision processes},\n\tauthor       = {Mannor, Shie and Tsitsiklis, John N},\n\tyear         = 2013,\n\tjournal      = {European Journal of Operational Research},\n\tpublisher    = {Elsevier},\n\tvolume       = 231,\n\tnumber       = 3,\n\tpages        = {645--653}\n}\n@inproceedings{manshadi2013integrating,\n\ttitle        = {Integrating Programming by Example and Natural Language Programming},\n\tauthor       = {Mehdi Manshadi and Daniel Gildea and James Allen},\n\tyear         = 2013,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{mansimov2019generalized,\n\ttitle        = {A Generalized Framework of Sequence Generation with Application to Undirected Sequence Models},\n\tauthor       = {Elman Mansimov and Alex Wang and Kyunghyun Cho},\n\tyear         = 2019,\n\tjournal      = {arXiv 
preprint arXiv:1905.12790}\n}\n@phdthesis{mansinghka09thesis,\n\ttitle        = {Natively Probabilistic Computation},\n\tauthor       = {Vikash Mansinghka},\n\tyear         = 2009,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@inproceedings{mansinghka2013approximate,\n\ttitle        = {Approximate {B}ayesian image interpretation using generative probabilistic graphics programs},\n\tauthor       = {Vikash Mansinghka and Tejas D. Kulkarni and Yura N. Perov and Josh Tenenbaum},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1520--1528}\n}\n@article{manski1985semiparametric,\n\ttitle        = {Semiparametric analysis of discrete response: Asymptotic properties of the maximum score estimator},\n\tauthor       = {Charles F. Manski},\n\tyear         = 1985,\n\tjournal      = {Journal of Econometrics},\n\tvolume       = 27,\n\tnumber       = 3,\n\tpages        = {313--333}\n}\n@article{manski1988identification,\n\ttitle        = {Identification of binary response models},\n\tauthor       = {Charles F. 
Manski},\n\tyear         = 1988,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 83,\n\tnumber       = 403,\n\tpages        = {729--738}\n}\n@inproceedings{mansour1999complexity,\n\ttitle        = {On the complexity of policy iteration},\n\tauthor       = {Mansour, Yishay and Singh, Satinder},\n\tyear         = 1999,\n\tbooktitle    = {Proceedings of the Fifteenth conference on Uncertainty in artificial intelligence},\n\tpages        = {401--408},\n\torganization = {Morgan Kaufmann Publishers Inc.}\n}\n@inproceedings{mansour2009dams,\n\ttitle        = {Domain Adaptation with Multiple Sources},\n\tauthor       = {Yishay Mansour and Mehryar Mohri and Afshin Rostamizadeh},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1041--1048}\n}\n@article{mansour2009domain,\n\ttitle        = {Domain adaptation: Learning bounds and algorithms},\n\tauthor       = {Mansour, Yishay and Mohri, Mehryar and Rostamizadeh, Afshin},\n\tyear         = 2009,\n\tjournal      = {arXiv preprint arXiv:0902.3430},\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{manurung2000towards,\n\ttitle        = {Towards a Computational Model of Poetry Generation},\n\tauthor       = {Hisar Manurung and Graeme Ritchie and Henry Thompson},\n\tyear         = 2000,\n\tjournal      = {The University of Edinburgh Technical Report}\n}\n@inproceedings{mao2015deep,\n\ttitle        = {Deep captioning with multimodal recurrent neural networks (m-{RNN})},\n\tauthor       = {Junhua Mao and Wei Xu and Yi Yang and Jiang Wang and Zhiheng Huang and Alan Yuille},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@misc{MaoJieming2016-email,\n\tauthor       = {Mao, Jieming},\n\tyear         = 2016,\n\thowpublished = {private communication}\n}\n@inproceedings{MAR10,\n\ttitle        = {Deep learning via Hessian-free 
optimization},\n\tauthor       = {Martens, James},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 27th International Conference on Machine Learning (ICML-10)},\n\tpages        = {735--742}\n}\n@inproceedings{marcheggiani2014hierarchical,\n\ttitle        = {Hierarchical Multi-Label Conditional Random Fields for Aspect-Oriented Opinion Mining},\n\tauthor       = {D Marcheggiani and O T\\\"ackstr\\\"om and A Esuli and F Sebastiani},\n\tyear         = 2014,\n\tbooktitle    = {ECIR}\n}\n@inproceedings{marcovitz2016reverse,\n\ttitle        = {``{R}everse {G}enomics'' Predicts Function of Human Conserved Noncoding Elements},\n\tauthor       = {Amir Marcovitz and Robin Jia and Gill Bejerano},\n\tyear         = 2016,\n\tbooktitle    = {Molecular Biology and Evolution (MBE)}\n}\n@article{marcus1993building,\n\ttitle        = {Building a large annotated corpus of English: The Penn Treebank},\n\tauthor       = {Marcus, Mitchell and Santorini, Beatrice and Marcinkiewicz, Mary Ann},\n\tyear         = 1993\n}\n@manual{marcus1999ptb,\n\ttitle        = {Treebank-3},\n\tauthor       = {Mitchell Marcus and Beatrice Santorini and Mary Ann Marcinkiewicz and Ann Taylor},\n\tyear         = 1999\n}\n@article{marcus93treebank,\n\ttitle        = {Building a large annotated corpus of {E}nglish: the {P}enn {T}reebank},\n\tauthor       = {M. P. Marcus and M. A. Marcinkiewicz and B. 
Santorini},\n\tyear         = 1993,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 19,\n\tpages        = {313--330}\n}\n@inproceedings{marelli2014sick,\n\ttitle        = {A {SICK} cure for the evaluation of compositional distributional semantic models},\n\tauthor       = {Marco Marelli and Stefano Menini and Marco Baroni and Luisa Bentivogli and Raffaella bernardi and Roberto Zamparelli},\n\tyear         = 2014,\n\tbooktitle    = {Language Resources and Evaluation Conference (LREC)}\n}\n@article{margolick2015accelerating,\n\ttitle        = {Accelerating aging research: how can we measure the rate of biologic aging?},\n\tauthor       = {Joseph B Margolick and Luigi Ferrucci},\n\tyear         = 2015,\n\tjournal      = {Experimental Gerontology},\n\tvolume       = 64,\n\tpages        = {78--80}\n}\n@article{markman1988exclusivity,\n\ttitle        = {Children’s Use of Mutual Exclusivity to Constrain the Meanings of Words},\n\tauthor       = {E.M. Markman and G. F. Wachtel},\n\tyear         = 1988,\n\tjournal      = {Cognitive Psychology},\n\tvolume       = 20,\n\tpages        = {125--157}\n}\n@article{markman1990constraints,\n\ttitle        = {Constraints children place on word meanings},\n\tauthor       = {E.M. Markman},\n\tyear         = 1990,\n\tjournal      = {Cognitive Science},\n\tvolume       = 14,\n\tpages        = {57--77}\n}\n@article{maron1998framework,\n\ttitle        = {A framework for multiple-instance learning},\n\tauthor       = {Maron, Oded and Lozano-P{\\'e}rez, Tom{\\'a}s},\n\tyear         = 1998,\n\tjournal      = {Advances in neural information processing systems},\n\tpages        = {570--576}\n}\n@article{maronna1976robust,\n\ttitle        = {Robust estimation of multivariate location and scatter},\n\tauthor       = {Ricardo A. 
Maronna},\n\tyear         = 1976,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 4,\n\tnumber       = 1,\n\tpages        = {51--67}\n}\n@book{maronna2006robust,\n\ttitle        = {Robust Statistics: Theory and Methods},\n\tauthor       = {Ricardo A. Maronna and Douglas R. Martin and Victor J. Yohai},\n\tyear         = 2006,\n\tpublisher    = {Wiley}\n}\n@article{marskin2001markov,\n\ttitle        = {Markov perfect equilibrium. {I}. {O}bservable actions},\n\tauthor       = {Maskin, Eric and Tirole, Jean},\n\tyear         = 2001,\n\tjournal      = {J. Econom. Theory},\n\tvolume       = 100,\n\tnumber       = 2,\n\tpages        = {191--219},\n\tdoi          = {10.1006/jeth.2000.2785},\n\tissn         = {0022-0531},\n\turl          = {https://doi.org/10.1006/jeth.2000.2785},\n\tfjournal     = {Journal of Economic Theory},\n\tmrclass      = {91A20},\n\tmrnumber     = 1860033,\n\tmrreviewer   = {Roy Gardner}\n}\n@inproceedings{martens2010deep,\n\ttitle        = {Deep learning via Hessian-free optimization},\n\tauthor       = {James Martens},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {735--742}\n}\n@conference{marthi2002decayed,\n\ttitle        = {Decayed MCMC Filtering},\n\tauthor       = {Marthi, Bhaskara and Pasula, Hanna and Russell, Stuart J. and Peres, Yuval},\n\tyear         = 2002,\n\tbooktitle    = {UAI},\n\tpages        = {319--326}\n}\n@inproceedings{marthi2005concurrent,\n\ttitle        = {Concurrent hierarchical reinforcement learning},\n\tauthor       = {B. Marthi and C. Guestrin},\n\tyear         = 2005,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{martin2017count,\n\ttitle        = {Count-based exploration in feature space for reinforcement learning},\n\tauthor       = {J. Martin and S. N. Sasikumar and T. Everitt and M. 
Hutter},\n\tyear         = 2017,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{martin2018implicit,\n\ttitle        = {Implicit self-regularization in deep neural networks: Evidence from random matrix theory and implications for learning},\n\tauthor       = {Martin, Charles H and Mahoney, Michael W},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.01075}\n}\n@inproceedings{martins2016softmax,\n\ttitle        = {From softmax to sparsemax: A sparse model of attention and multi-label classification},\n\tauthor       = {Andre Martins and Ramon Astudillo},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1614--1623}\n}\n@inproceedings{marton2009improved,\n\ttitle        = {Improved statistical machine translation using monolingually-derived paraphrases},\n\tauthor       = {Yuval Marton and Chris Callison-Burch and Philip Resnik},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {381--390}\n}\n@article{marzoev2020unnatural,\n\ttitle        = {Unnatural Language Processing: Bridging the Gap Between Synthetic and Natural Language Data},\n\tauthor       = {Alana Marzoev and S. Madden and M. Kaashoek and Michael J. 
Cafarella and Jacob Andreas},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.13645}\n}\n@inproceedings{masaum2012open,\n\ttitle        = {Open language learning for information extraction},\n\tauthor       = {Mausam and Michael Schmitz and Robert Bart and Stephen Soderland and Oren Etzioni},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {523--534}\n}\n@inproceedings{mason2014domain,\n\ttitle        = {Domain-Specific Image Captioning},\n\tauthor       = {Rebecca Mason and Eugene Charniak},\n\tyear         = 2014,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)},\n\tpages        = {2--10}\n}\n@inproceedings{massalin1987superoptimizer,\n\ttitle        = {Superoptimizer -- A Look at the Smallest Program},\n\tauthor       = {Henry Massalin},\n\tyear         = 1987,\n\tbooktitle    = {Architectural Support for Programming Languages and Operating Systems (ASPLOS)}\n}\n@article{massart1990tight,\n\ttitle        = {The tight constant in the Dvoretzky-Kiefer-Wolfowitz inequality},\n\tauthor       = {Pascal Massart},\n\tyear         = 1990,\n\tjournal      = {The annals of Probability},\n\tpages        = {1269--1283}\n}\n@article{massart2007concentration,\n\ttitle        = {Concentration inequalities and model selection},\n\tauthor       = {Massart, Pascal},\n\tyear         = 2007,\n\tpublisher    = {Springer}\n}\n@inproceedings{massoulie2014community,\n\ttitle        = {Community detection thresholds and the weak {R}amanujan property},\n\tauthor       = {Laurent Massouli{\\'e}},\n\tyear         = 2014,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {694--703}\n}\n@book{mathews2006complex,\n\ttitle        = {Complex Analysis for Mathematics and Engineering},\n\tauthor       = {Mathews, John H. 
and Howell, Russell W.},\n\tyear         = 2006,\n\tmonth        = jan,\n\tday          = {09},\n\tpublisher    = {Jones \\& Bartlett Pub},\n\tisbn         = 9780763737481,\n\tedition      = 5,\n\thowpublished = {Hardcover},\n\tkeywords     = {math\\_phys, textbook},\n\tlccn         = 2005031562,\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{matloff1984use,\n\ttitle        = {Use of covariates in randomized response settings},\n\tauthor       = {Norman S Matloff},\n\tyear         = 1984,\n\tjournal      = {Statistics \\& Probability Letters},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {31--34}\n}\n@article{matrone2012real,\n\ttitle        = {Real-time myoelectric control of a multi-fingered hand prosthesis using principal components analysis},\n\tauthor       = {Giulia C Matrone and Christian Cipriani and Maria Chiara Carrozza and Giovanni Magenes},\n\tyear         = 2012,\n\tjournal      = {Journal of Neuroengineering and Rehabilitation},\n\tvolume       = 9,\n\tpages        = {40--40}\n}\n@inproceedings{matsuzaki05latent,\n\ttitle        = {Probabilistic {CFG} with Latent Annotations},\n\tauthor       = {T. Matsuzaki and Y. Miyao and J. Tsujii},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {75--82}\n}\n@article{matthew2012adadelta,\n\ttitle        = {{ADADELTA:} An Adaptive Learning Rate Method},\n\tauthor       = {Matthew D. 
Zeiler},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1212.5701}\n}\n@article{matthews2018gaussian,\n\ttitle        = {Gaussian process behaviour in wide deep neural networks},\n\tauthor       = {Matthews, Alexander G de G and Rowland, Mark and Hron, Jiri and Turner, Richard E and Ghahramani, Zoubin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.11271}\n}\n@inproceedings{matuszek2012grounded,\n\ttitle        = {A joint model of language and perception for grounded attribute learning},\n\tauthor       = {Cynthia Matuszek and Nicholas FitzGerald and Luke Zettlemoyer and Liefeng Bo and Dieter Fox},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1671--1678}\n}\n@inproceedings{matuszek2012learning,\n\ttitle        = {Learning to parse natural language commands to a robot control system},\n\tauthor       = {C. Matuszek and E. Herbst and L. Zettlemoyer and D. Fox},\n\tyear         = 2012,\n\tbooktitle    = {International Symposium on Experimental Robotics (ISER)}\n}\n@inproceedings{matuszek2018groundedlang,\n\ttitle        = {Grounded Language Learning: Where Robotics and {NLP} Meet},\n\tauthor       = {Cynthia Matuszek},\n\tyear         = 2018,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{maurer2009empirical,\n\ttitle        = {Empirical {B}ernstein bounds and sample variance penalization},\n\tauthor       = {Maurer, Andreas and Pontil, Massimiliano},\n\tyear         = 2009,\n\tbooktitle    = {Conference on Learning Theory}\n}\n@article{maurer2016benefit,\n\ttitle        = {The benefit of multitask representation learning},\n\tauthor       = {Maurer, Andreas and Pontil, Massimiliano and Romera-Paredes, Bernardino},\n\tyear         = 2016,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org}\n}\n@article{maxwell2018MLremotesensing,\n\ttitle        = {Implementation of machine-learning classification in remote sensing: an applied review},\n\tauthor       = {Aaron E. Maxwell and Timothy A. Warner and Fang Fang},\n\tyear         = 2018,\n\tjournal      = {International Journal of Remote Sensing},\n\tvolume       = 39,\n\tnumber       = 9,\n\tpages        = {2784--2817}\n}\n@inproceedings{mayfield2012evaluating,\n\ttitle        = {Evaluating the quality of a knowledge base populated from text},\n\tauthor       = {James Mayfield and Tim Finin},\n\tyear         = 2012,\n\tbooktitle    = {Joint Workshop on Automatic Knowledge Base Construction and Web-scale Knowledge Extraction}\n}\n@article{mayzlin2006promotional,\n\ttitle        = {Promotional chat on the Internet},\n\tauthor       = {Dina Mayzlin},\n\tyear         = 2006,\n\tjournal      = {Marketing Science},\n\tvolume       = 25,\n\tnumber       = 2,\n\tpages        = {155--163}\n}\n@techreport{mayzlin2012promotional,\n\ttitle        = {Promotional reviews: An empirical investigation of online review manipulation},\n\tauthor       = {Dina Mayzlin and Yaniv Dover and Judith A. 
Chevalier},\n\tyear         = 2012,\n\tinstitution  = {National Bureau of Economic Research}\n}\n@inproceedings{mazare2018training,\n\ttitle        = {Training Millions of Personalized Dialogue Agents},\n\tauthor       = {Pierre-Emmanuel Mazar\\'{e} and Samuel Humeau and Martin Raison and Antoine Bordes},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2775--2779}\n}\n@article{mazumder2010spectral,\n\ttitle        = {Spectral regularization algorithms for learning large incomplete matrices},\n\tauthor       = {Mazumder, Rahul and Hastie, Trevor and Tibshirani, Robert},\n\tyear         = 2010,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 11,\n\tnumber       = {Aug},\n\tpages        = {2287--2322}\n}\n@article{mazumder2018towards,\n\ttitle        = {Towards a Continuous Knowledge Learning Engine for Chatbots},\n\tauthor       = {Sahisnu Mazumder and Nianzu Ma and Bing Liu},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.06024}\n}\n@article{mcallester2013pac,\n\ttitle        = {A {PAC}-{B}ayesian tutorial with a dropout bound},\n\tauthor       = {David McAllester},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1307.2118}\n}\n@inproceedings{mccallum00memm,\n\ttitle        = {Maximum Entropy {M}arkov Models for Information Extraction and Segmentation},\n\tauthor       = {Andrew McCallum and Dayne Freitag and Fernando Pereira},\n\tyear         = 2000,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{mccallum2003efficiently,\n\ttitle        = {Efficiently inducing features of conditional random fields},\n\tauthor       = {Andrew McCallum},\n\tyear         = 2003,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {403--410}\n}\n@inproceedings{mccann2017learned,\n\ttitle        = {Learned in translation: Contextualized word vectors},\n\tauthor       = {Bryan 
McCann and James Bradbury and Caiming Xiong and Richard Socher},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {6297--6308}\n}\n@article{mccann2018natural,\n\ttitle        = {The natural language decathlon: Multitask learning as question answering},\n\tauthor       = {Bryan McCann and Nitish Shirish Keskar and Caiming Xiong and Richard Socher},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.08730}\n}\n@inproceedings{mccarthy1984some,\n\ttitle        = {Some expert systems need common sense},\n\tauthor       = {John McCarthy},\n\tyear         = 1984,\n\tbooktitle    = {Proceedings of a symposium on Computer culture: The scientific, intellectual, and social impact of the computer}\n}\n@inproceedings{mccarthy2002ls,\n\ttitle        = {Lexical Substitution as a Task for {WSD} Evaluation},\n\tauthor       = {Diana McCarthy},\n\tyear         = 2002,\n\tbooktitle    = {{SIGLEX}/{SENSEVAL} Workshop on Word Sense Disambiguation: Recent Successes and Future Direction}\n}\n@inproceedings{mccarthy2007semeval,\n\ttitle        = {Sem{E}val-2007 Task 10: {E}nglish Lexical Substitution Task},\n\tauthor       = {Diana McCarthy and Roberto Navigli},\n\tyear         = 2007,\n\tbooktitle    = {Workshop on Semantic Evaluations}\n}\n@article{mcclearn1997biogerontologic,\n\ttitle        = {Biogerontologic theories},\n\tauthor       = {Gerald E McClearn},\n\tyear         = 1997,\n\tjournal      = {Experimental Gerontology},\n\tvolume       = 32,\n\tnumber       = 1,\n\tpages        = {3--10}\n}\n@article{mccloskey1989catastrphic,\n\ttitle        = {Catastrophic interference in connectionist networks: The sequential learning problem},\n\tauthor       = {Michael McCloskey and Neal J Cohen},\n\tyear         = 1989,\n\tjournal      = {The psychology of learning and motivation},\n\tvolume       = 24\n}\n@article{mccloskey2020machine,\n\ttitle        = {Machine Learning on {DNA}-Encoded Libraries: 
A New Paradigm for Hit Finding},\n\tauthor       = {Kevin McCloskey and Eric A Sigel and Steven Kearnes and Ling Xue and Xia Tian and Dennis Moccia and Diana Gikunju and Sana Bazzaz and Betty Chan and Matthew A Clark and others},\n\tyear         = 2020,\n\tjournal      = {Journal of Medicinal Chemistry}\n}\n@inproceedings{mcclosky06self,\n\ttitle        = {Reranking and Self-Training for Parser Adaptation},\n\tauthor       = {David McClosky and Eugene Charniak and Mark Johnson},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{mcclosky2006effective,\n\ttitle        = {Effective self-training for parsing},\n\tauthor       = {David McClosky and Eugene Charniak and Mark Johnson},\n\tyear         = 2006,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@phdthesis{mcclosky2010any,\n\ttitle        = {Any domain parsing: automatic domain adaptation for natural language parsing},\n\tauthor       = {David McClosky},\n\tyear         = 2010,\n\tschool       = {Brown University}\n}\n@inproceedings{mccoy2019right,\n\ttitle        = {Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference},\n\tauthor       = {R Thomas McCoy and Ellie Pavlick and Tal Linzen},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{mccoy2020berts,\n\ttitle        = {BERTs of a feather do not generalize together: Large variability in generalization across models with similar test set performance},\n\tauthor       = {R. 
Thomas McCoy and Junghyun Min and Tal Linzen},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the Third BlackBoxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP @ EMNLP}\n}\n@inproceedings{mcdonald05online,\n\ttitle        = {Online Large-Margin Training of Dependency Parsers},\n\tauthor       = {Ryan McDonald and Koby Crammer and Fernando Pereira},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {91--98}\n}\n@inproceedings{mcgovern2001subgoals,\n\ttitle        = {Automatic Discovery of Subgoals in Reinforcement Learning using Diverse Density},\n\tauthor       = {Amy McGovern and Andrew G. Barto},\n\tyear         = 2001,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{mchugh1997lore,\n\ttitle        = {Lore: A database management system for semistructured data},\n\tauthor       = {Jason McHugh and Serge Abiteboul and Roy Goldman and Dallan Quass and Jennifer Widom},\n\tyear         = 1997,\n\tjournal      = {SIGMOD record},\n\tvolume       = 26\n}\n@article{mckenzie2018robust,\n\ttitle        = {A New Algorithm for the Robust Semi-Random Independent Set Problem},\n\tauthor       = {Theo McKenzie and Hermish Mehta and Luca Trevisan},\n\tyear         = 2018,\n\tjournal      = {arXiv}\n}\n@inproceedings{mckeown2005summaries,\n\ttitle        = {Do Summaries Help? 
A Task-Based Evaluation of Multi-Document Summarization},\n\tauthor       = {Kathleen Mckeown and Rebecca J Passonneau and David K Elson and Julia Hirschberg},\n\tyear         = 2005,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)}\n}\n@article{mckinney2020international,\n\ttitle        = {International evaluation of an {AI} system for breast cancer screening},\n\tauthor       = {Scott Mayer McKinney and Marcin Sieniek and Varun Godbole and Jonathan Godwin and Natasha Antropova and Hutan Ashrafian and Trevor Back and Mary Chesus and Greg C Corrado and Ara Darzi and others},\n\tyear         = 2020,\n\tjournal      = {Nature},\n\tvolume       = 577,\n\tnumber       = 7788,\n\tpages        = {89--94}\n}\n@book{mclachlan2004finite,\n\ttitle        = {Finite mixture models},\n\tauthor       = {Geoffrey McLachlan and David Peel},\n\tyear         = 2004,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@inproceedings{mcmahan2004online,\n\ttitle        = {Online Geometric Optimization in the Bandit Setting Against an Adaptive Adversary},\n\tauthor       = {McMahan, H Brendan and Blum, Avrim},\n\tyear         = 2004,\n\tbooktitle    = {COLT 2004},\n\tvolume       = 17,\n\tpages        = 109,\n\torganization = {Springer}\n}\n@article{McMahan2011,\n\ttitle        = {{A Unified View of Regularized Dual Averaging and Mirror Descent with Implicit Updates}},\n\tauthor       = {McMahan, H. Brendan},\n\tyear         = 2011,\n\tmonth        = sep,\n\tjournal      = {arXiv preprint arXiv:1009.3240},\n\tnote         = {Previously appeared in AISTATS 2011 as a conference paper entitled ``{Follow-the-regularized-leader and mirror descent: Equivalence theorems and l1 regularization}''},\n\tabstract     = {We study three families of online convex optimization algorithms: follow-the-proximally-regularized-leader (FTRL-Proximal), regularized dual averaging (RDA), and composite-objective mirror descent. 
We first prove equivalence theorems that show all of these algorithms are instantiations of a general FTRL update. This provides theoretical insight on previous experimental observations. In particular, even though the FOBOS composite mirror descent algorithm handles L1 regularization explicitly, it has been observed that RDA is even more effective at producing sparsity. Our results demonstrate that FOBOS uses subgradient approximations to the L1 penalty from previous rounds, leading to less sparsity than RDA, which handles the cumulative penalty in closed form. The FTRL-Proximal algorithm can be seen as a hybrid of these two, and outperforms both on a large, real-world dataset. Our second contribution is a unified analysis which produces regret bounds that match (up to logarithmic terms) or improve the best previously known bounds. This analysis also extends these algorithms in two important ways: we support a more general type of composite objective and we analyze implicit updates, which replace the subgradient approximation of the current loss function with an exact optimization.},\n\tannote       = {This is presumably the journal version of \"Follow-the-regularized-leader and mirror descent: Equivalence theorems and l1 regularization\"},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1009.3240},\n\teprint       = {1009.3240},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/McMahan - 2011 - A Unified View of Regularized Dual Averaging and Mirror Descent with Implicit Updates.pdf:pdf},\n\tkeywords     = {bounds,follow-the-leader algorithms,online convex optimization,online learning,regret,subgradient methods},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@inproceedings{mcmahan2013ad,\n\ttitle        = {Ad click prediction: a view from the trenches},\n\tauthor       = {Brendan H. 
Mc{M}ahan and Gary Holt and D Sculley and Michael Young and Dietmar Ebner and Julian Grady and Lan Nie and Todd Phillips and Eugene Davydov and Daniel Golovin and others},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@inproceedings{McMahanStreeter2010,\n\ttitle        = {{Adaptive Bound Optimization for Online Convex Optimization}},\n\tauthor       = {McMahan, H. Brendan and Streeter, Matthew},\n\tyear         = 2010,\n\tmonth        = feb,\n\tbooktitle    = {Proceedings of the 23rd Annual Conference on Learning Theory - COLT '10},\n\tabstract     = {We introduce a new online convex optimization algorithm that adaptively chooses its regularization function based on the loss functions observed so far. This is in contrast to previous algorithms that use a fixed regularization function such as L2-squared, and modify it only via a single time-dependent parameter. Our algorithm's regret bounds are worst-case optimal, and for certain realistic classes of loss functions they are much better than existing bounds. These bounds are problem-dependent, which means they can exploit the structure of the actual problem instance. Critically, however, our algorithm does not need to know this structure in advance. 
Rather, we prove competitive guarantees that show the algorithm provides a bound within a constant factor of the best possible bound (of a certain functional form) in hindsight.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1002.4908},\n\teprint       = {1002.4908},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@inproceedings{mcmillan06lazy,\n\ttitle        = {Lazy Abstraction with Interpolants},\n\tauthor       = {Ken McMillan},\n\tyear         = 2006,\n\tbooktitle    = {Computer Aided Verification (CAV)},\n\tpages        = {123--136}\n}\n@article{mcnamara2017provably,\n\ttitle        = {Provably fair representations},\n\tauthor       = {Daniel McNamara and Cheng Soon Ong and Robert C Williamson},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.04394}\n}\n@inproceedings{McSherry01,\n\ttitle        = {Spectral Partitioning of Random Graphs},\n\tauthor       = {F. McSherry},\n\tyear         = 2001,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{mcsherry2001spectral,\n\ttitle        = {Spectral partitioning of random graphs},\n\tauthor       = {McSherry, Frank},\n\tyear         = 2001,\n\tbooktitle    = {Proceedings 42nd IEEE Symposium on Foundations of Computer Science},\n\tpages        = {529--537},\n\torganization = {IEEE}\n}\n@inproceedings{mcwilliams2013correlated,\n\ttitle        = {Correlated random features for fast semi-supervised learning},\n\tauthor       = {B. McWilliams and D. Balduzzi and J. 
Buhmann},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {440--448}\n}\n@inproceedings{meehan1977tale,\n\ttitle        = {{TALE-SPIN}, An Interactive Program that Writes Stories},\n\tauthor       = {James R Meehan},\n\tyear         = 1977,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{Megretski08,\n\ttitle        = {Convex Optimization in Robust Identification of Nonlinear Feedback},\n\tauthor       = {Alexandre Megretski},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 47th Conference on Decision and Control},\n\tdate-added   = {2016-04-02 18:56:48 +0000},\n\tdate-modified = {2016-04-02 18:58:18 +0000}\n}\n@inproceedings{mehta2006trajectory,\n\ttitle        = {On Trajectory Representation for Scientific Features},\n\tauthor       = {Mehta, S. and Parthasarathy, S. and Machiraju, R.},\n\tyear         = 2006,\n\tmonth        = dec,\n\tbooktitle    = {ICDM '06. 
IEEE Sixth International Conference on Data Mining},\n\tpages        = {997--1001},\n\tdoi          = {10.1109/ICDM.2006.120},\n\tissn         = {1550-4786},\n\tkeywords     = {\n\t\tmotion parameters;scientific features;shape parameters;trajectory\n\n\t\trepresentation algorithms;feature extraction;geometry;image motion\n\n\t\tanalysis;image representation;\n\t}\n}\n@inproceedings{mei2015security,\n\ttitle        = {The Security of Latent {D}irichlet Allocation},\n\tauthor       = {Shike Mei and Xiaojin Zhu},\n\tyear         = 2015,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{mei2015teaching,\n\ttitle        = {Using Machine Teaching to Identify Optimal Training-Set Attacks on Machine Learners},\n\tauthor       = {Shike Mei and Xiaojin Zhu},\n\tyear         = 2015,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{mei2016landscape,\n\ttitle        = {The landscape of empirical risk for non-convex losses},\n\tauthor       = {Mei, Song and Bai, Yu and Montanari, Andrea},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1607.06534}\n}\n@inproceedings{mei2016listen,\n\ttitle        = {Listen, attend, and walk: Neural mapping of navigational instructions to action sequences},\n\tauthor       = {Hongyuan Mei and Mohit Bansal and Matthew R Walter},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{mei2016what,\n\ttitle        = {What to talk about and how? Selective Generation using {LSTM}s with Coarse-to-Fine Alignment},\n\tauthor       = {Hongyuan Mei and Mohit Bansal and Matthew R. 
Walter},\n\tyear         = 2016,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@inproceedings{mei2017coherent,\n\ttitle        = {Coherent Dialogue with Attention-Based Language Models},\n\tauthor       = {Hongyuan Mei and Mohit Bansal and Matthew R. Walter},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@misc{mei2017landscape,\n\ttitle        = {The Landscape of Empirical Risk for Non-convex Losses},\n\tauthor       = {Song Mei and Yu Bai and Andrea Montanari},\n\tyear         = 2017,\n\teprint       = {1607.06534},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {stat.ML}\n}\n@article{mei2017solving,\n\ttitle        = {Solving SDPs for synchronization and MaxCut problems via the Grothendieck inequality},\n\tauthor       = {Mei, Song and Misiakiewicz, Theodor and Montanari, Andrea and Oliveira, Roberto I},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.08729}\n}\n@article{mei2018mean,\n\ttitle        = {A Mean Field View of the Landscape of Two-Layers Neural Networks},\n\tauthor       = {Mei, Song and Montanari, Andrea and Nguyen, Phan-Minh},\n\tyear         = 2018,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpages        = {E7665--E7671}\n}\n@article{mei2019generalization,\n\ttitle        = {The generalization error of random features regression: Precise asymptotics and double descent curve},\n\tauthor       = {Song Mei and Andrea Montanari},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1908.05355}\n}\n@article{meinshausen2009,\n\ttitle        = {Lasso-type recovery of sparse representations for high-dimensional data},\n\tauthor       = {Meinshausen, Nicolai and Yu, Bin},\n\tyear         = 2009,\n\tmonth        = {02},\n\tjournal      = {Ann. 
Statist.},\n\tpublisher    = {The Institute of Mathematical Statistics},\n\tvolume       = 37,\n\tnumber       = 1,\n\tpages        = {246--270},\n\tdoi          = {10.1214/07-AOS582},\n\turl          = {http://dx.doi.org/10.1214/07-AOS582},\n\tfjournal     = {The Annals of Statistics}\n}\n@article{meinshausen2015maximin,\n\ttitle        = {Maximin effects in inhomogeneous large-scale data},\n\tauthor       = {Nicolai Meinshausen and Peter B\\\"uhlmann},\n\tyear         = 2015,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 43\n}\n@article{meister2017data,\n\ttitle        = {A Data Prism: Semi-Verified Learning in the Small-Alpha Regime},\n\tauthor       = {Michela Meister and Gregory Valiant},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{melamed04smt,\n\ttitle        = {Statistical Machine Translation by Parsing},\n\tauthor       = {I. Dan Melamed},\n\tyear         = 2004,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{melamud2013context,\n\ttitle        = {A Two Level Model for Context Sensitive Inference Rules},\n\tauthor       = {Oren Melamud and Jonathan Berant and Ido Dagan and Jacob Goldberger and Idan Szpektor},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{melamud2015modeling,\n\ttitle        = {Modeling word meaning in context with substitute vectors},\n\tauthor       = {Melamud, Oren and Dagan, Ido and Goldberger, Jacob},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{melis2018towards,\n\ttitle        = {Towards robust interpretability with self-explaining neural networks},\n\tauthor       = {David Alvarez Melis and Tommi Jaakkola},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {7775--7784}\n}\n@inproceedings{mellish00dependencies,\n\ttitle        = {A 
Representation for Complex and Evolving Data Dependencies in Generation},\n\tauthor       = {C. Mellish and R. Evans and L. Cahill and C. Doran and D. Paiva and M. Reape and D. Scott and N. Tipper},\n\tyear         = 2000,\n\tbooktitle    = {Applied Natural Language Processing and North American Association for Computational Linguistics (ANLP/NAACL)},\n\tpages        = {119--126}\n}\n@inproceedings{melo2007q,\n\ttitle        = {Q-learning with linear function approximation},\n\tauthor       = {Melo, Francisco S and Ribeiro, M Isabel},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Computational Learning Theory},\n\tpages        = {308--322},\n\torganization = {Springer}\n}\n@inproceedings{melo2008analysis,\n\ttitle        = {An analysis of reinforcement learning with function approximation},\n\tauthor       = {Melo, Francisco S and Meyn, Sean P and Ribeiro, M Isabel},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 25th international conference on Machine learning},\n\tpages        = {664--671},\n\torganization = {ACM}\n}\n@article{menard2020fast,\n\ttitle        = {Fast active learning for pure exploration in reinforcement learning},\n\tauthor       = {M{\\'e}nard, Pierre and Domingues, Omar Darwiche and Jonsson, Anders and Kaufmann, Emilie and Leurent, Edouard and Valko, Michal},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.13442}\n}\n@inproceedings{mendonca2019guided,\n\ttitle        = {Guided meta-policy search},\n\tauthor       = {Russell Mendonca and Abhishek Gupta and Rosen Kralev and Pieter Abbeel and Sergey Levine and Chelsea Finn},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {9653--9664}\n}\n@article{meng2017magnet,\n\ttitle        = {MagNet: a Two-Pronged Defense against Adversarial Examples},\n\tauthor       = {Dongyu Meng and Hao Chen},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint 
arXiv:1705.09064}\n}\n@article{meng2020dynamic,\n\ttitle        = {Dynamic of Stochastic Gradient Descent with State-Dependent Noise},\n\tauthor       = {Meng, Qi and Gong, Shiqi and Chen, Wei and Ma, Zhi-Ming and Liu, Tie-Yan},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.13719}\n}\n@article{mengdi2017primal,\n\ttitle        = {Primal-Dual {\\(\\pi\\)} Learning: Sample Complexity and Sublinear Run Time for Ergodic Markov Decision Problems},\n\tauthor       = {Mengdi Wang},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1710.06100},\n\tarchiveprefix = {arXiv},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tbiburl       = {http://dblp.org/rec/bib/journals/corr/abs-1710-06100},\n\teprint       = {1710.06100},\n\ttimestamp    = {Wed, 01 Nov 2017 19:05:42 +0100}\n}\n@inproceedings{menon2012ranking,\n\ttitle        = {Predicting accurate probabilities with a ranking loss},\n\tauthor       = {Aditya Krishna Menon and Xiaoqian Jiang and Shankar Vembu and Charles Elkan and Lucila Ohno{-}Machado},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{menon2013machine,\n\ttitle        = {A machine learning framework for programming by example},\n\tauthor       = {A Menon and Omer Tamuz and Sumit Gulwani and Butler Lampson and A Kalai},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{merialdo94tagging,\n\ttitle        = {Tagging {E}nglish text with a probabilistic model},\n\tauthor       = {Bernard Merialdo},\n\tyear         = 1994,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 20,\n\tpages        = {155--171}\n}\n@article{MerikoskiKumar2004,\n\ttitle        = {Inequalities For Spreads Of Matrix Sums And Products},\n\tauthor       = {Jorma K. 
Merikoski and Ravinder Kumar},\n\tyear         = 2004,\n\tjournal      = {Applied Mathematics E-Notes},\n\tvolume       = 4,\n\tpages        = {150--159}\n}\n@article{merity2016pointer,\n\ttitle        = {Pointer Sentinel Mixture Models},\n\tauthor       = {Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.07843}\n}\n@article{merler2019diversity,\n\ttitle        = {Diversity in faces},\n\tauthor       = {Michele Merler and Nalini Ratha and Rogerio S Feris and John R Smith},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.10436}\n}\n@article{mertens1982stochastic,\n\ttitle        = {Stochastic games have a value},\n\tauthor       = {Mertens, Jean-Francois and Neyman, Abraham},\n\tyear         = 1982,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Academy of Sciences},\n\tvolume       = 79,\n\tnumber       = 6,\n\tpages        = 2145\n}\n@article{merton1979normative,\n\ttitle        = {The normative structure of science},\n\tauthor       = {Robert K Merton},\n\tyear         = 1979,\n\tjournal      = {The sociology of science: Theoretical and empirical investigations},\n\tpages        = {267--278}\n}\n@article{mesnil2014using,\n\ttitle        = {Using recurrent neural networks for slot filling in spoken language understanding},\n\tauthor       = {Gr{\\'e}goire Mesnil and Yann Dauphin and Kaisheng Yao and Yoshua Bengio and Li Deng and Dilek Hakkani-Tur and Xiaodong He and Larry Heck and Gokhan Tur and Dong Yu},\n\tyear         = 2014,\n\tjournal      = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},\n\tvolume       = 23\n}\n@article{METIS1998,\n\ttitle        = {A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs},\n\tauthor       = {Karypis, George and Kumar, Vipin},\n\tyear         = 1998,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Scientific 
Computing},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {359--392}\n}\n@article{metropolis1953equation,\n\ttitle        = {Equation of state calculations by fast computing machines},\n\tauthor       = {Nicholas Metropolis and Arianna W. Rosenbluth and Marshall N. Rosenbluth and Augusta H. Teller and Edward Teller},\n\tyear         = 1953,\n\tjournal      = {The journal of chemical physics},\n\tvolume       = 21,\n\tnumber       = 6,\n\tpages        = {1087--1092}\n}\n@article{metropolis1953equations,\n\ttitle        = {Equations of State Calculations by Fast Computing Machines},\n\tauthor       = {N. Metropolis and A.W. Rosenbluth and M.N. Rosenbluth and A.H. Teller and E. Teller},\n\tyear         = 1953,\n\tjournal      = {Journal of Chemical Physics},\n\tvolume       = 21,\n\tnumber       = 6,\n\tpages        = {1087--1092}\n}\n@inproceedings{metsis2006spam,\n\ttitle        = {Spam Filtering with Naive {B}ayes -- Which Naive {B}ayes?},\n\tauthor       = {Vangelis Metsis and Ion Androutsopoulos and Georgios Paliouras},\n\tyear         = 2006,\n\tbooktitle    = {CEAS},\n\tvolume       = 17,\n\tpages        = {28--69}\n}\n@inproceedings{mey2016softlabeled,\n\ttitle        = {A Soft-Labeled Self-Training Approach},\n\tauthor       = {Alexander Mey and Marco Loog},\n\tyear         = 2016,\n\tbooktitle    = {23rd International Conference on Pattern Recognition}\n}\n@article{meyer1973inverse,\n\ttitle        = {Generalised Inversion of Modified Matrices},\n\tauthor       = {Carl D. 
Meyer},\n\tyear         = 1973,\n\tjournal      = {SIAM Journal on Applied Mathematics},\n\tvolume       = 24,\n\tnumber       = 3,\n\tpages        = {315--323}\n}\n@article{meyer2014identifying,\n\ttitle        = {Identifying and mitigating bias in next-generation sequencing methods for chromatin biology},\n\tauthor       = {Clifford A Meyer and X Shirley Liu},\n\tyear         = 2014,\n\tjournal      = {Nature Reviews Genetics},\n\tvolume       = 15,\n\tnumber       = 11,\n\tpages        = {709--721}\n}\n@article{meyn1994computable,\n\ttitle        = {Computable bounds for geometric convergence rates of {M}arkov chains},\n\tauthor       = {SP Meyn and RL Tweedie},\n\tyear         = 1994,\n\tjournal      = {The Annals of Applied Probability},\n\tvolume       = 4,\n\tnumber       = 4,\n\tpages        = {981--1011}\n}\n@article{mh18,\n\ttitle        = {When Recurrent Models Don't Need To Be Recurrent},\n\tauthor       = {Miller, John and Hardt, Moritz},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.10369}\n}\n@article{mianjy2018implicit,\n\ttitle        = {On the implicit bias of dropout},\n\tauthor       = {Mianjy, Poorya and Arora, Raman and Vidal, Rene},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.09777}\n}\n@article{mianjy2019dropout,\n\ttitle        = {On dropout and nuclear norm regularization},\n\tauthor       = {Mianjy, Poorya and Arora, Raman},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.11887}\n}\n@inproceedings{miao2016language,\n\ttitle        = {Language as a Latent Variable: Discrete Generative Models for Sentence Compression},\n\tauthor       = {Yishu Miao and Phil Blunsom},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {319--328}\n}\n@inproceedings{michael2018qamr,\n\ttitle        = {Crowdsourcing Question--Answer Meaning Representations},\n\tauthor       = {Julian Michael and Gabriel Stanovsky and Luheng 
He and Ido Dagan and Luke Zettlemoyer},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{michaeli2015nonparametric,\n\ttitle        = {Nonparametric Canonical Correlation Analysis},\n\tauthor       = {Michaeli, Tomer and Wang, Weiran and Livescu, Karen},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint},\n\tvolume       = {abs/1511.04839}\n}\n@inproceedings{michel2019adversarial,\n\ttitle        = {On Evaluation of Adversarial Perturbations for Sequence-to-Sequence Models},\n\tauthor       = {Paul Michel and Xian Li and Graham Neubig and Juan Miguel Pino},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@book{michele2004eliminating,\n\ttitle        = {Eliminating Health Disparities: Measurement and Data Needs},\n\tauthor       = {Michele Ver Ploeg and Edward Perrin and Panel on DHHS Collection of Race and Ethnic Data},\n\tyear         = 2004,\n\tpublisher    = {National Academies Press}\n}\n@inproceedings{mikolov2011extensions,\n\ttitle        = {Extensions of recurrent neural network language model},\n\tauthor       = {Mikolov, Tomas and Kombrink, Stefan and Burget, Lukas and Cernocky, JH and Khudanpur, Sanjeev},\n\tyear         = 2011,\n\tbooktitle    = {Acoustics, Speech and Signal Processing (ICASSP), 2011 IEEE International Conference on},\n\tpages        = {5528--5531},\n\torganization = {IEEE}\n}\n@inproceedings{mikolov2013distributed,\n\ttitle        = {Distributed representations of words and phrases and their compositionality},\n\tauthor       = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. 
and Dean, Jeff},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{mikolov2013efficient,\n\ttitle        = {Efficient estimation of word representations in vector space},\n\tauthor       = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},\n\tyear         = 2013,\n\tjournal      = {Proceedings of the International Conference on Learning Representations}\n}\n@inproceedings{mikolov2013linguistic,\n\ttitle        = {Linguistic Regularities in Continuous Space Word Representations},\n\tauthor       = {Mikolov, Tomas and Yih, Wen-tau and Zweig, Geoffrey},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},\n\tvolume       = 13,\n\tpages        = {746--751}\n}\n@article{mikolov2014learning,\n\ttitle        = {Learning Longer Memory in Recurrent Neural Networks},\n\tauthor       = {Tomas Mikolov and Armand Joulin and Sumit Chopra and Michael Mathieu and Marc'Aurelio Ranzato},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.7753}\n}\n@inproceedings{milch2005approximate,\n\ttitle        = {Approximate inference for infinite contingent {B}ayesian networks},\n\tauthor       = {Brian Milch and Bhaskara Marthi and David Sontag and Stuart Russell and Daniel L Ong and Andrey Kolobov},\n\tyear         = 2005,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {238--245}\n}\n@inproceedings{milch2005blog,\n\ttitle        = {Blog: Probabilistic models with unknown objects},\n\tauthor       = {Brian Milch and Bhaskara Marthi and Stuart Russell and David Sontag and Daniel L. Ong and Andrey Kolobov},\n\tyear         = 2005,\n\tbooktitle    = {In IJCAI},\n\tpages        = {1352--1359}\n}\n@inproceedings{miller04ner,\n\ttitle        = {Name Tagging with Word Clusters and Discriminative Training},\n\tauthor       = {S. 
Miller and J. Guinness and A. Zamanian},\n\tyear         = 2004,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {337--342}\n}\n@article{miller1956psychological,\n\ttitle        = {Some Psychological Methods for Evaluating the Quality of Translations},\n\tauthor       = {George A. Miller and J. G. Beebe-Center},\n\tyear         = 1956,\n\tjournal      = {Mechanical Translation},\n\tvolume       = 3,\n\tpages        = {73--80}\n}\n@article{miller1981inverse,\n\ttitle        = {On the inverse of the sum of matrices},\n\tauthor       = {Kenneth S Miller},\n\tyear         = 1981,\n\tjournal      = {Mathematics magazine},\n\tvolume       = 54,\n\tnumber       = 2,\n\tpages        = {67--72}\n}\n@article{miller1990empirical,\n\ttitle        = {An empirical study of the reliability of {UNIX} utilities},\n\tauthor       = {Barton P Miller and Louis Fredriksen and Bryan So},\n\tyear         = 1990,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 33,\n\tnumber       = 12,\n\tpages        = {32--44}\n}\n@article{miller2005eliciting,\n\ttitle        = {Eliciting informative feedback: The peer-prediction method},\n\tauthor       = {Nolan Miller and Paul Resnick and Richard Zeckhauser},\n\tyear         = 2005,\n\tjournal      = {Management Science},\n\tvolume       = 51,\n\tnumber       = 9,\n\tpages        = {1359--1373}\n}\n@inproceedings{miller2008inky,\n\ttitle        = {{I}nky: a sloppy command line for the web with rich visual feedback},\n\tauthor       = {Robert C Miller and Victoria H Chou and Michael Bernstein and Greg Little and Max Van Kleek and David Karger and mc schraefel},\n\tyear         = 2008,\n\tbooktitle    = {User Interface Software and Technology (UIST)},\n\tpages        = {131--140}\n}\n@article{miller2012geometric,\n\ttitle        = {A geometric approach to robotic laundry folding},\n\tauthor       = {S. Miller and J. Van Den Berg and M. Fritz and T. 
Darrell and K. Goldberg and P. Abbeel},\n\tyear         = 2012,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 31\n}\n@inproceedings{miller2016keyvalue,\n\ttitle        = {Key-value memory networks for directly reading documents},\n\tauthor       = {Alexander Miller and Adam Fisch and Jesse Dodge and Amir Hossein Karimi and Antoine Bordes and Jason Weston},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@phdthesis{miller2016thesis,\n\ttitle        = {Adjusting Sense Representations for Word Sense Disambiguation and Automatic Pun Interpretation},\n\tauthor       = {Tristan Miller},\n\tyear         = 2016,\n\tschool       = {Technische Universität Darmstadt}\n}\n@inproceedings{miller2017parlai,\n\ttitle        = {ParlAI: A Dialog Research Software Platform},\n\tauthor       = {Alexander H Miller and Will Feng and Adam Fisch and Jiasen Lu and Dhruv Batra and Antoine Bordes and Devi Parikh and Jason Weston},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {79--84}\n}\n@inproceedings{miller2017semeval,\n\ttitle        = {{S}em{E}val-2017 Task 7: Detection and Interpretation of {E}nglish Puns},\n\tauthor       = {Tristan Miller and Christian Hempelmann and Iryna Gurevych},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 11th International Workshop on Semantic Evaluation}\n}\n@article{miller2020effect,\n\ttitle        = {The Effect of Natural Distribution Shift on Question Answering Models},\n\tauthor       = {John Miller and Karl Krauth and Benjamin Recht and Ludwig Schmidt},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.14444}\n}\n@inproceedings{miller2021line,\n\ttitle        = {Accuracy on the Line: on the Strong Correlation Between Out-of-Distribution and In-Distribution Generalization},\n\tauthor       = {John Miller and Rohan Taori and Aditi Raghunathan and 
Shiori Sagawa and Pang Wei Koh and Vaishaal Shankar and Percy Liang and Yair Carmon and Ludwig Schmidt},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{miller96statistical,\n\ttitle        = {A Fully Statistical Approach to Natural Language Interfaces},\n\tauthor       = {Scott Miller and David Stallard and Robert Bobrow and Richard Schwartz},\n\tyear         = 1996,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {55--61}\n}\n@inproceedings{milletari2016v,\n\ttitle        = {V-net: Fully convolutional neural networks for volumetric medical image segmentation},\n\tauthor       = {Milletari, Fausto and Navab, Nassir and Ahmadi, Seyed-Ahmad},\n\tyear         = 2016,\n\tbooktitle    = {3D Vision (3DV), 2016 Fourth International Conference on},\n\tpages        = {565--571},\n\torganization = {IEEE}\n}\n@article{milovanovic2004relationships,\n\ttitle        = {Relationships between platelets and inflammatory markers in rheumatoid arthritis},\n\tauthor       = {Micha Milovanovic and E Nilsson and Petter J{\\\"a}remo},\n\tyear         = 2004,\n\tjournal      = {Clinica {C}himica {A}cta},\n\tvolume       = 343,\n\tnumber       = 1,\n\tpages        = {237--240}\n}\n@inproceedings{mimno2011optimizing,\n\ttitle        = {Optimizing Semantic Coherence in Topic Models},\n\tauthor       = {David Mimno and Hanna Wallach and Edmund Talley and Miriam Leenders and Andrew McCallum},\n\tyear         = 2011,\n\tbooktitle    = {EMNLP}\n}\n@inproceedings{min2013distant,\n\ttitle        = {Distant Supervision for Relation Extraction with an Incomplete Knowledge Base},\n\tauthor       = {Bonan Min and Ralph Grishman and Li Wan and Chang Wang and David Gondek},\n\tyear         = 2013,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {777--782}\n}\n@inproceedings{min2017transfer,\n\ttitle        = {Question Answering through 
Transfer Learning from Large Fine-grained Supervision Data},\n\tauthor       = {Sewon Min and Minjoon Seo and Hannaneh Hajishirzi},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{min2019compositional,\n\ttitle        = {Compositional Questions Do Not Necessitate Multi-hop Reasoning},\n\tauthor       = {Sewon Min and Eric Wallace and Sameer Singh and Matt Gardner and Hannaneh Hajishirzi and Luke Zettlemoyer},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{min2019discrete,\n\ttitle        = {A discrete hard em approach for weakly supervised question answering},\n\tauthor       = {Sewon Min and Danqi Chen and Hannaneh Hajishirzi and Luke Zettlemoyer},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{min2020curious,\n\ttitle        = {The curious case of adversarially robust models: More data can help, double descend, or hurt generalization},\n\tauthor       = {Yifei Min and Lin Chen and Amin Karbasi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.11080}\n}\n@book{minc1988nonnegative,\n\ttitle        = {Nonnegative matrices},\n\tauthor       = {Minc, Henryk},\n\tyear         = 1988,\n\tpublisher    = {Wiley}\n}\n@inproceedings{minka2001ep,\n\ttitle        = {Expectation propagation for approximate {B}ayesian inference},\n\tauthor       = {Thomas P. 
Minka},\n\tyear         = 2001,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {362--369}\n}\n@inproceedings{mintz2009distant,\n\ttitle        = {Distant supervision for relation extraction without labeled data},\n\tauthor       = {Mike Mintz and Steven Bills and Rion Snow and Dan Jurafsky},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1003--1011}\n}\n@inproceedings{mirkin2009discourse,\n\ttitle        = {Addressing Discourse and Document Structure in the {RTE} Search Task},\n\tauthor       = {Shachar Mirkin and Roy Bar-Haim and Jonathan Berant and Ido Dagan and Eyal Shnarch and Asher Stern and Idan Szpektor},\n\tyear         = 2009,\n\tbooktitle    = {Text Analysis Conference}\n}\n@inproceedings{mirkin2010,\n\ttitle        = {Recognising Entailment within Discourse},\n\tauthor       = {Shachar Mirkin and Jonathan Berant and Ido Dagan and Eyal Shnarch},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{mirrokni2015tight,\n\ttitle        = {Tight Bounds for Approximate {C}arath{\\'e}odory and Beyond},\n\tauthor       = {Vahab Mirrokni and Renato Paes Leme and Adrian Vladu and Sam Chiu-wai Wong},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1512.08602}\n}\n@article{mishra2017simple,\n\ttitle        = {A simple neural attentive meta-learner},\n\tauthor       = {Nikhil Mishra and Mostafa Rohaninejad and Xi Chen and Pieter Abbeel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.03141}\n}\n@inproceedings{misra2014tell,\n\ttitle        = {Tell {M}e {D}ave: Context-sensitive grounding of natural language to mobile manipulation instructions},\n\tauthor       = {DK Misra and J Sung and K Lee and A Saxena},\n\tyear         = 2014,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{misra2015environment,\n\ttitle        = 
{Environment-Driven Lexicon Induction for High-Level Instructions},\n\tauthor       = {Dipendra K. Misra and Kejia Tao and Percy Liang and Ashutosh Saxena},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{misra2017mapping,\n\ttitle        = {Mapping Instructions and Visual Observations to Actions with Reinforcement Learning},\n\tauthor       = {Dipendra K. Misra and John Langford and Yoav Artzi},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{misra2018learning,\n\ttitle        = {Learning by asking questions},\n\tauthor       = {Ishan Misra and Ross Girshick and Rob Fergus and Martial Hebert and Abhinav Gupta and Laurens Van Der Maaten},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {11--20}\n}\n@inproceedings{misra2018mapping,\n\ttitle        = {Mapping Instructions to Actions in {3D} Environments with Visual Goal Prediction},\n\tauthor       = {Dipendra Misra and Andrew Bennett and Valts Blukis and Eyvind Niklasson and Max Shatkhin and Artzi, Yoav},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{misra2018policy,\n\ttitle        = {Policy Shaping and Generalized Update Equations for Semantic Parsing from Denotations},\n\tauthor       = {Dipendra Misra and Ming-Wei Chang and Xiaodong He and Wen-tau Yih},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{misra2019kinematic,\n\ttitle        = {Kinematic State Abstraction and Provably Efficient Rich-Observation Reinforcement Learning},\n\tauthor       = {Misra, Dipendra and Henaff, Mikael and Krishnamurthy, Akshay and Langford, John},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1911.05815},\n\tbooktitle    = {International Conference on Machine 
Learning},\n\tpages        = {6961--6971},\n\torganization = {PMLR}\n}\n@inproceedings{misra2020self,\n\ttitle        = {Self-supervised learning of pretext-invariant representations},\n\tauthor       = {Misra, Ishan and Maaten, Laurens van der},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n\tpages        = {6707--6717}\n}\n@inproceedings{mitchell1977version,\n\ttitle        = {Version spaces: A candidate elimination approach to rule learning},\n\tauthor       = {Tom M Mitchell},\n\tyear         = 1977,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {305--310}\n}\n@article{mitchell2005time,\n\ttitle        = {A time-dependent {H}amilton-{J}acobi formulation of reachable sets for continuous dynamic games},\n\tauthor       = {Ian M. Mitchell and Alexandre M. Bayen and Claire J. Tomlin},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Automatic Control},\n\tvolume       = 50,\n\tnumber       = 7,\n\tpages        = {947--957}\n}\n@inproceedings{mitchell2015nell,\n\ttitle        = {Never-ending learning},\n\tauthor       = {Tom Mitchell and William Cohen and Estevam Hruschka and Partha Talukdar and Bo Yang and Justin Betteridge and Andrew Carlson and B Dalvi and Matt Gardner and Bryan Kisiel and others},\n\tyear         = 2015,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{Mitliagkas2013-streamPCA,\n\ttitle        = {Memory limited, streaming PCA},\n\tauthor       = {Mitliagkas, Ioannis and Caramanis, Constantine and Jain, Prateek},\n\tyear         = 2013,\n\tbooktitle    = {NIPS},\n\tpages        = {2886--2894}\n}\n@inproceedings{mitliagkas2016asynchrony,\n\ttitle        = {Asynchrony begets momentum, with an application to deep learning},\n\tauthor       = {Mitliagkas, Ioannis and Zhang, Ce and Hadjis, Stefan and R{\\'e}, Christopher},\n\tyear         = 
2016,\n\tbooktitle    = {2016 54th Annual Allerton Conference on Communication, Control, and Computing (Allerton)},\n\tpages        = {997--1004},\n\torganization = {IEEE}\n}\n@inproceedings{miwa2016end,\n\ttitle        = {End-to-end relation extraction using lstms on sequences and tree structures},\n\tauthor       = {Makoto Miwa and Mohit Bansal},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{mixed-SCORE,\n\ttitle        = {Estimating network memberships by simplex vertex hunting},\n\tauthor       = {Jin, Jiashun and Ke, Zheng Tracy and Luo, Shengming},\n\tyear         = 2017,\n\tjournal      = {arXiv:1708.07852}\n}\n@article{miyato2015distributional,\n\ttitle        = {Distributional smoothing with virtual adversarial training},\n\tauthor       = {Takeru Miyato and Shin-ichi Maeda and Masanori Koyama and Ken Nakae and Shin Ishii},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@inproceedings{miyato2017adversarial,\n\ttitle        = {Adversarial Training Methods for Semi-Supervised Text Classification},\n\tauthor       = {Takeru Miyato and Andrew M. 
Dai and Ian Goodfellow},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{miyato2018virtual,\n\ttitle        = {Virtual adversarial training: a regularization method for supervised and semi-supervised learning},\n\tauthor       = {Takeru Miyato and Shin-ichi Maeda and Shin Ishii and Masanori Koyama},\n\tyear         = 2018,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence}\n}\n@misc{mjt_dlt,\n\ttitle        = {Deep learning theory lecture notes},\n\tauthor       = {Matus Telgarsky},\n\tyear         = 2021,\n\tnote         = {Version: 2021-10-27 v0.0-e7150f2d (alpha)},\n\thowpublished = {\\url{https://mjt.cs.illinois.edu/dlt/}}\n}\n@inproceedings{mkbck10,\n\ttitle        = {Recurrent neural network based language model},\n\tauthor       = {Mikolov, Tom{\\'a}{\\v{s}} and Karafi{\\'a}t, Martin and Burget, Luk{\\'a}{\\v{s}} and {\\v{C}}ernock{\\`y}, Jan and Khudanpur, Sanjeev},\n\tyear         = 2010,\n\tbooktitle    = {Eleventh Annual Conference of the International Speech Communication Association}\n}\n@article{mm18,\n\ttitle        = {On the Connection Between Learning Two-Layers Neural Networks and Tensor Decomposition},\n\tauthor       = {Mondelli, Marco and Montanari, Andrea},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.07301}\n}\n@article{mmn18,\n\ttitle        = {A mean field view of the landscape of two-layer neural networks},\n\tauthor       = {Mei, Song and Montanari, Andrea and Nguyen, Phan-Minh},\n\tyear         = 2018,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 115,\n\tnumber       = 33,\n\tpages        = {E7665--E7671}\n}\n@inproceedings{mmstv18,\n\ttitle        = {Towards deep learning models resistant to adversarial attacks},\n\tauthor       = {Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, 
Adrian},\n\tyear         = 2018,\n\tbooktitle    = {ICLR},\n\tpublisher    = {arXiv preprint arXiv:1706.06083}\n}\n@inproceedings{MMV12,\n\ttitle        = {Approximation algorithms for semi-random partitioning problems},\n\tauthor       = {Konstantin Makarychev and Yury Makarychev and Aravindan Vijayaraghavan},\n\tyear         = 2012,\n\tbooktitle    = {STOC '12},\n\tpages        = {367--384},\n\tee           = {http://doi.acm.org/10.1145/2213977.2214013}\n}\n@inproceedings{mnih2007three,\n\ttitle        = {Three new graphical models for statistical language modelling},\n\tauthor       = {Mnih, Andriy and Hinton, Geoffrey},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 24th International Conference on Machine Learning}\n}\n@inproceedings{mnih2008empirical,\n\ttitle        = {Empirical Bernstein stopping},\n\tauthor       = {Volodymyr Mnih and Csaba Szepesv{\'{a}}ri and Jean-Yves Audibert},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{mnih2009scalable,\n\ttitle        = {A scalable hierarchical distributed language model},\n\tauthor       = {Mnih, Andriy and Hinton, Geoffrey},\n\tyear         = 2009,\n\tbooktitle    = {Advances in neural information processing systems}\n}\n@inproceedings{mnih2012fast,\n\ttitle        = {A fast and simple algorithm for training neural probabilistic language models},\n\tauthor       = {Mnih, Andriy and Teh, Yee Whye},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 29th International Conference on Machine Learning}\n}\n@inproceedings{mnih2013learning,\n\ttitle        = {Learning word embeddings efficiently with noise-contrastive estimation},\n\tauthor       = {Mnih, Andriy and Kavukcuoglu, Koray},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{mnih2013playing,\n\ttitle        = {Playing atari with deep reinforcement learning},\n\tauthor       = {Mnih, Volodymyr and Kavukcuoglu, 
Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.5602}\n}\n@article{mnih2015human,\n\ttitle        = {Human-level control through deep reinforcement learning},\n\tauthor       = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and others},\n\tyear         = 2015,\n\tjournal      = {nature},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 518,\n\tnumber       = 7540,\n\tpages        = {529--533}\n}\n@inproceedings{mnih2016asynchronous,\n\ttitle        = {Asynchronous methods for deep reinforcement learning},\n\tauthor       = {Mnih, Volodymyr and Badia, Adria Puigdomenech and Mirza, Mehdi and Graves, Alex and Lillicrap, Timothy and Harley, Tim and Silver, David and Kavukcuoglu, Koray},\n\tyear         = 2016,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {1928--1937}\n}\n@article{mnist,\n\ttitle        = {{MNIST} handwritten digit database},\n\tauthor       = {LeCun, Yann and Cortes, Corinna},\n\tyear         = 2010,\n\turl          = {http://yann.lecun.com/exdb/mnist/},\n\tadded-at     = {2010-06-28T21:16:30.000+0200},\n\tbiburl       = {http://www.bibsonomy.org/bibtex/2935bad99fa1f65e03c25b315aa3c1032/mhwombat},\n\tgroups       = {public},\n\thowpublished = {http://yann.lecun.com/exdb/mnist/},\n\tinterhash    = {21b9d0558bd66279df9452562df6e6f3},\n\tintrahash    = {935bad99fa1f65e03c25b315aa3c1032},\n\tkeywords     = {MSc _checked character_recognition mnist network neural},\n\tlastchecked  = {2016-01-14 14:24:11},\n\ttimestamp    = {2016-01-14T15:24:40.000+0100},\n\tusername     = {mhwombat}\n}\n@inproceedings{modi2019sample,\n\ttitle        = {Sample complexity of reinforcement learning using linearly combined model 
ensembles},\n\tauthor       = {Modi, Aditya and Jiang, Nan and Tewari, Ambuj and Singh, Satinder},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Artificial Intelligence and Statistics},\n\tpages        = {2010--2020},\n\torganization = {PMLR}\n}\n@misc{modis2015landcover,\n\ttitle        = {{MOD09A1} {MODIS}/Terra surface reflectance 8-Day {L3} global 500m {SIN} grid {V006}},\n\tauthor       = {E. Vermote},\n\tyear         = 2015,\n\thowpublished = {\\url{https://doi.org/10.5067/MODIS/MOD09A1.006}}\n}\n@article{moffitt1993identification,\n\ttitle        = {Identification and estimation of dynamic models with a time series of repeated cross-sections},\n\tauthor       = {Robert Moffitt},\n\tyear         = 1993,\n\tjournal      = {Journal of Econometrics},\n\tvolume       = 59,\n\tnumber       = 1,\n\tpages        = {99--123}\n}\n@inproceedings{mohri2012new,\n\ttitle        = {New analysis and algorithm for learning with drifting distributions},\n\tauthor       = {Mohri, Mehryar and Medina, Andres Munoz},\n\tyear         = 2012,\n\tbooktitle    = {Algorithmic Learning Theory},\n\tpages        = {124--138},\n\torganization = {Springer}\n}\n@book{mohri2018foundations,\n\ttitle        = {Foundations of machine learning},\n\tauthor       = {Mohri, Mehryar and Rostamizadeh, Afshin and Talwalkar, Ameet},\n\tyear         = 2018,\n\tpublisher    = {MIT press},\n\tisbn         = {026201825X, 9780262018258}\n}\n@inproceedings{mohri2019agnostic,\n\ttitle        = {Agnostic federated learning},\n\tauthor       = {Mehryar Mohri and Gary Sivek and Ananda Theertha Suresh},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {4615--4625}\n}\n@inproceedings{moitra2010settling,\n\ttitle        = {Settling the polynomial learnability of mixtures of gaussians},\n\tauthor       = {Moitra, Ankur and Valiant, Gregory},\n\tyear         = 2010,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2010 51st Annual 
IEEE Symposium on},\n\tpages        = {93--102},\n\tdoi          = {10.1109/FOCS.2010.15},\n\turl          = {http://dx.doi.org/10.1109/FOCS.2010.15},\n\torganization = {IEEE},\n\tcrossref     = {DBLP:conf/focs/2010},\n\ttimestamp    = {Tue, 16 Dec 2014 09:57:23 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/focs/MoitraV10},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tfile         = {:D\\:\\\\Documents\\\\ResearchD\\\\Thesis\\\\Citations\\\\1004.4223v1.pdf:PDF},\n\towner        = {rongge}\n}\n@inproceedings{moitra2015robust,\n\ttitle        = {How robust are reconstruction thresholds for community detection?},\n\tauthor       = {Moitra, Ankur and Perry, William and Wein, Alexander S},\n\tyear         = 2016,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{MoitraValiant:GaussianMixture,\n\ttitle        = {Settling the polynomial learnability of mixtures of Gaussians},\n\tauthor       = {A. Moitra and G. Valiant},\n\tyear         = 2010,\n\tbooktitle    = {FOCS}\n}\n@inproceedings{moldovan2002lcc,\n\ttitle        = {{LCC} Tools for Question Answering},\n\tauthor       = {Dan I Moldovan and Sanda M Harabagiu and Roxana Girju and Paul Morarescu and V Finley Lacatusu and Adrian Novischi and Adriana Badulescu and Orest Bolohan},\n\tyear         = 2002,\n\tbooktitle    = {TREC}\n}\n@inproceedings{moldovan2012risk,\n\ttitle        = {Risk aversion in Markov decision processes via near optimal Chernoff bounds},\n\tauthor       = {Moldovan, Teodor M and Abbeel, Pieter},\n\tyear         = 2012,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3131--3139}\n}\n@inproceedings{moldovan2012safe,\n\ttitle        = {Safe Exploration in {M}arkov Decision Processes},\n\tauthor       = {Teodor M. 
Moldovan and Pieter Abbeel},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1711--1718}\n}\n@article{moller1995multivariate,\n\ttitle        = {Multivariate polynomial equations with multiple zeros solved by matrix eigenproblems},\n\tauthor       = {H Michael M{\\\"o}ller and Hans J Stetter},\n\tyear         = 1995,\n\tjournal      = {Numerische Mathematik},\n\tvolume       = 70,\n\tnumber       = 3,\n\tpages        = {311--329}\n}\n@article{monajemi2013deterministic,\n\ttitle        = {{Deterministic Matrices Matching the Compressed Sensing Phase Transitions of Gaussian Random Matrices}},\n\tauthor       = {Monajemi, Hatef and Jafarpour, Sina and Gavish, Matan and { Stat 330/CME 362 Collaboration} and Donoho, David L},\n\tyear         = 2013,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 110,\n\tnumber       = 4,\n\tpages        = {1181--1186}\n}\n@article{monajemi2019painless,\n\ttitle        = {Ambitious Data Science Can Be Painless},\n\tauthor       = {Hatef Monajemi and Riccardo Murri and Eric Jonas and Percy Liang and Victoria Stodden and David L. 
Donoho},\n\tyear         = 2019,\n\tjournal      = {Harvard Data Science Review},\n\tvolume       = 1\n}\n@inproceedings{monemizadeh20101,\n\ttitle        = {1-pass relative-error lp-sampling with applications},\n\tauthor       = {Monemizadeh, Morteza and Woodruff, David P},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the twenty-first annual ACM-SIAM symposium on Discrete Algorithms},\n\tpages        = {1143--1160},\n\torganization = {SIAM}\n}\n@article{monro51stochastic,\n\ttitle        = {A Stochastic Approximation Method},\n\tauthor       = {Herbert Robbins and Sutton Monro},\n\tyear         = 1951,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 22,\n\tnumber       = 3,\n\tpages        = {400--407}\n}\n@inproceedings{monroe2015pragmatics,\n\ttitle        = {Learning in the {R}ational {S}peech {A}cts Model},\n\tauthor       = {Will Monroe  and  Christopher Potts},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of 20th {A}msterdam {C}olloquium}\n}\n@article{monroe2017colors,\n\ttitle        = {Colors in Context: A Pragmatic Neural Model for Grounded Language Understanding},\n\tauthor       = {Will Monroe and Robert XD Hawkins and Noah D Goodman and Christopher Potts},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.10186}\n}\n@incollection{montague1970english,\n\ttitle        = {{E}nglish as a Formal Language},\n\tauthor       = {Richard Montague},\n\tyear         = 1970,\n\tbooktitle    = {Linguaggi nella Societ\\`{a} e nella Tecnica},\n\tpages        = {189--224}\n}\n@inproceedings{montague73ptq,\n\ttitle        = {The proper treatment of quantification in ordinary {E}nglish},\n\tauthor       = {Richard Montague},\n\tyear         = 1973,\n\tbooktitle    = {Approaches to Natural Language},\n\tpages        = {221--242}\n}\n@article{montasser2019vc,\n\ttitle        = {{VC} Classes are Adversarially Robustly Learnable, but Only Improperly},\n\tauthor       = {Omar Montasser and Steve Hanneke and 
Nathan Srebro},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.04217}\n}\n@article{monteiro2003first,\n\ttitle        = {First-and second-order methods for semidefinite programming},\n\tauthor       = {Renato DC Monteiro},\n\tyear         = 2003,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 97,\n\tnumber       = 1,\n\tpages        = {209--244}\n}\n@article{montesano2008learning,\n\ttitle        = {Learning Object Affordances: From Sensory--Motor Coordination to Imitation},\n\tauthor       = {L. Montesano and M. Lopes and A. Bernardino and J. Santos-Victor},\n\tyear         = 2008,\n\tjournal      = {Robotics, IEEE Transactions on},\n\tvolume       = 24,\n\tnumber       = 1,\n\tpages        = {15--26}\n}\n@inproceedings{mooney2008learning,\n\ttitle        = {Learning to Connect Language and Perception},\n\tauthor       = {R. Mooney},\n\tyear         = 2008,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1598--1601}\n}\n@inproceedings{moontae2014low,\n\ttitle        = {Low-dimensional Embeddings for Interpretable Anchor-based Topic Inference},\n\tauthor       = {Moontae Lee and David Mimno},\n\tyear         = 2014,\n\tbooktitle    = {EMNLP}\n}\n@inproceedings{moore04improving,\n\ttitle        = {Improving {IBM} Word Alignment Model 1},\n\tauthor       = {Robert C. 
Moore},\n\tyear         = 2004,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {518--525}\n}\n@incollection{moore1991variable,\n\ttitle        = {Variable resolution dynamic programming: Efficiently learning action maps in multivariate real-valued state-spaces},\n\tauthor       = {Moore, Andrew W},\n\tyear         = 1991,\n\tbooktitle    = {Machine Learning Proceedings 1991},\n\tpublisher    = {Elsevier},\n\tpages        = {333--337}\n}\n@inproceedings{moore2005making,\n\ttitle        = {\n\t\tMaking scheduling \"cool\": temperature-aware workload placement in\n\n\t\tdata centers\n\t},\n\tauthor       = {\n\t\tMoore, Justin and Chase, Jeff and Ranganathan, Parthasarathy and\n\n\t\tSharma, Ratnesh\n\t},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the annual conference on USENIX Annual Technical Conference},\n\tlocation     = {Anaheim, CA},\n\tpublisher    = {USENIX Association},\n\taddress      = {Berkeley, CA, USA},\n\tseries       = {ATEC '05},\n\tpages        = {5--5},\n\tacmid        = 1247365,\n\tnumpages     = 1\n}\n@inproceedings{moore2006weatherman,\n\ttitle        = {\n\t\tWeatherman: Automated, Online and Predictive Thermal Mapping and\n\n\t\tManagement for Data Centers\n\t},\n\tauthor       = {Moore, J. and Chase, J.S. and Ranganathan, P.},\n\tyear         = 2006,\n\tmonth        = jun,\n\tbooktitle    = {ICAC '06. 
IEEE International Conference on Autonomic Computing},\n\tpages        = {155--164},\n\tdoi          = {10.1109/ICAC.2006.1662394}\n}\n@article{moore2020expanded,\n\ttitle        = {Expanded encyclopaedias of {DNA} elements in the human and mouse genomes},\n\tauthor       = {Jill E Moore and Michael J Purcaro and Henry E Pratt and Charles B Epstein and Noam Shoresh and Jessika Adrian and Trupti Kawli and Carrie A Davis and Alexander Dobin and Rajinder Kaul and others},\n\tyear         = 2020,\n\tjournal      = {Nature},\n\tvolume       = 583,\n\tnumber       = 7818,\n\tpages        = {699--710}\n}\n@inproceedings{moosavi2016deepfool,\n\ttitle        = {Deepfool: a simple and accurate method to fool deep neural networks},\n\tauthor       = {Seyed-Mohsen Moosavi-Dezfooli and Alhussein Fawzi and Pascal Frossard},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2574--2582}\n}\n@inproceedings{moosavidezfooli2017universal,\n\ttitle        = {Universal adversarial perturbations},\n\tauthor       = {Seyed-Mohsen Moosavi-Dezfooli and Alhussein Fawzi and Omar Fawzi and Pascal Frossard},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{mordatch2018emergence,\n\ttitle        = {Emergence of Grounded Compositional Language in Multi-Agent Populations},\n\tauthor       = {Igor Mordatch and Pieter Abbeel},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{moreno1998recursive,\n\ttitle        = {A recursive algorithm for the forced alignment of very long audio segments},\n\tauthor       = {Pedro J Moreno and Christopher F Joerg and Jean-Manuel Van Thong and Oren Glickman},\n\tyear         = 1998,\n\tbooktitle    = {ICSLP},\n\tvolume       = 98,\n\tpages        = {2711--2714}\n}\n@inproceedings{morimura2010nonparametric,\n\ttitle        = {Nonparametric return distribution 
approximation for reinforcement learning},\n\tauthor       = {Morimura, Tetsuro and Sugiyama, Masashi and Kashima, Hisashi and Hachiya, Hirotaka and Tanaka, Toshiyuki},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 27th International Conference on International Conference on Machine Learning},\n\tpages        = {799--806}\n}\n@article{moroshko2020implicit,\n\ttitle        = {Implicit bias in deep linear classification: Initialization scale vs training accuracy},\n\tauthor       = {Moroshko, Edward and Gunasekar, Suriya and Woodworth, Blake and Lee, Jason D and Srebro, Nathan and Soudry, Daniel},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.06738}\n}\n@inproceedings{MorrisPeres03,\n\ttitle        = {Evolving sets and mixing},\n\tauthor       = {Morris, Ben and Peres, Yuval},\n\tyear         = 2003,\n\tlocation     = {San Diego, CA, USA},\n\tpublisher    = {ACM},\n\tseries       = {STOC '03},\n\tpages        = {279--286},\n\tnumpages     = 8\n}\n@inproceedings{MOS,\n\ttitle        = {Learning juntas},\n\tauthor       = {Mossel, Elchanan and O'Donnell, Ryan and Servedio, Rocco P.},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the thirty-fifth annual ACM symposium on Theory of computing},\n\tlocation     = {San Diego, CA, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {STOC '03},\n\tpages        = {206--212},\n\tisbn         = {1-58113-674-9},\n\tnumpages     = 7,\n\tkeywords     = {fourier, juntas, learning, relevant variables, uniform distribution}\n}\n@inproceedings{moshkovitz2017mixing,\n\ttitle        = {Mixing implies lower bounds for space bounded learning},\n\tauthor       = {Dana Moshkovitz and Michal Moshkovitz},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@misc{moss2005guidelines,\n\ttitle        = {\n\t\tGuidelines for Assessing Power and Cooling Requirements in the Data\n\n\t\tCenter\n\t},\n\tauthor       = {David 
Moss},\n\tyear         = 2005,\n\thowpublished = {Available at \\url{http://www.dell.com/downloads/global/power/ps3q05-20050115-Moss.pdf}}\n}\n@inproceedings{mossel2005learning,\n\ttitle        = {Learning nonsingular phylogenies and hidden Markov models},\n\tauthor       = {Mossel, Elchanan and Roch, S{\\'e}bastien},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the thirty-seventh annual ACM symposium on Theory of computing},\n\tpages        = {366--375},\n\torganization = {ACM}\n}\n@article{mossel2012stochastic,\n\ttitle        = {Stochastic block models and reconstruction},\n\tauthor       = {Elchanan Mossel and Joe Neeman and Allan Sly},\n\tyear         = 2012,\n\tjournal      = {arXiv}\n}\n@article{mossel2013belief,\n\ttitle        = {Belief propagation, robust reconstruction, and optimal recovery of block models},\n\tauthor       = {Elchanan Mossel and Joe Neeman and Allan Sly},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@article{mossel2013proof,\n\ttitle        = {A proof of the block model threshold conjecture},\n\tauthor       = {Elchanan Mossel and Joe Neeman and Allan Sly},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@inproceedings{mossel2015consistency,\n\ttitle        = {Consistency thresholds for the planted bisection model},\n\tauthor       = {Elchanan Mossel and Joe Neeman and Allan Sly},\n\tyear         = 2015,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)},\n\tpages        = {69--75}\n}\n@inproceedings{mostafazadeh2016corpus,\n\ttitle        = {A corpus and cloze evaluation for deeper understanding of commonsense stories},\n\tauthor       = {Nasrin Mostafazadeh and Nathanael Chambers and Xiaodong He and Devi Parikh and Dhruv Batra and Lucy Vanderwende and Pushmeet Kohli and James Allen},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{Motwani1994,\n\ttitle        = {Computing Roots of Graphs is Hard},\n\tauthor       = {Rajeev 
Motwani and Madhu Sudan},\n\tyear         = 1994,\n\tjournal      = {DISCRETE APPLIED MATHEMATICS},\n\tvolume       = 54,\n\tpages        = {54--81}\n}\n@inproceedings{mou2017coupling,\n\ttitle        = {Coupling distributed and symbolic execution for natural language queries},\n\tauthor       = {Lili Mou and Zhengdong Lu and Hang Li and Zhi Jin},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{mou2018generalization,\n\ttitle        = {Generalization bounds of sgld for non-convex learning: Two theoretical viewpoints},\n\tauthor       = {Mou, Wenlong and Wang, Liwei and Zhai, Xiyu and Zheng, Kai},\n\tyear         = 2018,\n\tmonth        = {06--09 Jul},\n\tjournal      = {arXiv preprint arXiv:1707.05947},\n\tbooktitle    = {Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\taddress      = {},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 75,\n\tpages        = {605--638},\n\turl          = {http://proceedings.mlr.press/v75/mou18a.html},\n\teditor       = {Bubeck, S\\'ebastien and Perchet, Vianney and Rigollet, Philippe},\n\tpdf          = {http://proceedings.mlr.press/v75/mou18a/mou18a.pdf},\n\tabstract     = {We study the generalization errors of \\emph{non-convex} regularized ERM procedures using Stochastic Gradient Langevin Dynamics (SGLD). Two theories are proposed with non-asymptotic discrete-time analysis, using stability and PAC-Bayesian theory respectively. The stability-based theory obtains a bound of $O\\left(\\frac{1}{n}L\\sqrt{\\beta T_N}\\right)$, where $L$ is Lipschitz parameter, $\\beta$ is inverse temperature, and $T_N$ is the sum of step sizes. For PAC-Bayesian theory, though the bound has a slower $O(1/\\sqrt{n})$ rate, the contribution of each step decays exponentially through time, and the uniform Lipschitz constant is also replaced by actual norms of gradients along the optimization trajectory. 
Our bounds have reasonable dependence on aggregated step sizes, and do not explicitly depend on dimensions, norms or other capacity measures of the parameter. The bounds characterize how the noises in the algorithm itself controls the statistical learning behavior in non-convex problems, without uniform convergence in the hypothesis space, which sheds light on the effect of training algorithms on the generalization error for deep neural networks.}\n}\n@article{mou2020sample,\n\ttitle        = {On the Sample Complexity of Reinforcement Learning with Policy Space Generalization},\n\tauthor       = {Mou, Wenlong and Wen, Zheng and Chen, Xi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.07353}\n}\n@incollection{moulines2011non,\n\ttitle        = {Non-Asymptotic Analysis of Stochastic Approximation Algorithms for Machine Learning},\n\tauthor       = {Moulines, Eric and Bach, Francis R.},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems 24},\n\tpages        = {451--459},\n\turl          = {http://papers.nips.cc/paper/4316-non-asymptotic-analysis-of-stochastic-approximation-algorithms-for-machine-learning.pdf}\n}\n@inproceedings{mourao2012learning,\n\ttitle        = {Learning strips operators from noisy and incomplete observations},\n\tauthor       = {K. Mourao and L.  Zettlemoyer and R. Petrick and M. 
Steedman},\n\tyear         = 2012,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@inproceedings{mouratidis2006continuous,\n\ttitle        = {Continuous nearest neighbor monitoring in road networks},\n\tauthor       = {\n\t\tMouratidis, Kyriakos and Yiu, Man Lung and Papadias, Dimitris and\n\n\t\tMamoulis, Nikos\n\t},\n\tyear         = 2006,\n\tbooktitle    = {\n\t\tProceedings of the 32nd international conference on Very large data\n\n\t\tbases\n\t},\n\tlocation     = {Seoul, Korea},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '06},\n\tpages        = {43--54},\n\tacmid        = 1164133,\n\tnumpages     = 12\n}\n@article{MOV2012,\n\ttitle        = {A local spectral method for graphs: with applications to improving graph partitions and exploring data graphs locally},\n\tauthor       = {Mahoney, Michael W. and Orecchia, Lorenzo and Vishnoi, Nisheeth K.},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 13,\n\tpages        = {2339--2365}\n}\n@article{mozaffari2015systematic,\n\ttitle        = {Systematic poisoning attacks on and defenses for machine learning in healthcare},\n\tauthor       = {Mehran Mozaffari-Kermani and Susmita Sur-Kolay and Anand Raghunathan and Niraj K. 
Jha},\n\tyear         = 2015,\n\tjournal      = {IEEE Journal of Biomedical and Health Informatics},\n\tvolume       = 19,\n\tnumber       = 6,\n\tpages        = {1893--1905}\n}\n@article{mozaffarian2011changes,\n\ttitle        = {Changes in diet and lifestyle and long-term weight gain in women and men},\n\tauthor       = {Dariush Mozaffarian and Tao Hao and Eric B Rimm and Walter C Willett and Frank B Hu},\n\tyear         = 2011,\n\tjournal      = {New England Journal of Medicine},\n\tvolume       = 364,\n\tnumber       = 25,\n\tpages        = {2392--2404}\n}\n@article{mozannar2020consistent,\n\ttitle        = {Consistent Estimators for Learning to Defer to an Expert},\n\tauthor       = {Hussein Mozannar and David Sontag},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.01862}\n}\n@article{MR06,\n\ttitle        = {Learning Nonsingular Phylogenies and Hidden {M}arkov Models},\n\tauthor       = {Elchanan Mossel and S\'{e}bastien Roch},\n\tyear         = 2006,\n\tjournal      = {Annals of Applied Probability},\n\tvolume       = 16,\n\tnumber       = 2,\n\tpages        = {583--614}\n}\n@article{mr18,\n\ttitle        = {The Computational Complexity of Training {R}e{LU}(s)},\n\tauthor       = {Manurangsi, Pasin and Reichman, Daniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.04207}\n}\n@book{MR2442439,\n\ttitle        = {Stochastic approximation: a dynamical systems viewpoint},\n\tauthor       = {Borkar, Vivek S.},\n\tyear         = 2008,\n\tpublisher    = {Cambridge University Press, Cambridge},\n\tpages        = {x+164},\n\tisbn         = {978-0-521-51592-4},\n\tmrclass      = {60-01 (39-01 62L20 93E03 93E10)},\n\tmrnumber     = 2442439,\n\tmrreviewer   = {Oleg N. 
Granichin}\n}\n@inproceedings{mrksic2016counterfitting,\n\ttitle        = {Counter-fitting Word Vectors to Linguistic Constraints},\n\tauthor       = {Nikola Mrk\\v{s}i\\'{c} and Diarmuid \\'{O} S\\'{e}aghdha and Blaise Thomson and Milica Ga\\v{s}i\\'{c} and Lina Rojas-Barahona and Pei-Hao Su and David Vandyke and Tsung-Hsien Wen and Steve Young},\n\tyear         = 2016,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{MRsurvey,\n\ttitle        = {Lattice-Based Cryptography},\n\tauthor       = {Micciancio, Daniele and Regev, Oded},\n\tyear         = 2009,\n\tjournal      = {Post Quantum Cryptography},\n\tpublisher    = {Springer Publishing Company, Heidelberg},\n\tpages        = {147--191}\n}\n@article{ms08,\n\ttitle        = {Finite-time bounds for fitted value iteration},\n\tauthor       = {Munos, R{\\'e}mi and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 9,\n\tnumber       = {May},\n\tpages        = {815--857}\n}\n@inproceedings{MS13,\n\ttitle        = {A Polynomial Time Algorithm for Lossy Population Recovery},\n\tauthor       = {Ankur Moitra and Michael E. 
Saks},\n\tyear         = 2013,\n\tbooktitle    = {54th Annual {IEEE} Symposium on Foundations of Computer Science, {FOCS} 2013, 26-29 October, 2013, Berkeley, CA, {USA}},\n\tpages        = {110--116},\n\tdoi          = {10.1109/FOCS.2013.20},\n\turl          = {http://dx.doi.org/10.1109/FOCS.2013.20},\n\tcrossref     = {DBLP:conf/focs/2013},\n\ttimestamp    = {Mon, 24 Aug 2015 19:09:00 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/focs/MoitraS13},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{mt18,\n\ttitle        = {Robust Spectral Filtering and Anomaly Detection},\n\tauthor       = {Marecek, Jakub and Tchrakian, Tigran},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.01181}\n}\n@article{mu2013square,\n\ttitle        = {Square Deal: Lower Bounds and Improved Relaxations for Tensor Recovery},\n\tauthor       = {Mu, Cun and Huang, Bo and Wright, John and Goldfarb, Donald},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1307.5870}\n}\n@inproceedings{mu2020shaping,\n\ttitle        = {Shaping Visual Representations with Language for Few-shot Classification},\n\tauthor       = {Jesse Mu and Percy Liang and Noah Goodman},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{muandet2013domain,\n\ttitle        = {Domain Generalization via Invariant Feature Representation},\n\tauthor       = {Krikamol Muandet and David Balduzzi and Bernhard Schölkopf},\n\tyear         = 2013,\n\tmonth        = {17--19 Jun},\n\tbooktitle    = {Proceedings of the 30th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\taddress      = {Atlanta, Georgia, USA},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 28,\n\tnumber       = 1,\n\tpages        = {10--18},\n\turl          = {http://proceedings.mlr.press/v28/muandet13.html},\n\teditor       = {Sanjoy Dasgupta and David 
McAllester}\n}\n@article{mudrakarta2018it,\n\ttitle        = {It was the training data pruning too!},\n\tauthor       = {Pramod Kaushik Mudrakarta and Ankur Taly and Mukund Sundararajan and Kedar Dhamdhere},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.04579}\n}\n@inproceedings{mudrakarta2018question,\n\ttitle        = {Did the Model Understand the Question?},\n\tauthor       = {Pramod K. Mudrakarta and Ankur Taly and Mukund Sundararajan and Kedar Dhamdhere},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{mueller1974,\n\ttitle        = {The Utilitarian Contract: A Generalization of Rawls' Theory of Justice},\n\tauthor       = {Dennis C. Mueller and Robert D. Tollison and Thomas D. Willet},\n\tyear         = 1974,\n\tjournal      = {Theory and Decision},\n\tvolume       = 4,\n\tpages        = {345--367}\n}\n@phdthesis{mueller2013semantic,\n\ttitle        = {Methods for Learning Structured Prediction in Semantic Segmentation of Natural Images},\n\tauthor       = {Andreas Mueller},\n\tyear         = 2013,\n\tschool       = {University of Bonn}\n}\n@inproceedings{mueller2017sequence,\n\ttitle        = {Sequence to better sequence: continuous revision of combinatorial structures},\n\tauthor       = {Jonas Mueller and David Gifford and Tommi Jaakkola},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2536--2544}\n}\n@article{muelling2015autonomy,\n\ttitle        = {Autonomy infused teleoperation with application to {BCI} manipulation},\n\tauthor       = {Katharina Muelling and Arun Venkatraman and Jean-Sebastien Valois and John Downey and Jeffrey Weiss and Shervin Javdani and Martial Hebert and Andrew B Schwartz and Jennifer L Collinger and J Andrew Bagnell},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1503.05451}\n}\n@phdthesis{mugtussids2000flight,\n\ttitle        = {Flight Data Processing Techniques to 
Identify Unusual Events},\n\tauthor       = {Mugtussids, Iossif B.},\n\tyear         = 2000,\n\tschool       = {Virginia Tech}\n}\n@inproceedings{muhlgay2019value,\n\ttitle        = {Value-based Search in Execution Space for Mapping Instructions to Programs},\n\tauthor       = {Dor Muhlgay and Jonathan Herzig and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{mukherjee2009spatio,\n\ttitle        = {Spatio-temporal thermal-aware job scheduling to minimize energy consumption in virtualized heterogeneous data centers},\n\tauthor       = {Mukherjee, Tridib and Banerjee, Ayan and Varsamopoulos, Georgios and Gupta, Sandeep K. S. and Rungta, Sanjay},\n\tyear         = 2009,\n\tjournal      = {Computer Networks},\n\tvolume       = 53,\n\tnumber       = 17,\n\tpages        = {2888--2904}\n}\n@inproceedings{munos2005error,\n\ttitle        = {Error bounds for approximate value iteration},\n\tauthor       = {Munos, R{\\'e}mi},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the National Conference on Artificial Intelligence},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = 1006,\n\torganization = {Menlo Park, CA; Cambridge, MA; London; AAAI Press; MIT Press; 1999}\n}\n@inproceedings{munoz2017towards,\n\ttitle        = {Towards poisoning of deep learning algorithms with back-gradient optimization},\n\tauthor       = {Luis Mu{\\~n}oz-Gonz{\\'a}lez and Battista Biggio and Ambra Demontis and Andrea Paudice and Vasin Wongrassamee and Emil C Lupu and Fabio Roli},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 10th ACM Workshop on Artificial Intelligence and Security},\n\tpages        = {27--38}\n}\n@article{murata94neural,\n\ttitle        = {Network Information Criterion---Determining the Number of Hidden Units for an Artificial Neural Network Model},\n\tauthor       = {N. Murata and S. Yoshizawa and S. 
Amari},\n\tyear         = 1994,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 5,\n\tnumber       = 6,\n\tpages        = {865--872}\n}\n@article{murdoch1998exact,\n\ttitle        = {Exact sampling from a continuous state space},\n\tauthor       = {Duncan J Murdoch and Peter J Green},\n\tyear         = 1998,\n\tjournal      = {Scandinavian Journal of Statistics},\n\tvolume       = 25,\n\tnumber       = 3,\n\tpages        = {483--502}\n}\n@article{murphy1973new,\n\ttitle        = {A new vector partition of the probability score},\n\tauthor       = {Murphy, Allan H},\n\tyear         = 1973,\n\tjournal      = {Journal of applied Meteorology},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {595--600}\n}\n@article{murphy1973vector,\n\ttitle        = {A new vector partition of the probability score},\n\tauthor       = {Allan H Murphy},\n\tyear         = 1973,\n\tjournal      = {Journal of Applied Meteorology},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {595--600}\n}\n@article{murphy1977reliability,\n\ttitle        = {Reliability of Subjective Probability Forecasts of Precipitation and Temperature},\n\tauthor       = {Allan H. Murphy and Robert L. Winkler},\n\tyear         = 1977,\n\tjournal      = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},\n\tvolume       = 26,\n\tpages        = {41--47}\n}\n@book{murphy2010nearsynonym,\n\ttitle        = {Lexical Meaning},\n\tauthor       = {M. 
Lynne Murphy},\n\tyear         = 2010,\n\tpublisher    = {Cambridge University Press}\n}\n@article{murray2008notes,\n\ttitle        = {Notes on the {KL}-divergence between a {M}arkov chain and its equilibrium distribution},\n\tauthor       = {Ian Murray and Ruslan Salakhutdinov},\n\tyear         = 2008,\n\tjournal      = {preprint}\n}\n@article{murty1987some,\n\ttitle        = {Some NP-complete problems in quadratic and nonlinear programming},\n\tauthor       = {Murty, Katta G and Kabadi, Santosh N},\n\tyear         = 1987,\n\tjournal      = {Mathematical programming},\n\tpublisher    = {Springer},\n\tvolume       = 39,\n\tnumber       = 2,\n\tpages        = {117--129}\n}\n@article{murty2012m,\n\ttitle        = {$O(m)$ Bound on Number of Iterations in Sphere Methods for LP},\n\tauthor       = {Murty, Katta G.},\n\tyear         = 2012,\n\tjournal      = {Algorithmic Operations Research},\n\tvolume       = 7,\n\tnumber       = 1,\n\tpages        = {30--40}\n}\n@inproceedings{murty2020expbert,\n\ttitle        = {{ExpBERT}: Representation Engineering with Natural Language Explanations},\n\tauthor       = {Shikhar Murty and Pang Wei Koh and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{musco2015randomized,\n\ttitle        = {Randomized block krylov methods for stronger and faster approximate singular value decomposition},\n\tauthor       = {Musco, Cameron and Musco, Christopher},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1396--1404}\n}\n@article{muskens96combine,\n\ttitle        = {Combining Montague semantics and discourse representation},\n\tauthor       = {Reinhard Muskens},\n\tyear         = 1996,\n\tjournal      = {Linguistics and Philosophy},\n\tvolume       = 19,\n\tnumber       = 2,\n\tpages        = {143--186}\n}\n@article{muslea2001hierarchical,\n\ttitle        = {Hierarchical wrapper induction for 
semistructured information sources},\n\tauthor       = {Ion Muslea and Steven Minton and Craig A Knoblock},\n\tyear         = 2001,\n\tjournal      = {Autonomous Agents and Multi-Agent Systems},\n\tvolume       = 4,\n\tnumber       = 1,\n\tpages        = {93--114}\n}\n@inproceedings{mussmann2018accuracy,\n\ttitle        = {On the Relationship between Data Efficiency and Error in Active Learning},\n\tauthor       = {Stephen Mussmann and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{mussmann2018gbs,\n\ttitle        = {Generalized Binary Search For Split-Neighborly Problems},\n\tauthor       = {Stephen Mussmann and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{mussmann2018sgd,\n\ttitle        = {Uncertainty Sampling is Preconditioned Stochastic Gradient Descent on Zero-One Loss},\n\tauthor       = {Stephen Mussmann and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{mussmann2020pairwise,\n\ttitle        = {On the Importance of Adaptive Data Collection for Extremely Imbalanced Pairwise Tasks},\n\tauthor       = {Stephen Mussmann and Robin Jia and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {Findings of Empirical Methods in Natural Language Processing (Findings of EMNLP)}\n}\n@article{muthukumar2020harmless,\n\ttitle        = {Harmless interpolation of noisy data in regression},\n\tauthor       = {Vidya Muthukumar and Kailas Vodrahalli and Vignesh Subramanian and Anant Sahai},\n\tyear         = 2020,\n\tjournal      = {IEEE Journal on Selected Areas in Information Theory},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {67--83}\n}\n@book{myers1990classical,\n\ttitle        = {Classical and modern regression with applications},\n\tauthor       = {Myers, Raymond H},\n\tyear       
  = 1990,\n\tpublisher    = {Duxbury press Belmont, CA},\n\tvolume       = 2\n}\n@article{MZ,\n\ttitle        = {Matching pursuits with time-frequency dictionaries},\n\tauthor       = {S. Mallat and Z. Zhang},\n\tyear         = 1993,\n\tjournal      = {IEEE Trans. on Signal Processing},\n\tpages        = {3397--3415}\n}\n@inproceedings{nabi2018fair,\n\ttitle        = {Fair inference on outcomes},\n\tauthor       = {Razieh Nabi and Ilya Shpitser},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{nachum2016improving,\n\ttitle        = {Improving policy gradient by exploring under-appreciated rewards},\n\tauthor       = {Nachum, Ofir and Norouzi, Mohammad and Schuurmans, Dale},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.09321}\n}\n@article{nachum2019algaedice,\n\ttitle        = {Algaedice: Policy gradient from arbitrary experience},\n\tauthor       = {Nachum, Ofir and Dai, Bo and Kostrikov, Ilya and Chow, Yinlam and Li, Lihong and Schuurmans, Dale},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.02074}\n}\n@article{nacson2018convergence,\n\ttitle        = {Convergence of gradient descent on separable data},\n\tauthor       = {Nacson, Mor Shpigel and Lee, Jason and Gunasekar, Suriya and Savarese, Pedro HP and Srebro, Nathan and Soudry, Daniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.01905},\n\tbooktitle    = {The 22nd International Conference on Artificial Intelligence and Statistics},\n\tpages        = {3420--3428},\n\torganization = {PMLR}\n}\n@article{nacson2019lexicographic,\n\ttitle        = {Lexicographic and depth-sensitive margins in homogeneous and non-homogeneous deep models},\n\tauthor       = {Nacson, Mor Shpigel and Gunasekar, Suriya and Lee, Jason D and Srebro, Nathan and Soudry, Daniel},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint 
arXiv:1905.07325}\n}\n@article{nadeem2020stereoset,\n\ttitle        = {Stereoset: Measuring stereotypical bias in pretrained language models},\n\tauthor       = {Moin Nadeem and Anna Bethke and Siva Reddy},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.09456}\n}\n@article{nadler2009semi,\n\ttitle        = {Semi-supervised learning with the graph laplacian: The limit of infinite unlabelled data},\n\tauthor       = {Nadler, Boaz and Srebro, Nathan and Zhou, Xueyuan},\n\tyear         = 2009,\n\tjournal      = {Advances in neural information processing systems},\n\tpublisher    = {Citeseer},\n\tvolume       = 22,\n\tpages        = {1330--1338}\n}\n@article{naeini2014binary,\n\ttitle        = {Binary Classifier Calibration: Non-parametric approach},\n\tauthor       = {Mahdi Pakdaman Naeini and Gregory F. Cooper and Milos Hauskrecht},\n\tyear         = 2014,\n\tjournal      = {arXiv}\n}\n@inproceedings{naeini2015obtaining,\n\ttitle        = {Obtaining well calibrated probabilities using bayesian binning},\n\tauthor       = {Naeini, Mahdi Pakdaman and Cooper, Gregory F and Hauskrecht, Milos},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the... AAAI Conference on Artificial Intelligence. AAAI Conference on Artificial Intelligence},\n\tvolume       = 2015,\n\tpages        = 2901,\n\torganization = {NIH Public Access}\n}\n@article{nagabandi2018learning,\n\ttitle        = {Learning to adapt in dynamic, real-world environments through meta-reinforcement learning},\n\tauthor       = {Anusha Nagabandi and Ignasi Clavera and Simin Liu and Ronald S Fearing and Pieter Abbeel and Sergey Levine and Chelsea Finn},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.11347}\n}\n@inproceedings{nagabandi2018neural,\n\ttitle        = {Neural network dynamics for model-based deep reinforcement learning with model-free fine-tuning},\n\tauthor       = {A. Nagabandi and G. Kahn and R. S. Fearing and S. 
Levine},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {7559--7566}\n}\n@inproceedings{nagabandi2020deep,\n\ttitle        = {Deep dynamics models for learning dexterous manipulation},\n\tauthor       = {Nagabandi, Anusha and Konolige, Kurt and Levine, Sergey and Kumar, Vikash},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Robot Learning},\n\tpages        = {1101--1112},\n\torganization = {PMLR}\n}\n@article{nagarajan2019deterministic,\n\ttitle        = {Deterministic PAC-Bayesian generalization bounds for deep networks via generalizing noise-resilience},\n\tauthor       = {Nagarajan, Vaishnavh and Kolter, J Zico},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.13344}\n}\n@article{nagarajan2019uniform,\n\ttitle        = {Uniform convergence may be unable to explain generalization in deep learning},\n\tauthor       = {Vaishnavh Nagarajan and J Zico Kolter},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.04742}\n}\n@article{nagarajan2020understanding,\n\ttitle        = {Understanding the failure modes of out-of-distribution generalization},\n\tauthor       = {Vaishnavh Nagarajan and Anders Andreassen and Behnam Neyshabur},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.15775}\n}\n@article{nagumo1942lage,\n\ttitle        = {{\\\"U}ber die lage der integralkurven gew{\\\"o}hnlicher differentialgleichungen},\n\tauthor       = {Nagumo, Mitio},\n\tyear         = 1942,\n\tjournal      = {Proceedings of the Physico-Mathematical Society of Japan. 
3rd Series},\n\tpublisher    = {THE PHYSICAL SOCIETY OF JAPAN, The Mathematical Society of Japan},\n\tvolume       = 24,\n\tpages        = {551--559}\n}\n@inproceedings{naik1992meta,\n\ttitle        = {Meta-neural networks that learn by learning},\n\tauthor       = {Devang K Naik and Richard J Mammone},\n\tyear         = 1992,\n\tbooktitle    = {[Proceedings 1992] IJCNN International Joint Conference on Neural Networks},\n\tvolume       = 1,\n\tpages        = {437--442}\n}\n@inproceedings{naik2018stress,\n\ttitle        = {Stress Test Evaluation for Natural Language Inference},\n\tauthor       = {Aakanksha Naik and Abhilasha Ravichander and Norman Sadeh and Carolyn Rose and Graham Neubig},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {2340--2353}\n}\n@inproceedings{naim2014unsupervised,\n\ttitle        = {Unsupervised Alignment of Natural Language Instructions with Video Segments},\n\tauthor       = {I. Naim and Y. Song and Q. Liu and H. Kautz and J. Luo and D. Gildea},\n\tyear         = 2014,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{nair2010rectified,\n\ttitle        = {Rectified linear units improve restricted {boltzmann} machines},\n\tauthor       = {V. Nair and G. E. Hinton},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {807--814}\n}\n@article{nair2017overcoming,\n\ttitle        = {Overcoming Exploration in Reinforcement Learning with Demonstrations},\n\tauthor       = {A. Nair and B. McGrew and M. Andrychowicz and W. Zaremba and P. 
Abbeel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.10089}\n}\n@inproceedings{najafi2019robustness,\n\ttitle        = {Robustness to Adversarial Perturbations in Learning from Incomplete Data},\n\tauthor       = {Amir Najafi and Shin-ichi Maeda and Masanori Koyama and Takeru Miyato},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{nakamura1988assessment,\n\ttitle        = {Assessment of biological age by principal component analysis},\n\tauthor       = {E Nakamura and K Miyao and T Ozeki},\n\tyear         = 1988,\n\tjournal      = {Mechanisms of Ageing and Development},\n\tvolume       = 46,\n\tnumber       = 1,\n\tpages        = {1--18}\n}\n@article{nakkiran2019adversarial,\n\ttitle        = {Adversarial robustness may be at odds with simplicity},\n\tauthor       = {Preetum Nakkiran},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.00532}\n}\n@article{nakkiran2019deep,\n\ttitle        = {Deep double descent: Where bigger models and more data hurt},\n\tauthor       = {Preetum Nakkiran and Gal Kaplun and Yamini Bansal and Tristan Yang and Boaz Barak and Ilya Sutskever},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.02292}\n}\n@article{nakkiran2019sgd,\n\ttitle        = {Sgd on neural networks learns functions of increasing complexity},\n\tauthor       = {Nakkiran, Preetum and Kaplun, Gal and Kalimeris, Dimitris and Yang, Tristan and Edelman, Benjamin L and Zhang, Fred and Barak, Boaz},\n\tyear         = 2019,\n\tmonth        = {May},\n\tjournal      = {arXiv preprint arXiv:1905.11604},\n\tpages        = {arXiv:1905.11604},\n\tkeywords     = {Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning},\n\teid          = {arXiv:1905.11604},\n\tarchiveprefix = {arXiv},\n\teprint       = {1905.11604},\n\tprimaryclass = {cs.LG},\n\tadsurl       = 
{https://ui.adsabs.harvard.edu/abs/2019arXiv190511604N},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{nakkiran2020optimal,\n\ttitle        = {Optimal regularization can mitigate double descent},\n\tauthor       = {Nakkiran, Preetum and Venkat, Prayaag and Kakade, Sham and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.01897}\n}\n@inproceedings{nakkiran2021optimal,\n\ttitle        = {Optimal Regularization can Mitigate Double Descent},\n\tauthor       = {Preetum Nakkiran and Prayaag Venkat and Sham M. Kakade and Tengyu Ma},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=7R7fAoUygoa}\n}\n@article{nallapati2016abstractive,\n\ttitle        = {Abstractive text summarization using sequence-to-sequence rnns and beyond},\n\tauthor       = {Ramesh Nallapati and Bowen Zhou and Caglar Gulcehre and Bing Xiang and others},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.06023}\n}\n@article{nam2020learning,\n\ttitle        = {Learning from failure: Training debiased classifier from biased classifier},\n\tauthor       = {Junhyun Nam and Hyuntak Cha and Sungsoo Ahn and Jaeho Lee and Jinwoo Shin},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.02561}\n}\n@inproceedings{namkoong2016stochastic,\n\ttitle        = {Stochastic Gradient Methods for Distributionally Robust Optimization with f-Divergences},\n\tauthor       = {Hongseok Namkoong and John Duchi},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{namkoong2017variance,\n\ttitle        = {Variance regularization with convex objectives},\n\tauthor       = {Hongseok Namkoong and John Duchi},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{Naor2012,\n\ttitle        = {SPARSE 
QUADRATIC FORMS AND THEIR GEOMETRIC APPLICATIONS [after {B}atson, {S}pielman and {S}rivastava]},\n\tauthor       = {Naor, Assaf},\n\tyear         = 2012,\n\tjournal      = {Ast{\\'e}risque},\n\tpublisher    = {Soci{\\'e}t{\\'e} math{\\'e}matique de France}\n}\n@article{narasimhan2015language,\n\ttitle        = {Language understanding for text-based games using deep reinforcement learning},\n\tauthor       = {Karthik Narasimhan and Tejas Kulkarni and Regina Barzilay},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.08941}\n}\n@inproceedings{narasimhan2015machine,\n\ttitle        = {Machine comprehension with discourse relations},\n\tauthor       = {Karthik Narasimhan and Regina Barzilay},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{narasimhan2016improving,\n\ttitle        = {Improving information extraction by acquiring external evidence with reinforcement learning},\n\tauthor       = {Karthik Narasimhan and Adam Yala and Regina Barzilay},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1603.07954}\n}\n@phdthesis{narayanan1997knowledge,\n\ttitle        = {Knowledge-based Action Representations for Metaphor and Aspect (KARMA)},\n\tauthor       = {Srinivas Sankara Narayanan},\n\tyear         = 1997,\n\tschool       = {University of California Berkeley at Berkeley}\n}\n@inproceedings{narodytska2017blackbox,\n\ttitle        = {Simple Black-Box Adversarial Perturbations for Deep Networks},\n\tauthor       = {Nina Narodytska and Shiva Prasad Kasiviswanathan},\n\tyear         = 2017,\n\tbooktitle    = {IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)}\n}\n@article{narodytska2017verifying,\n\ttitle        = {Verifying Properties of Binarized Deep Neural Networks},\n\tauthor       = {Nina Narodytska and Shiva Prasad Kasiviswanathan and Leonid Ryzhyk and Mooly Sagiv and Toby Walsh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint 
arXiv:1709.06662}\n}\n@article{nash1950bargaining,\n\ttitle        = {The Bargaining Problem},\n\tauthor       = {John Nash},\n\tyear         = 1950,\n\tjournal      = {Econometrica},\n\tvolume       = 18,\n\tpages        = {155--162}\n}\n@article{nash1951non,\n\ttitle        = {Non-cooperative games},\n\tauthor       = {Nash, John},\n\tyear         = 1951,\n\tjournal      = {Annals of mathematics},\n\tpublisher    = {JSTOR},\n\tpages        = {286--295}\n}\n@article{nash1951noncooperative,\n\ttitle        = {Non-Cooperative Games},\n\tauthor       = {John Nash},\n\tyear         = 1951,\n\tjournal      = {Annals of Mathematics},\n\tvolume       = 54\n}\n@inproceedings{nasrabadi2011robust,\n\ttitle        = {Robust lasso with missing and grossly corrupted observations},\n\tauthor       = {Nasser M. Nasrabadi and Trac D. Tran and Nam Nguyen},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{natarajan1995sparse,\n\ttitle        = {Sparse approximate solutions to linear systems},\n\tauthor       = {Balas K. 
Natarajan},\n\tyear         = 1995,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 24,\n\tnumber       = 2,\n\tpages        = {227--234}\n}\n@article{natori1998scaling,\n\ttitle        = {Scaling limit of digital circuits due to thermal noise},\n\tauthor       = {Natori, Kenji and Sano, Nobuyuki},\n\tyear         = 1998,\n\tjournal      = {Journal of applied physics},\n\tvolume       = 83,\n\tnumber       = 10,\n\tpages        = {5019--5024}\n}\n@article{navigli2009word,\n\ttitle        = {Word sense disambiguation: A survey},\n\tauthor       = {Roberto Navigli},\n\tyear         = 2009,\n\tjournal      = {ACM Computing Surveys (CSUR)}\n}\n@article{nayebi2017biologically,\n\ttitle        = {Biologically inspired protection of deep networks from adversarial attacks},\n\tauthor       = {Aran Nayebi and Surya Ganguli},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.09202}\n}\n@techreport{neal00dp,\n\ttitle        = {{M}arkov Chain Sampling Methods for {D}irichlet Process Mixture Models},\n\tauthor       = {R. Neal},\n\tyear         = 2000,\n\tinstitution  = {Department of Statistics, University of Toronto (U. Toronto)}\n}\n@article{neal2003slice,\n\ttitle        = {Slice Sampling},\n\tauthor       = {Radford M. Neal},\n\tyear         = 2003,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 31,\n\tnumber       = 3,\n\tpages        = {705--767}\n}\n@article{neal2011mcmc,\n\ttitle        = {MCMC using Hamiltonian dynamics},\n\tauthor       = {Neal, Radford M and others},\n\tyear         = 2011,\n\tjournal      = {Handbook of markov chain monte carlo},\n\tvolume       = 2,\n\tnumber       = 11,\n\tpages        = 2\n}\n@inproceedings{neal98gem,\n\ttitle        = {A new view of the {EM} algorithm that justifies incremental, sparse and other variants},\n\tauthor       = {R. M. Neal and G. E. 
Hinton},\n\tyear         = 1998,\n\tbooktitle    = {Learning in Graphical Models},\n\tpages        = {355--368}\n}\n@inproceedings{neal98incremental,\n\ttitle        = {A view of the {EM} algorithm that justifies incremental, sparse, and other variants},\n\tauthor       = {R. Neal and G. Hinton},\n\tyear         = 1998,\n\tbooktitle    = {Learning in Graphical Models}\n}\n@article{NecoaraClipici2013,\n\ttitle        = {Efficient parallel coordinate descent algorithm for convex optimization problems with separable constraints: application to distributed MPC},\n\tauthor       = {Necoara, Ion and Clipici, Dragos},\n\tyear         = 2013,\n\tjournal      = {Journal of Process Control},\n\tpublisher    = {Elsevier},\n\tvolume       = 23,\n\tnumber       = 3,\n\tpages        = {243--253}\n}\n@inproceedings{Ned10,\n\ttitle        = {Random projection algorithms for convex set intersection problems},\n\tauthor       = {Nedi{\\'c}, Angelia},\n\tyear         = 2010,\n\tbooktitle    = {49th IEEE Conference on Decision and Control (CDC)},\n\tpages        = {7655--7660}\n}\n@article{Ned11,\n\ttitle        = {Random algorithms for convex minimization problems},\n\tauthor       = {Nedi\\'c, Angelia},\n\tyear         = 2011,\n\tjournal      = {Math. Program.},\n\tvolume       = 129,\n\tnumber       = {2, Ser. B},\n\tpages        = {225--253},\n\tdoi          = {10.1007/s10107-011-0468-9},\n\tissn         = {0025-5610},\n\tfjournal     = {Mathematical Programming},\n\tmrclass      = {90C25 (90C15 90C34)},\n\tmrnumber     = 2837881,\n\tmrreviewer   = {Teemu Pennanen}\n}\n@incollection{nedic2001convergence,\n\ttitle        = {Convergence rate of incremental subgradient algorithms},\n\tauthor       = {Nedi\\'c, Angelia and Bertsekas, Dimitri},\n\tyear         = 2001,\n\tbooktitle    = {Stochastic optimization: algorithms and applications},\n\tseries       = {Appl. 
Optim.},\n\tvolume       = 54,\n\tpages        = {223--264},\n\tdoi          = {10.1007/978-1-4757-6594-6_11},\n\turl          = {http://dx.doi.org/10.1007/978-1-4757-6594-6_11},\n\tmrclass      = {90C25 (90C52)},\n\tmrnumber     = 1835501,\n\tmrreviewer   = {A. M. Galperin}\n}\n@article{nedic2003least,\n\ttitle        = {Least squares policy evaluation algorithms with linear function approximation},\n\tauthor       = {Nedi{\\'c}, A and Bertsekas, Dimitri P},\n\tyear         = 2003,\n\tjournal      = {Discrete Event Dynamic Systems},\n\tpublisher    = {Springer},\n\tvolume       = 13,\n\tnumber       = {1-2},\n\tpages        = {79--110}\n}\n@inproceedings{needell2014kaczmarz,\n\ttitle        = {Stochastic Gradient Descent, Weighted Sampling, and the Randomized Kaczmarz algorithm},\n\tauthor       = {Deanna Needell and Nathan Srebro and Rachel Ward},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{neelakantan2015adding,\n\ttitle        = {Adding gradient noise improves learning for very deep networks},\n\tauthor       = {Neelakantan, Arvind and Vilnis, Luke and Le, Quoc V and Sutskever, Ilya and Kaiser, Lukasz and Kurach, Karol and Martens, James},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.06807}\n}\n@inproceedings{neelakantan2015compositional,\n\ttitle        = {Compositional Vector Space Models for Knowledge Base Completion},\n\tauthor       = {Arvind Neelakantan and Benjamin Roth and Andrew McCallum},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{neelakantan2016neural,\n\ttitle        = {Neural Programmer: Inducing Latent Programs with Gradient Descent},\n\tauthor       = {Arvind Neelakantan and Quoc V. 
Le and Ilya Sutskever},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{neelakantan2017learning,\n\ttitle        = {Learning a natural language interface with neural programmer},\n\tauthor       = {Arvind Neelakantan and Quoc V Le and Martin Abadi and Andrew McCallum and Dario Amodei},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{negahban2012restricted,\n\ttitle        = {Restricted strong convexity and weighted matrix completion: Optimal bounds with noise},\n\tauthor       = {Negahban, Sahand and Wainwright, Martin J},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 13,\n\tnumber       = {May},\n\tpages        = {1665--1697}\n}\n@inproceedings{negrea2019information,\n\ttitle        = {Information-Theoretic Generalization Bounds for SGLD via Data-Dependent Estimates},\n\tauthor       = {Negrea, Jeffrey and Haghifam, Mahdi and Dziugaite, Gintare Karolina and Khisti, Ashish and Roy, Daniel M},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {11013--11023}\n}\n@inproceedings{nekoto2020participatory,\n\ttitle        = {Participatory Research for Low-resourced Machine Translation: A Case Study in {African} Languages},\n\tauthor       = {Wilhelmina Nekoto and Vukosi Marivate and Tshinondiwa Matsila and Timi Fasubaa and Tajudeen Kolawole and Taiwo Fagbohungbe and Solomon Oluwole Akinola and Shamsuddee Hassan Muhammad and Salomon Kabongo and Salomey Osei and Sackey Freshia and Rubungo Andre Niyongabo and Ricky Macharm and Perez Ogayo and Orevaoghene Ahia and Musie Meressa and Mofe Adeyemi and Masabata Mokgesi-Selinga and Lawrence Okegbemi and Laura Jane Martinus and Kolawole Tajudeen and Kevin Degila and Kelechi Ogueji and Kathleen Siminyu and Julia Kreutzer and Jason Webster and Jamiil Toure Ali and Jade Abbott 
and Iroro Orife and Ignatius Ezeani and Idris Abdulkabir Dangana and Herman Kamper and Hady Elsahar and Goodness Duru and Ghollah Kioko and Espoir Murhabazi and Elan van Biljon and Daniel Whitenack and Christopher Onyefuluchi and Chris Emezue and Bonaventure Dossou and Blessing Sibanda and Blessing Itoro Bassey and Ayodele Olabiyi and Arshath Ramkilowan and Alp Öktem and Adewale Akinfaderin and Abdallah Bashir},\n\tyear         = 2020,\n\tbooktitle    = {Findings of Empirical Methods in Natural Language Processing (Findings of EMNLP)}\n}\n@article{nelson2008exploiting,\n\ttitle        = {Exploiting Machine Learning to Subvert Your Spam Filter},\n\tauthor       = {Blaine Nelson and Marco Barreno and Fuching Jack Chi and Anthony D Joseph and Benjamin IP Rubinstein and Udam Saini and Charles A Sutton and J Doug Tygar and Kai Xia},\n\tyear         = 2008,\n\tjournal      = {LEET},\n\tvolume       = 8,\n\tpages        = {1--9}\n}\n@inproceedings{nelson2009misleading,\n\ttitle        = {Misleading learners: Co-opting your spam filter},\n\tauthor       = {Blaine Nelson and Marco Barreno and Fuching Jack Chi and Anthony D Joseph and Benjamin IP Rubinstein and Udam Saini and Charles Sutton and JD Tygar and Kai Xia},\n\tyear         = 2009,\n\tbooktitle    = {Machine learning in cyber trust},\n\tpages        = {17--51}\n}\n@inproceedings{nelson2013osnap,\n\ttitle        = {{OSNAP}: Faster numerical linear algebra algorithms via sparser subspace embeddings},\n\tauthor       = {Jelani Nelson and Huy L. Nguyen},\n\tyear         = 2013,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{nemirovski02saddle,\n\ttitle        = {An Efficient Stochastic Approximation Algorithm for Stochastic Saddle Point Problems},\n\tauthor       = {A. Nemirovski and R. Y. 
Rubinstein},\n\tyear         = 2002,\n\tjournal      = {International Series in Operations Research and Management Science},\n\tvolume       = 46,\n\tpages        = {155--184}\n}\n@techreport{nemirovski1997,\n\ttitle        = {On self-concordant convex-concave functions},\n\tauthor       = {Nemirovskii, Arkadii},\n\tyear         = 1997,\n\tmonth        = jun,\n\tnumber       = {\\# 3/97},\n\tinstitution  = {Optimization Laboratory Faculty of Industrial Engineering and Management, The Technion - Israel Institute of Technology}\n}\n@article{Nemirovski2004,\n\ttitle        = {{Prox-Method with Rate of Convergence $O(1/t)$ for Variational Inequalities with Lipschitz Continuous Monotone Operators and Smooth Convex-Concave Saddle Point Problems}},\n\tauthor       = {Nemirovski, Arkadi},\n\tyear         = 2004,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {229--251},\n\tdoi          = {10.1137/S1052623403425629},\n\tissn         = {1052-6234},\n\tannote       = {Nemirovski's Mirror-Prox Method},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@incollection{nemirovski2005efficient,\n\ttitle        = {An efficient stochastic approximation algorithm for stochastic saddle point problems},\n\tauthor       = {Nemirovski, Arkadi and Rubinstein, Reuven Y},\n\tyear         = 2005,\n\tbooktitle    = {Modeling Uncertainty},\n\tpublisher    = {Springer},\n\tpages        = {156--184}\n}\n@article{nemirovski2006convex,\n\ttitle        = {Convex approximations of chance constrained programs},\n\tauthor       = {Nemirovski, Arkadi and Shapiro, Alexander},\n\tyear         = 2006,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 17,\n\tnumber       = 4,\n\tpages        = {969--996}\n}\n@article{nemirovski2009robust,\n\ttitle        = {Robust stochastic approximation approach to stochastic programming},\n\tauthor       = {Nemirovski, Arkadi 
and Juditsky, Anatoli and Lan, Guanghui and Shapiro, Alexander},\n\tyear         = 2009,\n\tjournal      = {SIAM Journal on optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 19,\n\tnumber       = 4,\n\tpages        = {1574--1609},\n\tdoi          = {10.1137/070704277},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/070704277},\n\tfjournal     = {SIAM Journal on Optimization},\n\tmrclass      = {90C15 (90C51)},\n\tmrnumber     = 2486041,\n\tmrreviewer   = {Teemu Pennanen}\n}\n@book{Nemirovski2013,\n\ttitle        = {{Lectures on Modern Convex Optimization}},\n\tauthor       = {{Ben-Tal}, Aharon and Nemirovski, Arkadi},\n\tyear         = 2013,\n\tmonth        = jan,\n\tpublisher    = {Society for Industrial and Applied Mathematics},\n\tdoi          = {10.1137/1.9780898718829},\n\tisbn         = {978-0-89871-491-3},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Ben-Tal, Nemirovski - 2013 - Lectures on Modern Convex Optimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory,Books/Optimization}\n}\n@article{NemirovskiBook,\n\ttitle        = {Interior point polynomial time methods in convex programming},\n\tauthor       = {Nemirovskii, AS},\n\tyear         = 2004,\n\tjournal      = {Lecture Notes}\n}\n@book{Nemirovsky1978,\n\ttitle        = {Problem complexity and method efficiency in optimization.},\n\tauthor       = {Nemirovsky, Arkadi and Yudin, David},\n\tyear         = 1978,\n\tpublisher    = {Nauka Publishers, Moscow (in Russian)},\n\tnote         = {John Wiley, New York (in English) 1983}\n}\n@inproceedings{nenkova2007pyramid,\n\ttitle        = {The Pyramid Method: Incorporating human content selection variation in summarization evaluation},\n\tauthor       = {Ani Nenkova and Rebecca J. 
Passonneau and Kathleen McKeown},\n\tyear         = 2007,\n\tbooktitle    = {ACM Transactions on Speech and Language Processing}\n}\n@book{Nesterov,\n\ttitle        = {Introductory lectures on convex optimization : a basic course},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2004,\n\tpublisher    = {Kluwer Academic Publ.},\n\taddress      = {Boston, Dordrecht, London},\n\tseries       = {Applied optimization},\n\tvolume       = 87,\n\tisbn         = {1-4020-7553-7},\n\turl          = {http://opac.inria.fr/record=b1104789}\n}\n@inproceedings{Nesterov1983,\n\ttitle        = {A method of solving a convex programming problem with convergence rate {$O(1/k^2)$}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 1983,\n\tbooktitle    = {Doklady AN SSSR (translated as Soviet Mathematics Doklady)},\n\tvolume       = 269,\n\tnumber       = 2,\n\tpages        = {543--547}\n}\n@article{nesterov1983acceleration,\n\ttitle        = {A method of solving a convex programming problem with convergence rate ${O}(1/k^2)$},\n\tauthor       = {Y. Nesterov},\n\tyear         = 1983,\n\tjournal      = {Soviet Mathematics Doklady},\n\tvolume       = 27,\n\tnumber       = 2,\n\tpages        = {372--376}\n}\n@book{nesterov1994interior,\n\ttitle        = {Interior-point polynomial algorithms in convex programming},\n\tauthor       = {Nesterov, Yurii and Nemirovskii, Arkadii and Ye, Yinyu},\n\tyear         = 1994,\n\tpublisher    = {SIAM},\n\tvolume       = 13\n}\n@article{nesterov1998semidefinite,\n\ttitle        = {Semidefinite relaxation and nonconvex quadratic optimization},\n\tauthor       = {Y. 
Nesterov},\n\tyear         = 1998,\n\tjournal      = {Optimization methods and software},\n\tvolume       = 9,\n\tpages        = {141--160}\n}\n@incollection{nesterov2000squared,\n\ttitle        = {Squared functional systems and optimization problems},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2000,\n\tbooktitle    = {High performance optimization},\n\tpublisher    = {Springer},\n\tpages        = {405--440}\n}\n@book{Nesterov2004,\n\ttitle        = {Introductory Lectures on Convex Programming Volume: A Basic course},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2004,\n\tpublisher    = {Kluwer Academic Publishers},\n\tvolume       = {I},\n\tisbn         = 1402075537\n}\n@book{nesterov2004introductory,\n\ttitle        = {Introductory Lectures on Convex Optimization: A Basic Course},\n\tauthor       = {Y. Nesterov},\n\tyear         = 2004,\n\tpublisher    = {Springer}\n}\n@article{Nesterov2005,\n\ttitle        = {{Smooth minimization of non-smooth functions}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2005,\n\tmonth        = dec,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 103,\n\tnumber       = 1,\n\tpages        = {127--152},\n\tdoi          = {10.1007/s10107-004-0552-5},\n\tisbn         = 1010700405,\n\tissn         = {0025-5610},\n\tabstract     = {In this paper we propose a new approach for constructing efficient schemes for non-smooth convex optimization. It is based on a special smoothing technique, which can be applied to the functions with explicit max-structure. Our approach can be considered as an alternative to black-box minimization. From the viewpoint of efficiency estimates, we manage to improve the traditional bounds on the number of iterations of the gradient schemes from $O(1/\epsilon^2)$ to $O(1/\epsilon)$, keeping basically unchanged the complexity of each iteration},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Nesterov - 2005 - Smooth minimization of non-smooth functions.pdf:pdf},\n\tkeywords     = {complexity theory,convex optimization,non smooth optimization,optimal methods,optimization,structural optimization},\n\tmendeley-groups = {Optimization/Gradient Descent Theory},\n\tmendeley-tags = {optimization}\n}\n@article{Nesterov2005excessive,\n\ttitle        = {{Excessive Gap Technique in Nonsmooth Convex Minimization}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2005,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 16,\n\tnumber       = 1,\n\tpages        = {235--249},\n\tdoi          = {10.1137/S1052623403422285},\n\tissn         = {1052-6234},\n\tannote       = {YinTat mentioned that this paper may have combined the primal/dual descent steps of Nesterov into (either one or two, I forgot) Prox steps.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Nesterov - 2005 - Excessive Gap Technique in Nonsmooth Convex Minimization.pdf:pdf},\n\tkeywords     = {black-box oracle,complexity theory,convex optimization,non-smooth optimization,optimal methods,structural},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@article{nesterov2005smooth,\n\ttitle        = {Smooth minimization of non-smooth functions},\n\tauthor       = {Y. 
Nesterov},\n\tyear         = 2005,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 103,\n\tnumber       = 1,\n\tpages        = {127--152}\n}\n@article{nesterov2006cubic,\n\ttitle        = {Cubic regularization of Newton method and its global performance},\n\tauthor       = {Nesterov, Yurii and Polyak, Boris T},\n\tyear         = 2006,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 108,\n\tnumber       = 1,\n\tpages        = {177--205}\n}\n@article{nesterov2008cubic,\n\ttitle        = {Accelerating the cubic regularization of Newton's method on convex problems},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2008,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 112,\n\tnumber       = 1,\n\tpages        = {159--181}\n}\n@article{nesterov2008rounding,\n\ttitle        = {Rounding of convex sets and efficient gradient methods for linear programming problems},\n\tauthor       = {Nesterov, Yu},\n\tyear         = 2008,\n\tjournal      = {Optimisation Methods and Software},\n\tpublisher    = {Taylor \\& Francis},\n\tvolume       = 23,\n\tnumber       = 1,\n\tpages        = {109--128}\n}\n@article{Nesterov2009,\n\ttitle        = {{Primal-dual subgradient methods for convex problems}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2007,\n\tmonth        = jun,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 120,\n\tnumber       = 1,\n\tpages        = {221--259},\n\tdoi          = {10.1007/s10107-007-0149-x},\n\tissn         = {0025-5610},\n\tabstract     = {In this paper we present a new approach for constructing subgradient schemes for different types of nonsmooth problems with convex structure. Our methods are primal-dual since they are always able to generate a feasible approximation to the optimum of an appropriately formulated dual problem. 
Besides other advantages, this useful feature provides the methods with a reliable stopping criterion. The proposed schemes differ from the classical approaches (divergent series methods, mirror descent methods) by presence of two control sequences. The first sequence is responsible for aggregating the support functions in the dual space, and the second one establishes a dynamically updated scale between the primal and dual spaces. This additional flexibility allows to guarantee a boundedness of the sequence of primal test points even in the case of unbounded feasible set (however, we always assume the uniform boundedness of subgradients). We present the variants of subgradient schemes for nonsmooth convex minimization, minimax problems, saddle point problems, variational inequalities, and stochastic optimization. In all situations our methods are proved to be optimal from the viewpoint of worst-case black-box lower complexity bounds.},\n\tannote       = {A good citation to his dual averaging.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Nesterov - 2007 - Primal-dual subgradient methods for convex problems.pdf:pdf},\n\tkeywords     = {Black-box methods,Convex optimization,Lower complexity bounds,Minimax problems,Non-smooth optimization,Saddle points,Stochastic optimization,Subgradient methods,Variational inequalities},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@article{nesterov2011random,\n\ttitle        = {Random gradient-free minimization of convex functions},\n\tauthor       = {Yurii Nesterov and Vladimir Spokoiny},\n\tyear         = 2011,\n\tjournal      = {Foundations of Computational Mathematics},\n\tpages        = {1--40}\n}\n@article{Nesterov2012,\n\ttitle        = {{Efficiency of Coordinate Descent Methods on Huge-Scale Optimization Problems}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2012,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Optimization},\n\tvolume       = 
22,\n\tnumber       = 2,\n\tpages        = {341--362},\n\tdoi          = {10.1137/100802001},\n\tissn         = {1052-6234},\n\turl          = {http://130.104.5.100/cps/ucl/doc/core/documents/coredp2010{\\_}2web.pdf http://epubs.siam.org/doi/abs/10.1137/100802001},\n\tabstract     = {In this paper we propose new methods for solving huge-scale optimization problems. For problems of this size, even the simplest full-dimensional vector operations are very expensive. Hence, we propose to apply an optimization technique based on random partial update of decision variables. For these methods, we prove the global estimates for the rate of convergence. Surprisingly, for certain classes of objective functions, our results are better than the standard worst-case bounds for deterministic algorithms. We present constrained and unconstrained versions of the method and its accelerated variant. Our numerical test confirms a high efficiency of this technique on problems of very big size. Read More: http://epubs.siam.org/doi/abs/10.1137/100802001},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Nesterov - 2012 - Efficiency of Coordinate Descent Methods on Huge-Scale Optimization Problems.pdf:pdf},\n\tkeywords     = {Google problem,convex optimization,coordinate relaxation,fast gradient schemes,worst-case efficiency estimates},\n\tmendeley-groups = {Optimization/Coordinate Descent}\n}\n@article{Nesterov2013,\n\ttitle        = {{Gradient methods for minimizing composite functions}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2013,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 140,\n\tnumber       = 1,\n\tpages        = {125--161},\n\tdoi          = {10.1007/s10107-012-0629-5},\n\tissn         = {0025-5610},\n\tfile         = {:D$\\backslash$:/Mendeley Desktop/Nesterov - 2013 - Gradient methods for minimizing composite functions.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory,Optimization/Gradient Descent 
Theory/Composite}\n}\n@article{Nesterov2014,\n\ttitle        = {{Universal gradient methods for convex optimization problems}},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2014,\n\tmonth        = may,\n\tjournal      = {Mathematical Programming},\n\tdoi          = {10.1007/s10107-014-0790-0},\n\tissn         = {0025-5610},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@book{NesterovBook,\n\ttitle        = {Introductory lectures on convex optimization},\n\tauthor       = {Nesterov, Yurii},\n\tyear         = 2004,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume       = 87\n}\n@techreport{NesterovStich2016,\n\ttitle        = {Efficiency of accelerated coordinate descent method on structured optimization problems},\n\tauthor       = {Nesterov, Yurii and Stich, Sebastian},\n\tyear         = 2016,\n\tinstitution  = {CORE Discussion Papers}\n}\n@article{nestor2019feature,\n\ttitle        = {Feature robustness in non-stationary health records: caveats to deployable model performance in common clinical machine learning tasks},\n\tauthor       = {Bret Nestor and Matthew McDermott and Willie Boag and Gabriela Berner and Tristan Naumann and Michael C Hughes and Anna Goldenberg and Marzyeh Ghassemi},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1908.00690}\n}\n@article{netrapalli2013phase,\n\ttitle        = {Phase Retrieval using Alternating Minimization},\n\tauthor       = {Netrapalli, Praneeth and Jain, Prateek and Sanghavi, Sujay},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1306.0160}\n}\n@inproceedings{netzer2011reading,\n\ttitle        = {Reading Digits in Natural Images with Unsupervised Feature Learning},\n\tauthor       = {Netzer, Yuval and Wang, Tao and Coates, Adam and Bissacco, Alessandro and Wu, Bo and Ng, Andrew Y},\n\tyear         = 2011,\n\tbooktitle    = {NIPS Workshop on Deep Learning and Unsupervised Feature Learning}\n}\n@article{neu2020unifying,\n\ttitle        = {A 
Unifying View of Optimism in Episodic Reinforcement Learning},\n\tauthor       = {Neu, Gergely and Pike-Burke, Ciara},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.01891}\n}\n@inproceedings{NEURIPS2018_69386f6b,\n\ttitle        = {Neural Ordinary Differential Equations},\n\tauthor       = {Chen, Ricky T. Q. and Rubanova, Yulia and Bettencourt, Jesse and Duvenaud, David K},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 31,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2018/file/69386f6bb1dfed68692a24c8686939b9-Paper.pdf},\n\teditor       = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett}\n}\n@inproceedings{NEURIPS2018_ab88b157,\n\ttitle        = {Conditional Adversarial Domain Adaptation},\n\tauthor       = {Long, Mingsheng and CAO, ZHANGJIE and Wang, Jianmin and Jordan, Michael I},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 31,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2018/file/ab88b15733f543179858600245108dd8-Paper.pdf},\n\teditor       = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. 
Garnett}\n}\n@incollection{NEURIPS2019_9015,\n\ttitle        = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},\n\tauthor       = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems 32},\n\tpublisher    = {Curran Associates, Inc.},\n\tpages        = {8024--8035},\n\turl          = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf},\n\teditor       = {H. Wallach and H. Larochelle and A. Beygelzimer and F. dAlch\\'{e}-Buc and E. Fox and R. Garnett}\n}\n@inproceedings{NEURIPS2020_e1fe6165,\n\ttitle        = {Neural Networks with Small Weights and Depth-Separation Barriers},\n\tauthor       = {Vardi, Gal and Shamir, Ohad},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {19433--19442},\n\turl          = {https://proceedings.neurips.cc/paper/2020/file/e1fe6165cad3f7f3f57d409f78e4415f-Paper.pdf},\n\teditor       = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}\n}\n@article{nevitt2006osteoarthritis,\n\ttitle        = {The {Osteoarthritis Initiative}},\n\tauthor       = {M Nevitt and David T Felson and Gayle Lester},\n\tyear         = 2006,\n\tjournal      = {Cohort study protocol}\n}\n@article{newcomb1886generalized,\n\ttitle        = {A Generalized Theory of the Combination of Observations so as to Obtain the Best Result},\n\tauthor       = {Simon Newcomb},\n\tyear         = 1886,\n\tjournal      = {American Journal of Mathematics},\n\tvolume       = 8,\n\tnumber       = 4,\n\tpages        = {343--366}\n}\n@inproceedings{newell2014practicality,\n\ttitle        = {On the practicality of integrity attacks on document-level sentiment analysis},\n\tauthor       = {Andrew Newell and Rahul Potharaju and Luojie Xiang and Cristina Nita-Rotaru},\n\tyear         = 2014,\n\tbooktitle    = {Workshop on Artificial Intelligence and Security (AISec)},\n\tpages        = {83--93}\n}\n@article{newey1994asymptotic,\n\ttitle        = {The asymptotic variance of semiparametric estimators},\n\tauthor       = {Whitney K. Newey},\n\tyear         = 1994,\n\tjournal      = {Econometrica: Journal of the Econometric Society},\n\tpages        = {1349--1382}\n}\n@incollection{newey1994large,\n\ttitle        = {Large sample estimation and hypothesis testing},\n\tauthor       = {Whitney K. Newey and Daniel McFadden},\n\tyear         = 1994,\n\tbooktitle    = {Handbook of Econometrics},\n\tvolume       = 4,\n\tpages        = {2111--2245}\n}\n@article{newman2003lying,\n\ttitle        = {Lying Words: Predicting Deception From Linguistic Styles},\n\tauthor       = {Matthew L. Newman and James W. Pennebaker and Diane S. Berry and J. 
Michael Richards},\n\tyear         = 2003,\n\tjournal      = {Personality and Social Psychology Bulletin},\n\tvolume       = 29\n}\n@inproceedings{newman2006statistical,\n\ttitle        = {Statistical entity-topic models},\n\tauthor       = {Newman, David and Chemudugunta, Chaitanya and Smyth, Padhraic},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining},\n\tlocation     = {Philadelphia, PA, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '06},\n\tpages        = {680--686},\n\tdoi          = {10.1145/1150402.1150487},\n\tisbn         = {1-59593-339-5},\n\tacmid        = 1150487,\n\tkeywords     = {entity recognition, text modeling, topic modeling},\n\tnumpages     = 7\n}\n@article{newman2018harmonic,\n\ttitle        = {HARMONIC: A Multimodal Dataset of Assistive Human-Robot Collaboration},\n\tauthor       = {Benjamin A. Newman and Reuben M. Aronson and S. Srinivasa and K. Kitani and Henny Admoni},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.11154}\n}\n@inproceedings{newman2020eos,\n\ttitle        = {The {EOS} Decision and Length Extrapolation},\n\tauthor       = {Benjamin Newman and John Hewitt and Percy Liang and Christopher D. 
Manning},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the Third BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP}\n}\n@inproceedings{newsamp,\n\ttitle        = {Convergence rates of sub-sampled Newton methods},\n\tauthor       = {Erdogdu, Murat A and Montanari, Andrea},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3034--3042}\n}\n@inproceedings{newsome2006paragraph,\n\ttitle        = {Paragraph: Thwarting signature learning by training maliciously},\n\tauthor       = {James Newsome and Brad Karp and Dawn Song},\n\tyear         = 2006,\n\tbooktitle    = {International Workshop on Recent Advances in Intrusion Detection}\n}\n@inproceedings{ney1992improvements,\n\ttitle        = {Improvements in beam search for 10000-word continuous speech recognition},\n\tauthor       = {Hermann Ney and Reinhold Haeb-Umbach and B-H Tran and Martin Oerder},\n\tyear         = 1992,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tvolume       = 1,\n\tpages        = {9--12}\n}\n@article{ney1994structuring,\n\ttitle        = {On structuring probabilistic dependences in stochastic language modeling},\n\tauthor       = {Hermann Ney and Ute Essen and Reinhard Kneser},\n\tyear         = 1994,\n\tjournal      = {Computer, Speech, and Language},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {1--38}\n}\n@inproceedings{ney96hmm,\n\ttitle        = {{HMM}-Based Word Alignment in Statistical Translation},\n\tauthor       = {Hermann Ney and Stephan Vogel},\n\tyear         = 1996,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {836--841}\n}\n@article{neyshabur2014implicit,\n\ttitle        = {In search of the real inductive bias: On the role of implicit regularization in deep learning},\n\tauthor       = {Behnam Neyshabur and Ryota Tomioka and Nathan Srebro},\n\tyear         
= 2014,\n\tjournal      = {arXiv}\n}\n@inproceedings{neyshabur2015norm,\n\ttitle        = {Norm-based capacity control in neural networks},\n\tauthor       = {Neyshabur, Behnam and Tomioka, Ryota and Srebro, Nathan},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1376--1401},\n\torganization = {PMLR}\n}\n@inproceedings{neyshabur2015path,\n\ttitle        = {Path-sgd: Path-normalized optimization in deep neural networks},\n\tauthor       = {Neyshabur, Behnam and Salakhutdinov, Russ R and Srebro, Nati},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2422--2430}\n}\n@inproceedings{neyshabur2017exploring,\n\ttitle        = {Exploring generalization in deep learning},\n\tauthor       = {Neyshabur, Behnam and Bhojanapalli, Srinadh and McAllester, David and Srebro, Nati},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {5947--5956}\n}\n@inproceedings{neyshabur2017generalization,\n\ttitle        = {Exploring Generalization in Deep Learning},\n\tauthor       = {Behnam Neyshabur and Srinadh Bhojanapalli and David McAllester and Nathan Srebro},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{neyshabur2017pac,\n\ttitle        = {A pac-bayesian approach to spectrally-normalized margin bounds for neural networks},\n\tauthor       = {Neyshabur, Behnam and Bhojanapalli, Srinadh and Srebro, Nathan},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.09564}\n}\n@article{NFINDR,\n\ttitle        = {N-FindR method versus independent component analysis for lithological identification in hyperspectral imagery},\n\tauthor       = {C. Gomez and H. Le Borgne and P. Allemand and C. Delacourt and P. Ledru},\n\tyear         = 2007,\n\tmonth        = jan,\n\tjournal      = {Int. J. 
Remote Sens.},\n\tpublisher    = {Taylor \\& Francis, Inc.},\n\taddress      = {Bristol, PA, USA},\n\tvolume       = 28,\n\tnumber       = 23,\n\tissue_date   = {November 2007}\n}\n@inproceedings{ng02compare,\n\ttitle        = {On Discriminative vs. Generative classifiers: A comparison of logistic regression and naive {B}ayes},\n\tauthor       = {Andrew Y. Ng and Michael I. Jordan},\n\tyear         = 2002,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{ng1999policy,\n\ttitle        = {Policy invariance under reward transformations: Theory and application to reward shaping},\n\tauthor       = {Andrew Y Ng and Daishi Harada and Stuart Russell},\n\tyear         = 1999,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tvolume       = 99,\n\tpages        = {278--287}\n}\n@inproceedings{ng2000machine,\n\ttitle        = {A machine learning approach to answering questions for reading comprehension tests},\n\tauthor       = {Hwee Tou Ng and Leong Hwee Teo and Jennifer Lai Pheng Kwan},\n\tyear         = 2000,\n\tbooktitle    = {Joint SIGDAT conference on empirical methods in natural language processing and very large corpora - Volume 13},\n\tpages        = {124--132}\n}\n@article{ng2001spectral,\n\ttitle        = {On spectral clustering: Analysis and an algorithm},\n\tauthor       = {Ng, Andrew and Jordan, Michael and Weiss, Yair},\n\tyear         = 2001,\n\tjournal      = {Advances in neural information processing systems},\n\tbooktitle    = {Advances in neural information processing systems},\n\tvolume       = 14,\n\tpages        = {849--856}\n}\n@inproceedings{Ng2004-L1LR,\n\ttitle        = {Feature selection, {L1 vs. 
L2} regularization, and rotational invariance},\n\tauthor       = {Ng, Andrew Y.},\n\tyear         = 2004,\n\tbooktitle    = {Proceedings of the 21st International Conference on Machine Learning},\n\tseries       = {ICML 2004},\n\tpages        = 78,\n\torganization = {ACM}\n}\n@article{ng2009agenda,\n\ttitle        = {An agenda for personalized medicine},\n\tauthor       = {Ng, Pauline C and Murray, Sarah S and Levy, Samuel and Venter, J Craig},\n\tyear         = 2009,\n\tjournal      = {Nature},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 461,\n\tnumber       = 7265,\n\tpages        = {724--726}\n}\n@article{Nguyen-Regev,\n\ttitle        = {Learning a Parallelepiped: Cryptanalysis of {GGH} and {NTRU} Signatures},\n\tauthor       = {P. Q. Nguyen and O. Regev},\n\tyear         = 2009,\n\tjournal      = {Journal of Cryptology},\n\tvolume       = 22,\n\tnumber       = 2,\n\tpages        = {139--160}\n}\n@article{nguyen2013exact,\n\ttitle        = {Exact Recoverability From Dense Corrupted Observations via $\\ell_1$-Minimization},\n\tauthor       = {Nam H. Nguyen and Trac D. Tran},\n\tyear         = 2013,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 59,\n\tnumber       = 4,\n\tpages        = {2017--2035}\n}\n@inproceedings{nguyen2013ros,\n\ttitle        = {{ROS} Commander (ROSCo): Behavior Creation for Home Robots},\n\tauthor       = {H. Nguyen and M. Ciocarlie and J. Hsiao and C. C. 
Kemp},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{nguyen2014anchors,\n\ttitle        = {Anchors Regularized: Adding Robustness and Extensibility to Scalable Topic-Modeling Algorithms},\n\tauthor       = {Thang Nguyen and Yuening Hu and Jordan Boyd-Graber},\n\tyear         = 2014,\n\tbooktitle    = {ACL}\n}\n@inproceedings{nguyen20151,\n\ttitle        = {L 1 adaptive control for bipedal robots with control Lyapunov function based quadratic programs},\n\tauthor       = {Nguyen, Quan and Sreenath, Koushil},\n\tyear         = 2015,\n\tbooktitle    = {2015 American Control Conference (ACC)},\n\tpages        = {862--867},\n\torganization = {IEEE}\n}\n@inproceedings{nguyen2015posterior,\n\ttitle        = {Posterior calibration and exploratory analysis for natural language processing models},\n\tauthor       = {Khanh Nguyen and Brendan O'Connor},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1587--1598}\n}\n@inproceedings{nguyen2016exponential,\n\ttitle        = {Exponential control barrier functions for enforcing high relative-degree safety-critical constraints},\n\tauthor       = {Nguyen, Quan and Sreenath, Koushil},\n\tyear         = 2016,\n\tbooktitle    = {2016 American Control Conference (ACC)},\n\tpages        = {322--328},\n\torganization = {IEEE}\n}\n@inproceedings{nguyen2016ms,\n\ttitle        = {{MS MARCO}: A human generated machine reading comprehension dataset},\n\tauthor       = {Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng},\n\tyear         = 2016,\n\tbooktitle    = {Workshop on Cognitive Computing at NIPS}\n}\n@inproceedings{nguyen2017loss,\n\ttitle        = {The Loss Surface of Deep and Wide Neural Networks},\n\tauthor       = {Nguyen, Quynh and Hein, Matthias},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint 
arXiv:1704.08045},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2603--2612}\n}\n@article{nguyen2017loss2,\n\ttitle        = {The loss surface and expressivity of deep convolutional neural networks},\n\tauthor       = {Nguyen, Quynh and Hein, Matthias},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.10928}\n}\n@inproceedings{nguyen2018optimization,\n\ttitle        = {Optimization landscape and expressivity of deep cnns},\n\tauthor       = {Nguyen, Quynh and Hein, Matthias},\n\tyear         = 2018,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {3730--3739},\n\torganization = {PMLR}\n}\n@inproceedings{ni2019justifying,\n\ttitle        = {Justifying recommendations using distantly-labeled reviews and fine-grained aspects},\n\tauthor       = {Jianmo Ni and Jiacheng Li and Julian McAuley},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {188--197}\n}\n@article{ni2020survey,\n\ttitle        = {A Survey on Theories and Applications for Self-Driving Cars Based on Deep Learning Methods},\n\tauthor       = {Ni, Jianjun and Chen, Yinan and Chen, Yan and Zhu, Jinxiu and Ali, Deena and Cao, Weidong},\n\tyear         = 2020,\n\tjournal      = {Applied Sciences},\n\tpublisher    = {Multidisciplinary Digital Publishing Institute},\n\tvolume       = 10,\n\tnumber       = 8,\n\tpages        = 2749\n}\n@inproceedings{nickel12yago,\n\ttitle        = {Factorizing {YAGO}},\n\tauthor       = {Maximilian Nickel and Volker Tresp and Hans-Peter Kriegel},\n\tyear         = 2012,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@inproceedings{nickel2011three,\n\ttitle        = {A three-way model for collective learning on multi-relational data},\n\tauthor       = {Maximilian Nickel and Volker Tresp and Hans-Peter Kriegel},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages   
     = {809--816}\n}\n@inproceedings{nickel2014reducing,\n\ttitle        = {Reducing the Rank in Relational Factorization Models by Including Observable Patterns},\n\tauthor       = {Maximilian Nickel and Xueyan Jiang and Volker Tresp},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1179--1187}\n}\n@inproceedings{niculescu2005predicting,\n\ttitle        = {Predicting good probabilities with supervised learning},\n\tauthor       = {Niculescu-Mizil, Alexandru and Caruana, Rich},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {625--632},\n\torganization = {ACM}\n}\n@inproceedings{niculescu2007inductive,\n\ttitle        = {Inductive Transfer for Bayesian Network Structure Learning.},\n\tauthor       = {Niculescu-Mizil, Alexandru and Caruana, Rich},\n\tyear         = 2007,\n\tbooktitle    = {AISTATS},\n\tpages        = {339--346}\n}\n@article{nie2013certifying,\n\ttitle        = {Certifying convergence of Lasserre’s hierarchy via flat truncation},\n\tauthor       = {Jiawang Nie},\n\tyear         = 2013,\n\tjournal      = {Mathematical Programming},\n\tvolume       = 142,\n\tnumber       = 1,\n\tpages        = {485--510}\n}\n@article{nie2013linear,\n\ttitle        = {Linear optimization with cones of moments and nonnegative polynomials},\n\tauthor       = {Jiawang Nie},\n\tyear         = 2013,\n\tjournal      = {Mathematical Programming},\n\tpages        = {1--28}\n}\n@inproceedings{Nie2013online,\n\ttitle        = {Online pca with optimal regrets},\n\tauthor       = {Nie, Jiazhong and Kot{\\l}owski, Wojciech and Warmuth, Manfred K},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {98--112},\n\torganization = {Springer}\n}\n@article{nie2014optimality,\n\ttitle        = {Optimality conditions and finite convergence of Lasserre’s 
hierarchy},\n\tauthor       = {Jiawang Nie},\n\tyear         = 2014,\n\tjournal      = {Mathematical programming},\n\tvolume       = 146,\n\tnumber       = 1,\n\tpages        = {97--121}\n}\n@article{nie2014truncated,\n\ttitle        = {The {A}-Truncated {K}-Moment Problem},\n\tauthor       = {Jiawang Nie},\n\tyear         = 2014,\n\tjournal      = {Foundations of Computational Mathematics},\n\tvolume       = 14,\n\tnumber       = 6,\n\tpages        = {1243--1276}\n}\n@article{nie2015hierarchy,\n\ttitle        = {The hierarchy of local minimums in polynomial optimization},\n\tauthor       = {Nie, Jiawang},\n\tyear         = 2015,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 151,\n\tnumber       = 2,\n\tpages        = {555--583}\n}\n@inproceedings{nie2020adversarial,\n\ttitle        = {Adversarial NLI: A New Benchmark for Natural Language Understanding},\n\tauthor       = {Yixin Nie and Adina Williams and Emily Dinan and Mohit Bansal and J. Weston and Douwe Kiela},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{nie2020learn,\n\ttitle        = {What Can We Learn from Collective Human Opinions on Natural Language Inference Data?},\n\tauthor       = {Yixin Nie and Xiang Zhou and Mohit Bansal},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{niekum2013incremental,\n\ttitle        = {Incremental Semantically Grounded Learning from Demonstration},\n\tauthor       = {S. Niekum and S. Chitta and A. Barto and B. Marthi and S. 
Osentoski},\n\tyear         = 2013,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{nielsen2009statistical,\n\ttitle        = {Statistical exponential families: A digest with flash cards},\n\tauthor       = {Frank Nielsen and Vincent Garcia},\n\tyear         = 2009,\n\tjournal      = {arXiv preprint arXiv:0911.4863}\n}\n@inproceedings{niepert2014exchangeable,\n\ttitle        = {Exchangeable variable models},\n\tauthor       = {Mathias Niepert and Pedro Domingos},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{nigam1998learning,\n\ttitle        = {Learning to classify text from labeled and unlabeled documents},\n\tauthor       = {Kamal Nigam and Andrew McCallum and Sebastian Thrun and Tom Mitchell},\n\tyear         = 1998,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{nikolaidis2017human,\n\ttitle        = {Human-robot mutual adaptation in shared autonomy},\n\tauthor       = {Stefanos Nikolaidis and Yu Xiang Zhu and David Hsu and Siddhartha Srinivasa},\n\tyear         = 2017,\n\tbooktitle    = {ACM/IEEE International Conference on Human Robot Interaction (HRI)},\n\tpages        = {294--302}\n}\n@inproceedings{NIPS12-WLSWC,\n\ttitle        = {Learning with Partially Absorbing Random Walks},\n\tauthor       = {Wu, Xiao-Ming and Li, Zhenguo and So, Anthony Man-Cho and Wright, John and Chang, Shih-Fu},\n\tyear         = 2012,\n\tbooktitle    = {NIPS}\n}\n@incollection{NIPS2017_7203,\n\ttitle        = {The Expressive Power of Neural Networks: A View from the Width},\n\tauthor       = {Lu, Zhou and Pu, Hongming and Wang, Feicheng and Hu, Zhiqiang and Wang, Liwei},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems 30},\n\tpublisher    = {Curran Associates, Inc.},\n\tpages        = {6231--6239}\n}\n@inproceedings{NIPS2017_a8baa565,\n\ttitle        = {Label Efficient Learning of 
Transferable Representations acrosss Domains and Tasks},\n\tauthor       = {Luo, Zelun and Zou, Yuliang and Hoffman, Judy and Fei-Fei, Li F},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 30,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2017/file/a8baa56554f96369ab93e4f3bb068c22-Paper.pdf},\n\teditor       = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett}\n}\n@book{nisan2007algorithmic,\n\ttitle        = {Algorithmic game theory},\n\tauthor       = {Nisan, Noam and Roughgarden, Tim and Tardos, Eva and Vazirani, Vijay V},\n\tyear         = 2007,\n\tpublisher    = {Cambridge University Press Cambridge},\n\tvolume       = 1\n}\n@techreport{nivre05dependency,\n\ttitle        = {Dependency Grammar and Dependency Parsing},\n\tauthor       = {Joakim Nivre},\n\tyear         = 2005,\n\tinstitution  = {Växjö University: School of Mathematics and Systems Engineering}\n}\n@inproceedings{nivre2003efficient,\n\ttitle        = {An efficient algorithm for projective dependency parsing},\n\tauthor       = {Joakim Nivre},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the 8th International Workshop on Parsing Technologies (IWPT)}\n}\n@article{nixon2019calibration,\n\ttitle        = {Measuring Calibration in Deep Learning},\n\tauthor       = {Jeremy V. Nixon and Michael W. 
Dusenberry and Linchuan Zhang and Ghassen Jerfel and Dustin Tran},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@book{nixon2019feature,\n\ttitle        = {Feature extraction and image processing for computer vision},\n\tauthor       = {Mark Nixon and Alberto Aguado},\n\tyear         = 2019,\n\tpublisher    = {Academic press}\n}\n@inproceedings{NJS,\n\ttitle        = {Phase Retrieval using Alternating Minimization},\n\tauthor       = {Praneeth Netrapalli and Prateek Jain and Sujay Sanghavi},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems 26: 27th Annual Conference on Neural Information Processing Systems 2013. Proceedings of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States.},\n\tpages        = {2796--2804},\n\turl          = {http://papers.nips.cc/paper/5041-phase-retrieval-using-alternating-minimization},\n\tcrossref     = {DBLP:conf/nips/2013},\n\ttimestamp    = {Fri, 31 Jan 2014 12:11:40 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/Netrapalli0S13},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{NNopq2014,\n\ttitle        = {Optimized Product Quantization},\n\tauthor       = {Tiezheng Ge and Kaiming He and Qifa Ke and Jian Sun},\n\tyear         = 2014,\n\tjournal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},\n\tvolume       = 36,\n\tnumber       = 4,\n\tpages        = {744--755}\n}\n@article{NNpq2011,\n\ttitle        = {Product Quantization for Nearest Neighbor Search},\n\tauthor       = {Herv{\\'{e}} J{\\'{e}}gou and Matthijs Douze and Cordelia Schmid},\n\tyear         = 2011,\n\tjournal      = {{IEEE} Trans. Pattern Anal. Mach. Intell.},\n\tvolume       = 33,\n\tnumber       = 1,\n\tpages        = {117--128}\n}\n@inproceedings{NNSAJ,\n\ttitle        = {Non-convex Robust {PCA}},\n\tauthor       = {Praneeth Netrapalli and Niranjan U. 
N and Sujay Sanghavi and Animashree Anandkumar and Prateek Jain},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13 2014, Montreal, Quebec, Canada},\n\tpages        = {1107--1115},\n\turl          = {http://papers.nips.cc/paper/5430-non-convex-robust-pca},\n\tcrossref     = {DBLP:conf/nips/2014},\n\ttimestamp    = {Wed, 10 Dec 2014 21:34:12 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/NetrapalliNSA014},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@book{Nocedal2006NO,\n\ttitle        = {Numerical Optimization},\n\tauthor       = {J. Nocedal and S. J. Wright},\n\tyear         = 2006,\n\tpublisher    = {Springer},\n\taddress      = {New York},\n\tedition      = {2nd}\n}\n@book{nocedal2006numerical,\n\ttitle        = {Numerical optimization},\n\tauthor       = {Nocedal, Jorge and Wright, Stephen},\n\tyear         = 2006,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{nocedal80lbfgs,\n\ttitle        = {Updating Quasi-Newton Matrices with Limited Storage},\n\tauthor       = {J. 
Nocedal},\n\tyear         = 1980,\n\tjournal      = {Mathematics of Computation},\n\tvolume       = 35,\n\tpages        = {773--782}\n}\n@inproceedings{nogueira2016end,\n\ttitle        = {End-to-End Goal-Driven Web Navigation},\n\tauthor       = {Rodrigo Nogueira and Kyunghyun Cho},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{nogueira2017task,\n\ttitle        = {Task-oriented query reformulation with reinforcement learning},\n\tauthor       = {Rodrigo Nogueira and Kyunghyun Cho},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{nogueira2019passage,\n\ttitle        = {Passage Re-ranking with {BERT}},\n\tauthor       = {Rodrigo Nogueira and Kyunghyun Cho},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.04085}\n}\n@article{noh2016training,\n\ttitle        = {Training recurrent answering units with joint loss minimization for vqa},\n\tauthor       = {Hyeonwoo Noh and Bohyung Han},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.03647}\n}\n@article{noor2008nighttime,\n\ttitle        = {Using remotely sensed night-time light as a proxy for poverty in Africa},\n\tauthor       = {Abdisalan Noor and Victor Alegana and Peter Gething and Andrew Tatem and Robert Snow},\n\tyear         = 2008,\n\tjournal      = {Population Health Metrics},\n\tvolume       = 6\n}\n@article{noren2019safe,\n\ttitle        = {Safe Adaptation in Confined Environments Using Energy Functions},\n\tauthor       = {Noren, Charles and Liu, Changliu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.09095}\n}\n@inproceedings{norouzi2016reward,\n\ttitle        = {Reward augmented maximum likelihood for neural structured prediction},\n\tauthor       = {Mohammad Norouzi and Samy Bengio and Navdeep Jaitly and Mike Schuster and Yonghui Wu and Dale Schuurmans and others},\n\tyear         = 2016,\n\tbooktitle  
  = {Advances In Neural Information Processing Systems},\n\tpages        = {1723--1731}\n}\n@inproceedings{nothman2012event,\n\ttitle        = {Event linking: Grounding event reference in a news archive},\n\tauthor       = {J. Nothman and M. Honnibal and B. Hachey and J. Curran},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{nouiehed2018convergence,\n\ttitle        = {Convergence to Second-Order Stationarity for Constrained Non-Convex Optimization},\n\tauthor       = {Nouiehed, Maher and Lee, Jason D and Razaviyayn, Meisam},\n\tyear         = 2018,\n\tjournal      = {Submitted to SIAM Journal on Optimization}\n}\n@article{nouiehed2019solving,\n\ttitle        = {Solving a class of non-convex min-max games using iterative first order methods},\n\tauthor       = {Nouiehed, Maher and Sanjabi, Maziar and Huang, Tianjian and Lee, Jason D and Razaviyayn, Meisam},\n\tyear         = 2019,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{nouri2012cultural,\n\ttitle        = {A Cultural Decision-making Model for Negotiation based on Inverse Reinforcement Learning},\n\tauthor       = {Nouri, Elnaz and Georgila, Kallirroi and Traum, David},\n\tyear         = 2012,\n\tbooktitle    = {The Annual Meeting of the Cognitive Science Society}\n}\n@article{novikova2016crowd,\n\ttitle        = {Crowd-sourcing {NLG} data: Pictures elicit better data},\n\tauthor       = {Jekaterina Novikova and Oliver Lemon and Verena Rieser},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{novikova2017why,\n\ttitle        = {Why We Need New Evaluation Metrics for {NLG}},\n\tauthor       = {Jekaterina Novikova and Ond\\v{r}ej Du\\v{s}ek and Amanda Cercas Curry and Verena Rieser},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{nowak1999evolutionary,\n\ttitle        = {The evolutionary language game},\n\tauthor       = {Martin 
A Nowak and Joshua B Plotkin and David C Krakauer},\n\tyear         = 1999,\n\tjournal      = {Journal of Theoretical Biology},\n\tvolume       = 200,\n\tnumber       = 2,\n\tpages        = {147--162}\n}\n@article{nowell1976clonal,\n\ttitle        = {The clonal evolution of tumor cell populations},\n\tauthor       = {Peter C Nowell},\n\tyear         = 1976,\n\tjournal      = {Science},\n\tvolume       = 194,\n\tnumber       = 4260,\n\tpages        = {23--28}\n}\n@inproceedings{nowozin2016f,\n\ttitle        = {f-gan: Training generative neural samplers using variational divergence minimization},\n\tauthor       = {Nowozin, Sebastian and Cseke, Botond and Tomioka, Ryota},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {271--279}\n}\n@article{npr2018grading,\n\ttitle        = {More States Opting To 'Robo-Grade' Student Essays By Computer},\n\tauthor       = {NPR},\n\tyear         = 2018,\n\tjournal      = {National Public Radio},\n\turl          = {https://www.npr.org/2018/06/30/624373367/more-states-opting-to-robo-grade-student-essays-by-computer}\n}\n@inproceedings{NSLIFK-gauss-southwell,\n\ttitle        = {Coordinate Descent Converges Faster with the Gauss-Southwell Rule Than Random Selection},\n\tauthor       = {Nutini, Julie and Schmidt, Mark and Laradji, Issam and Friedlander, Michael and Koepke, Hoyt},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},\n\tpages        = {1632--1641}\n}\n@inproceedings{nuhn2013beamdecipher,\n\ttitle        = {Beam Search for Solving Substitution Ciphers},\n\tauthor       = {Malte Nuhn and Julian Schamper and Hermann Ney},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1569--1576}\n}\n@inproceedings{nuhn2014fastem,\n\ttitle        = {{EM} Decipherment for Large Vocabularies},\n\tauthor       = {Malte Nuhn and Hermann 
Ney},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {759--764}\n}\n@inproceedings{nuhn2014homophonics,\n\ttitle        = {Improved Decipherment of Homophonic Ciphers},\n\tauthor       = {Malte Nuhn and Hermann Ney},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{nuske2014modeling,\n\ttitle        = {Modeling and Calibrating Visual Yield Estimates in Vineyards},\n\tauthor       = {Nuske, Stephen and Gupta, Kamal and Narasimhan, Srinivasa and Singh, Sanjiv},\n\tyear         = 2014,\n\tbooktitle    = {Field and Service Robotics},\n\tpages        = {343--356},\n\torganization = {Springer}\n}\n@book{nussbaum2010not,\n\ttitle        = {Not for profit: Why democracy needs the humanities},\n\tauthor       = {Martha Craven Nussbaum},\n\tyear         = 2010,\n\tpublisher    = {Princeton University Press}\n}\n@book{NuY83,\n\ttitle        = {Problem complexity and method efficiency in optimization},\n\tauthor       = {Nemirovskii, Arkadi and Yudin, David Borisovich},\n\tyear         = 1983,\n\tpublisher    = {Wiley}\n}\n@inproceedings{NWS-Kaczmarz-algorithm,\n\ttitle        = {Stochastic Gradient Descent, Weighted Sampling, and the Randomized Kaczmarz algorithm},\n\tauthor       = {Needell, Deanna and Ward, Rachel and Srebro, Nati},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems 27},\n\tpages        = {1017--1025}\n}\n@misc{nyclu2019data,\n\ttitle        = {Stop-and-Frisk Data},\n\tauthor       = {NYCLU},\n\tyear         = 2019,\n\thowpublished = {https://www.nyclu.org/en/stop-and-frisk-data}\n}\n@misc{NYCyellowcabJan2016,\n\ttitle        = {{NYC} {Taxi} and {Limousine} {Commission} ({TLC}) trip record data},\n\tnote         = {Accessed June 11, 2018},\n\thowpublished = {\\url{http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml}}\n}\n@misc{NYT,\n\ttitle        = {UCI Machine Learning 
Repository},\n\tauthor       = {A. Frank and A. Asuncion},\n\tyear         = 2010,\n\tnote         = {http://archive.ics.uci.edu/ml. Irvine, CA: University of California, School of Information and Computer Science}\n}\n@article{nyt2016jigsaw,\n\ttitle        = {The Times is Partnering with Jigsaw to Expand Comment Capabilities},\n\tauthor       = {{The New York Times Company}},\n\tyear         = 2016,\n\tjournal      = {The New York Times},\n\turl          = {https://www.nytco.com/press/the-times-is-partnering-with-jigsaw-to-expand-comment-capabilities/}\n}\n@article{o18,\n\ttitle        = {Learning Compact Neural Networks with Regularization},\n\tauthor       = {Oymak, Samet},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.01223}\n}\n@article{o2012multiphen,\n\ttitle        = {Multi{P}hen: joint model of multiple phenotypes can increase discovery in {GWAS}},\n\tauthor       = {Paul F O'Reilly and Clive J Hoggart and Yotsawat Pomyen and Federico CF Calboli and Paul Elliott and Marjo-Riitta Jarvelin and Lachlan JM Coin},\n\tyear         = 2012,\n\tjournal      = {PloS One},\n\tvolume       = 7,\n\tnumber       = 5\n}\n@book{o2014analysis,\n\ttitle        = {Analysis of boolean functions},\n\tauthor       = {O'Donnell, Ryan},\n\tyear         = 2014,\n\tpublisher    = {Cambridge University Press}\n}\n@book{o2016weapons,\n\ttitle        = {Weapons of math destruction: How big data increases inequality and threatens democracy},\n\tauthor       = {Cathy O'Neil},\n\tyear         = 2016,\n\tpublisher    = {Broadway Books}\n}\n@article{o2020generative,\n\ttitle        = {Generative causal explanations of black-box classifiers},\n\tauthor       = {Matthew O'Shaughnessy and Gregory Canal and Marissa Connor and Mark Davenport and Christopher Rozell},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.13913}\n}\n@inproceedings{oakden2020hidden,\n\ttitle        = {Hidden stratification causes clinically meaningful failures in machine 
learning for medical imaging},\n\tauthor       = {Luke Oakden-Rayner and Jared Dunnmon and Gustavo Carneiro and Christopher R{\\'e}},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the ACM Conference on Health, Inference, and Learning},\n\tpages        = {151--159}\n}\n@article{obermeyer2019dissecting,\n\ttitle        = {Dissecting racial bias in an algorithm used to manage the health of populations},\n\tauthor       = {Ziad Obermeyer and Brian Powers and Christine Vogeli and Sendhil Mullainathan},\n\tyear         = 2019,\n\tjournal      = {Science},\n\tvolume       = 366,\n\tnumber       = 6464,\n\tpages        = {447--453}\n}\n@article{och03systematic,\n\ttitle        = {A Systematic Comparison of Various Statistical Alignment Models},\n\tauthor       = {Franz Josef Och and Hermann Ney},\n\tyear         = 2003,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 29,\n\tpages        = {19--51}\n}\n@inproceedings{och2003minimum,\n\ttitle        = {Minimum error rate training in statistical machine translation},\n\tauthor       = {Franz Josef Och},\n\tyear         = 2003,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {160--167}\n}\n@article{och2004alignment,\n\ttitle        = {The Alignment Template Approach to Statistical Machine Translation},\n\tauthor       = {Franz Joseph Och and Hermann Ney},\n\tyear         = 2004,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 30,\n\tpages        = {417--449}\n}\n@article{oda2015learning,\n\ttitle        = {Learning to Generate Pseudo-Code from Source Code Using Statistical Machine Translation},\n\tauthor       = {Yusuke Oda and Hiroyuki Fudaba and Graham Neubig and Hideaki Hata and Sakriani Sakti and Tomoki Toda and Satoshi Nakamura},\n\tyear         = 2015,\n\tjournal      = {IEEE/ACM International Conference on Automated Software Engineering (ASE)},\n\tvolume       = 30,\n\tpages        = 
{574--584}\n}\n@phdthesis{odonnell11fragment,\n\ttitle        = {Productivity and Reuse in Language},\n\tauthor       = {Timothy J. O'Donnell},\n\tyear         = 2011,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@article{ODonoghue2012,\n\ttitle        = {{Adaptive Restart for Accelerated Gradient Schemes}},\n\tauthor       = {{O'Donoghue}, Brendan and Cand\\`{e}s, Emmanuel},\n\tyear         = 2013,\n\tmonth        = jul,\n\tjournal      = {Foundations of Computational Mathematics},\n\tdoi          = {10.1007/s10208-013-9150-3},\n\tissn         = {1615-3375},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan Zhu/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Unknown - Unknown - No Title.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory}\n}\n@article{OF,\n\ttitle        = {Sparse coding with an overcomplete basis set: A strategy employed by V1?},\n\tauthor       = {Olshausen, Bruno A and Field, David J},\n\tyear         = 1997,\n\tjournal      = {Vision research},\n\tpublisher    = {Elsevier},\n\tvolume       = 37,\n\tnumber       = 23,\n\tpages        = {3311--3325},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{ogorman2016richer,\n\ttitle        = {Richer Event Description: Integrating event coreference with temporal, causal and bridging annotation},\n\tauthor       = {Tim O’Gorman and Kristin Wright-Bettner and Martha Palmer},\n\tyear         = 2016,\n\tbooktitle    = {Computing News Storylines Workshop}\n}\n@article{ogras2006online,\n\ttitle        = {Online summarization of dynamic time series data},\n\tauthor       = {Ogras, Y. 
and Ferhatosmanoglu, Hakan},\n\tyear         = 2006,\n\tmonth        = jan,\n\tjournal      = {The VLDB Journal},\n\tpublisher    = {Springer-Verlag New York, Inc.},\n\taddress      = {Secaucus, NJ, USA},\n\tvolume       = 15,\n\tpages        = {84--98},\n\tdoi          = {http://dx.doi.org/10.1007/s00778-004-0149-x},\n\tissn         = {1066-8888},\n\tacmid        = 1146470,\n\tissue        = 1,\n\tkeywords     = {\n\t\tData streams, Dimensionality reduction, Time-series data, Transformation-based\n\n\t\tsummarization\n\t},\n\tnumpages     = 15\n}\n@inproceedings{oh2000stochastic,\n\ttitle        = {Stochastic language generation for spoken dialogue systems},\n\tauthor       = {Alice H Oh and Alexander I Rudnicky},\n\tyear         = 2000,\n\tbooktitle    = {ANLP/NAACL Workshop on Conversational systems - Volume 3},\n\tpages        = {27--32}\n}\n@inproceedings{oh2015action,\n\ttitle        = {Action-conditional video prediction using deep networks in atari games},\n\tauthor       = {Junhyuk Oh and Xiaoxiao Guo and Honglak Lee and Richard L Lewis and Satinder Singh},\n\tyear         = 2015,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {2863--2871}\n}\n@inproceedings{oh2017value,\n\ttitle        = {Value prediction network},\n\tauthor       = {Oh, Junhyuk and Singh, Satinder and Lee, Honglak},\n\tyear         = 2017,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n\tpages        = {6120--6130}\n}\n@article{oh2018self,\n\ttitle        = {Self-Imitation Learning},\n\tauthor       = {Oh, Junhyuk and Guo, Yijie and Singh, Satinder and Lee, Honglak},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.05635}\n}\n@article{OhadDeep,\n\ttitle        = {A Provably Efficient Algorithm for Training Deep Networks},\n\tauthor       = {Roi Livni and Shai Shalev-Shwartz and Ohad 
Shamir},\n\tyear         = 2013,\n\tjournal      = {ArXiv},\n\tvolume       = {1304.7045},\n\towner        = {rongge},\n\ttimestamp    = {2013.09.26}\n}\n@inproceedings{ohlsson12phase,\n\ttitle        = {{CPRL} -- An Extension of Compressive Sensing to the Phase Retrieval Problem},\n\tauthor       = {Henrik Ohlsson and Allen Yang and Roy Dong and Shankar Sastry},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{ohnishi2019barrier,\n\ttitle        = {Barrier-certified adaptive reinforcement learning with applications to brushbot navigation},\n\tauthor       = {Ohnishi, Motoya and Wang, Li and Notomista, Gennaro and Egerstedt, Magnus},\n\tyear         = 2019,\n\tjournal      = {IEEE Transactions on robotics},\n\tpublisher    = {IEEE},\n\tvolume       = 35,\n\tnumber       = 5,\n\tpages        = {1186--1205}\n}\n@article{oja82,\n\ttitle        = {{Simplified neuron model as a principal component analyzer}},\n\tauthor       = {Oja, Erkki},\n\tyear         = 1982,\n\tmonth        = nov,\n\tday          = 1,\n\tjournal      = {Journal of Mathematical Biology},\n\tbooktitle    = {Journal of Mathematical Biology},\n\tpublisher    = {Springer-Verlag},\n\tvolume       = 15,\n\tnumber       = 3,\n\tpages        = {267--273},\n\tdoi          = {10.1007/bf00275687},\n\tissn         = {0303-6812},\n\turl          = {http://dx.doi.org/10.1007/bf00275687},\n\tabstract     = {{A simple linear neuron model with constrained Hebbian-type synaptic modification is analyzed and a new class of unconstrained learning rules is derived. 
It is shown that the model neuron tends to extract the principal component from a stationary input vector sequence.}},\n\tciteulike-article-id = 1222082,\n\tciteulike-linkout-0 = {http://dx.doi.org/10.1007/bf00275687},\n\tciteulike-linkout-1 = {http://www.springerlink.com/content/u9u6120r003825u1},\n\tciteulike-linkout-2 = {http://link.springer.com/article/10.1007/BF00275687},\n\tkeywords     = {computational-neuroscience, prototype-learning},\n\tposted-at    = {2009-12-02 21:57:21},\n\tpriority     = 2\n}\n@article{oja92,\n\ttitle        = {Principal components, minor components, and linear neural networks},\n\tauthor       = {Oja, Erkki},\n\tyear         = 1992,\n\tjournal      = {Neural Networks},\n\tpublisher    = {Elsevier},\n\tvolume       = 5,\n\tnumber       = 6,\n\tpages        = {927--935}\n}\n@inproceedings{ok2018exploration,\n\ttitle        = {Exploration in structured reinforcement learning},\n\tauthor       = {Ok, Jungseul and Proutiere, Alexandre and Tranos, Damianos},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {8874--8882}\n}\n@techreport{okelly2016apex,\n\ttitle        = {{APEX}: Autonomous Vehicle Plan Verification and Execution},\n\tauthor       = {Matthew O'Kelly and Houssam Abbas and Sicun Gao and Shin'ichi Shiraishi and Shinpei Kato and Rahul Mangharam},\n\tyear         = 2016,\n\tinstitution  = {University of Pennsylvania}\n}\n@techreport{okelly2017computer,\n\ttitle        = {Computer-Aided Design for Safe Autonomous Vehicles},\n\tauthor       = {Matthew O'Kelly and Houssam Abbas and Rahul Mangharam},\n\tyear         = 2017,\n\tinstitution  = {University of Pennsylvania}\n}\n@article{oliveira10concentration,\n\ttitle        = {{Concentration of the adjacency matrix and of the Laplacian in random graphs with independent edges}},\n\tauthor       = {{Imbuzeiro Oliveira}, R.},\n\tyear         = 2009,\n\tmonth        = nov,\n\tjournal      = {ArXiv e-prints},\n\tarchiveprefix = 
{arXiv},\n\teprint       = {0911.0600},\n\tprimaryclass = {math.CO},\n\tkeywords     = {Mathematics - Combinatorics, Mathematics - Probability, 05C80, 60B20},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2009arXiv0911.0600I},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@inproceedings{oliver2018realistic,\n\ttitle        = {Realistic evaluation of deep semi-supervised learning algorithms},\n\tauthor       = {Avital Oliver and Augustus Odena and Colin A Raffel and Ekin Dogus Cubuk and Ian Goodfellow},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3235--3246}\n}\n@article{olney2012question,\n\ttitle        = {Question Generation from Concept Maps},\n\tauthor       = {Andrew M. Olney and Arthur C. Graesser and Natalie K. Person},\n\tyear         = 2012,\n\tjournal      = {Dialogue and Discourse},\n\tvolume       = 3,\n\tpages        = {75--99}\n}\n@article{olshausen2004sparse,\n\ttitle        = {Sparse coding of sensory inputs},\n\tauthor       = {Olshausen, Bruno A and Field, David J},\n\tyear         = 2004,\n\tjournal      = {Current opinion in neurobiology},\n\tpublisher    = {Elsevier},\n\tvolume       = 14,\n\tnumber       = 4,\n\tpages        = {481--487}\n}\n@article{olsson2018skill,\n\ttitle        = {Skill Rating for Generative Models},\n\tauthor       = {Catherine Olsson and Surya Bhupatiraju and Tom Brown and Augustus Odena and Ian Goodfellow},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.04888}\n}\n@inproceedings{omari2016lossless,\n\ttitle        = {Lossless Separation of Web Pages into Layout Code and Data},\n\tauthor       = {Adi Omari and Benny Kimelfeld and Eran Yahav and Sharon Shoham},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@inproceedings{onishi2016wdw,\n\ttitle        = {Who did what: A large-scale person-centered cloze 
dataset},\n\tauthor       = {Takeshi Onishi and Hai Wang and Mohit Bansal and Kevin Gimpel and David McAllester},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{oo18,\n\ttitle        = {Non-asymptotic Identification of {LTI} Systems from a Single Trajectory},\n\tauthor       = {Oymak, Samet and Ozay, Necmiye},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.05722}\n}\n@article{oord2016pixel,\n\ttitle        = {Pixel recurrent neural networks},\n\tauthor       = {Aaron van den Oord and Nal Kalchbrenner and Koray Kavukcuoglu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1601.06759}\n}\n@article{oord2016wavenet,\n\ttitle        = {WaveNet: A generative model for raw audio},\n\tauthor       = {Aaron van den Oord and Sander Dieleman and Heiga Zen and Karen Simonyan and Oriol Vinyals and Alex Graves and Nal Kalchbrenner and Andrew Senior and Koray Kavukcuoglu},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.03499}\n}\n@article{oord2018representation,\n\ttitle        = {Representation learning with contrastive predictive coding},\n\tauthor       = {Oord, Aaron van den and Li, Yazhe and Vinyals, Oriol},\n\tyear         = 2018,\n\tjournal      = {arXiv:1807.03748}\n}\n@misc{openvino2018cvat,\n\ttitle        = {Computer Vision Annotation Tool},\n\tauthor       = {OpenVinoToolKit},\n\tyear         = 2018,\n\thowpublished = {\\url{https://github.com/openvinotoolkit/cvat}}\n}\n@article{opper1995statistical,\n\ttitle        = {Statistical mechanics of learning: Generalization},\n\tauthor       = {Manfred Opper},\n\tyear         = 1995,\n\tjournal      = {The Handbook of Brain Theory and Neural Networks,},\n\tpages        = {922--925}\n}\n@inproceedings{ops17,\n\ttitle        = {The Statistical Recurrent Unit},\n\tauthor       = {Oliva, Junier B and P{\\'o}czos, Barnab{\\'a}s and Schneider, Jeff},\n\tyear         = 2017,\n\tbooktitle    = 
{International Conference on Machine Learning (ICML)},\n\tpages        = {2671--2680}\n}\n@inproceedings{orabona14simultaneous,\n\ttitle        = {Simultaneous Model Selection and Optimization through Parameter-free Stochastic Learning},\n\tauthor       = {Francesco Orabona},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{Orabona2012prisma,\n\ttitle        = {Prisma: Proximal iterative smoothing algorithm},\n\tauthor       = {Orabona, Francesco and Argyriou, Andreas and Srebro, Nathan},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.2372}\n}\n@article{orabona2015generalized,\n\ttitle        = {A generalized online mirror descent with applications to classification and regression},\n\tauthor       = {Francesco Orabona and Koby Crammer and Nicolo Cesa-Bianchi},\n\tyear         = 2015,\n\tjournal      = {Machine Learning},\n\tvolume       = 99,\n\tnumber       = 3,\n\tpages        = {411--435}\n}\n@phdthesis{Orecchia11,\n\ttitle        = {Fast Approximation Algorithms for Graph Partitioning using Spectral and Semidefinite-Programming Techniques},\n\tauthor       = {Orecchia, Lorenzo},\n\tyear         = 2011,\n\tmonth        = may,\n\tnumber       = {UCB/EECS-2011-56},\n\tschool       = {EECS Department, University of California, Berkeley}\n}\n@inproceedings{oren2019drolm,\n\ttitle        = {Distributionally Robust Language Modeling},\n\tauthor       = {Yonatan Oren and Shiori Sagawa and Tatsunori Hashimoto and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{ortner2007logarithmic,\n\ttitle        = {Logarithmic online regret bounds for undiscounted reinforcement learning},\n\tauthor       = {Auer, Peter and Ortner, Ronald},\n\tyear         = 2007,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tvolume       = 19,\n\tpages        = 49\n}\n@inproceedings{ortner2014selecting,\n\ttitle   
     = {Selecting near-optimal approximate state representations in reinforcement learning},\n\tauthor       = {Ortner, Ronald and Maillard, Odalric-Ambrym and Ryabko, Daniil},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\torganization = {Springer}\n}\n@inproceedings{osband2013more,\n\ttitle        = {(More) efficient reinforcement learning via posterior sampling},\n\tauthor       = {Osband, Ian and Russo, Daniel and Van Roy, Benjamin},\n\tyear         = 2013,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3003--3011}\n}\n@article{osband2014generalization,\n\ttitle        = {Generalization and Exploration via Randomized Value Functions},\n\tauthor       = {Osband, Ian and Van Roy, Benjamin and Wen, Zheng},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1402.0635}\n}\n@inproceedings{osband2014model,\n\ttitle        = {Model-based reinforcement learning and the eluder dimension},\n\tauthor       = {Osband, Ian and Roy, Benjamin Van},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 27th International Conference on Neural Information Processing Systems-Volume 1},\n\tpages        = {1466--1474}\n}\n@inproceedings{osband2014near,\n\ttitle        = {Near-optimal reinforcement learning in factored mdps},\n\tauthor       = {Osband, Ian and Van Roy, Benjamin},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {604--612}\n}\n@inproceedings{osband2016deep,\n\ttitle        = {Deep exploration via bootstrapped {DQN}},\n\tauthor       = {Ian Osband and Charles Blundell and Alexander Pritzel and Benjamin Van Roy},\n\tyear         = 2016,\n\tbooktitle    = {Advances In Neural Information Processing Systems},\n\tpages        = {4026--4034}\n}\n@article{osband2016on,\n\ttitle        = {On Lower Bounds for Regret in 
Reinforcement Learning},\n\tauthor       = {Ian Osband and Benjamin Van Roy},\n\tyear         = 2016,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1608.02732}\n}\n@article{osband2016posterior,\n\ttitle        = {Why is posterior sampling better than optimism for reinforcement learning},\n\tauthor       = {I. Osband and B. Van Roy},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1607.00215}\n}\n@article{osband2017deep,\n\ttitle        = {Deep exploration via randomized value functions},\n\tauthor       = {Osband, Ian and Russo, Daniel and Wen, Zheng and Van Roy, Benjamin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.07608}\n}\n@inproceedings{osband2017posterior,\n\ttitle        = {Why is posterior sampling better than optimism for reinforcement learning?},\n\tauthor       = {Osband, Ian and Van Roy, Benjamin},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 34th International Conference on Machine Learning-Volume 70},\n\torganization = {JMLR. org}\n}\n@article{osgood2018mapping,\n\ttitle        = {Mapping child growth failure in Africa between 2000 and 2015},\n\tauthor       = {Aaron Osgood-Zimmerman and Anoushka I. Millear and Rebecca W. Stubbs and Chloe Shields and Brandon V. Pickering and Lucas Earl and Nicholas Graetz and Damaris K. Kinyoki and Sarah E. Ray and Samir Bhatt and Annie J. Browne and Roy Burstein and Ewan Cameron and Daniel C. Casey and Aniruddha Deshpande and Nancy Fullman and Peter W. Gething and Harry S. Gibson and Nathaniel J. Henry and Mario Herrero and L. Kendall Krause and Ian D. Letourneau and Aubrey J. Levine and Patrick Y. Liu and Joshua Longbottom and Benjamin K. Mayala and Jonathan F. Mosser and Abdisalan M. Noor and David M. Pigott and Ellen G. Piwoz and Puja Rao and Rahul Rawat and Robert C. Reiner and David L. Smith and Daniel J. Weiss and Kirsten E. Wiens and Ali H. Mokdad and Stephen S. Lim and Christopher J. L. Murray and Nicholas J. Kassebaum and Simon I. 
Hay},\n\tyear         = 2018,\n\tjournal      = {Nature},\n\tvolume       = 555\n}\n@article{ostrovski2017count,\n\ttitle        = {Count-based exploration with neural density models},\n\tauthor       = {Ostrovski, Georg and Bellemare, Marc G and Oord, Aaron van den and Munos, R{\\'e}mi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.01310},\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2721--2730}\n}\n@inproceedings{OSV12,\n\ttitle        = {Approximating the exponential, the lanczos method and an $\\tilde{O}(m)$-time spectral algorithm for balanced separator},\n\tauthor       = {Orecchia, Lorenzo and Sachdeva, Sushant and Vishnoi, Nisheeth K.},\n\tyear         = 2012,\n\tmonth        = nov,\n\tbooktitle    = {STOC '12},\n\tpublisher    = {ACM Press}\n}\n@inproceedings{OSVV2008,\n\ttitle        = {On partitioning graphs via single commodity flows},\n\tauthor       = {Orecchia, Lorenzo and Schulman, Leonard J. and Vazirani, Umesh V. and Vishnoi, Nisheeth K.},\n\tyear         = 2008,\n\tbooktitle    = {STOC 08},\n\taddress      = {New York, New York, USA}\n}\n@inproceedings{ott2018analyzing,\n\ttitle        = {Analyzing Uncertainty in Neural Machine Translation},\n\tauthor       = {Myle Ott and Michael Auli and David Grangier and Marc'Aurelio Ranzato},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{ouyang2017learning,\n\ttitle        = {Learning unknown markov decision processes: A thompson sampling approach},\n\tauthor       = {Ouyang, Yi and Gagrani, Mukul and Nayyar, Ashutosh and Jain, Rahul},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1333--1342}\n}\n@inproceedings{ovadia2019can,\n\ttitle        = {Can you trust your model's uncertainty? 
Evaluating predictive uncertainty under dataset shift},\n\tauthor       = {Ovadia, Yaniv and Fertig, Emily and Ren, Jie and Nado, Zachary and Sculley, David and Nowozin, Sebastian and Dillon, Joshua and Lakshminarayanan, Balaji and Snoek, Jasper},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {13991--14002}\n}\n@inproceedings{ovadia2019uncertainty,\n\ttitle        = {Can You Trust Your Model's Uncertainty? Evaluating Predictive Uncertainty Under Dataset Shift},\n\tauthor       = {Yaniv Ovadia and Emily Fertig and Jie Ren and Zachary Nado and D Sculley and Sebastian Nowozin and Joshua V. Dillon and Balaji Lakshminarayanan and Jasper Snoek},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{OvercompleteLVMs2014,\n\ttitle        = {{Sample Complexity Analysis for Learning Overcomplete Latent Variable Models through Tensor Methods}},\n\tauthor       = {Anima Anandkumar and Rong Ge and Majid Janzamin},\n\tyear         = 2014,\n\tmonth        = aug,\n\tjournal      = {arXiv preprint arXiv:1408.0553}\n}\n@book{owen2013monte,\n\ttitle        = {Monte Carlo theory, methods and examples},\n\tauthor       = {Art B. 
Owen},\n\tyear         = 2013,\n\tpublisher    = {}\n}\n@inproceedings{oymak2019overparameterized,\n\ttitle        = {Overparameterized nonlinear learning: Gradient descent takes the shortest path?},\n\tauthor       = {Samet Oymak and Mahdi Soltanolkotabi},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {4951--4960}\n}\n@article{oymak2020towards,\n\ttitle        = {Towards moderate overparameterization: global convergence guarantees for training shallow neural networks},\n\tauthor       = {Oymak, Samet and Soltanolkotabi, Mahdi},\n\tyear         = 2020,\n\tjournal      = {IEEE Journal on Selected Areas in Information Theory},\n\tpublisher    = {IEEE}\n}\n@inproceedings{ozay2010gpca,\n\ttitle        = {{GPCA} with denoising: a moments-based convex approach},\n\tauthor       = {Necmiye Ozay and Mario Sznaier and Constantino M. Lagoa and Octavia I. Camps},\n\tyear         = 2010,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3209--3216}\n}\n@inproceedings{p2017latent,\n\ttitle        = {Latent Space Embedding for Retrieval in Question-Answer Archives},\n\tauthor       = {Deepak P and Dinesh Garg and Shirish Shevade},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {855--865}\n}\n@article{p63,\n\ttitle        = {Gradient methods for minimizing functionals},\n\tauthor       = {Polyak, Boris Teodorovich},\n\tyear         = 1963,\n\tjournal      = {Zhurnal Vychislitel'noi Matematiki i Matematicheskoi Fiziki},\n\tpublisher    = {Russian Academy of Sciences, Branch of Mathematical Sciences},\n\tvolume       = 3,\n\tnumber       = 4,\n\tpages        = {643--653}\n}\n@article{p99,\n\ttitle        = {Approximation theory of the MLP model in neural networks},\n\tauthor       = {Pinkus, Allan},\n\tyear         = 1999,\n\tjournal      = {Acta Numerica},\n\tpublisher    = {Cambridge University 
Press},\n\tvolume       = 8,\n\tpages        = {143--195}\n}\n@inproceedings{PAC,\n\ttitle        = {A theory of the learnable},\n\tauthor       = {Valiant, L. G.},\n\tyear         = 1984,\n\tbooktitle    = {Proceedings of the sixteenth annual ACM symposium on Theory of computing},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {STOC '84},\n\tpages        = {436--445},\n\tisbn         = {0-89791-133-4},\n\tnumpages     = 10\n}\n@article{pacchiano2020optimism,\n\ttitle        = {On Optimism in Model-Based Reinforcement Learning},\n\tauthor       = {Pacchiano, Aldo and Ball, Philip and Parker-Holder, Jack and Choromanski, Krzysztof and Roberts, Stephen},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.11911}\n}\n@phdthesis{pachinko,\n\ttitle        = {Pachinko allocation: dag-structured mixture models of topic correlations},\n\tauthor       = {Li, Wei},\n\tyear         = 2007,\n\tpublisher    = {University of Massachusetts Amherst},\n\tisbn         = {978-0-549-33023-3},\n\tnote         = {AAI3289214},\n\tadvisor      = {Mccallum, Andrew}\n}\n@book{pachter2005algebraic,\n\ttitle        = {Algebraic statistics for computational biology},\n\tauthor       = {Pachter, L. 
and Sturmfels, B.},\n\tyear         = 2005,\n\tpublisher    = {Cambridge University Press},\n\tvolume       = 13\n}\n@inproceedings{paek2007toward,\n\ttitle        = {Toward evaluation that leads to best practices: reconciling dialog evaluation in research and industry},\n\tauthor       = {Tim Paek},\n\tyear         = 2007,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{PageRank-BrinPage98,\n\ttitle        = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},\n\tauthor       = {Sergey Brin and Lawrence Page},\n\tyear         = 1998,\n\tjournal      = {Computer Networks},\n\tvolume       = 30,\n\tnumber       = {1-7},\n\tpages        = {107--117},\n\tee           = {http://dx.doi.org/10.1016/S0169-7552(98)00110-X},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{pagliardini2017unsupervised,\n\ttitle        = {Unsupervised learning of sentence embeddings using compositional n-gram features},\n\tauthor       = {Matteo Pagliardini and Prakhar Gupta and Martin Jaggi},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{pagnoncelli2009sample,\n\ttitle        = {Sample average approximation method for chance constrained programming: theory and applications},\n\tauthor       = {Pagnoncelli, BK and Ahmed, Shabbir and Shapiro, A},\n\tyear         = 2009,\n\tjournal      = {Journal of optimization theory and applications},\n\tpublisher    = {Springer},\n\tvolume       = 142,\n\tnumber       = 2,\n\tpages        = {399--416}\n}\n@inproceedings{paice1990lancaster,\n\ttitle        = {Another Stemmer},\n\tauthor       = {Chris D. 
Paice},\n\tyear         = 1990,\n\tbooktitle    = {ACM SIGIR Forum}\n}\n@inproceedings{paisley2012variational,\n\ttitle        = {Variational {B}ayesian inference with stochastic search},\n\tauthor       = {John Paisley and David M Blei and Michael I Jordan},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1363--1370}\n}\n@inproceedings{paiva05control,\n\ttitle        = {Empirically-based Control of Natural Language Generation},\n\tauthor       = {Daniel S. Paiva and Roger Evans},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {58--65}\n}\n@article{pajor1998metric,\n\ttitle        = {Metric entropy of the Grassmann manifold},\n\tauthor       = {Pajor, Alain},\n\tyear         = 1998,\n\tjournal      = {Convex Geometric Analysis},\n\tpublisher    = {Cambridge University Press},\n\tvolume       = 34,\n\tpages        = {181--188}\n}\n@inproceedings{pal06mcl,\n\ttitle        = {Multi-Conditional Learning: Generative/Discriminative Training for Clustering and Classification},\n\tauthor       = {Andrew McCallum and Chris Pal and Greg Druck and Xuerui Wang},\n\tyear         = 2006,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{pal2006sparse,\n\ttitle        = {Sparse forward-backward using minimum divergence beams for fast training of conditional random fields},\n\tauthor       = {Chris Pal and Charles Sutton and Andrew McCallum},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tvolume       = 5\n}\n@inproceedings{palatucci2009zero,\n\ttitle        = {Zero-shot learning with semantic output codes},\n\tauthor       = {Mark Palatucci and Dean Pomerleau and Geoffrey E Hinton and Tom M Mitchell},\n\tyear         = 2009,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = 
{1410--1418}\n}\n@book{palis2012geometric,\n\ttitle        = {Geometric {T}heory of {D}ynamical {S}ystems: {A}n {I}ntroduction},\n\tauthor       = {Palis, J Jr and De Melo, Welington},\n\tyear         = 2012,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{pan2010cross,\n\ttitle        = {Cross-domain sentiment classification via spectral feature alignment},\n\tauthor       = {Sinno Jialin Pan and Xiaochuan Ni and Jian-Tao Sun and Qiang Yang and Zheng Chen},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the 19th international conference on World wide web},\n\tpages        = {751--760}\n}\n@article{pan2010survey,\n\ttitle        = {A survey on transfer learning},\n\tauthor       = {Pan, Sinno Jialin and Yang, Qiang},\n\tyear         = 2010,\n\tjournal      = {Knowledge and Data Engineering, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 22,\n\tnumber       = 10,\n\tpages        = {1345--1359}\n}\n@article{pan2019improving,\n\ttitle        = {Improving Question Answering with External Knowledge},\n\tauthor       = {Xiaoman Pan and Kai Sun and Dian Yu and Heng Ji and Dong Yu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.00993}\n}\n@article{panageas2016gradient,\n\ttitle        = {Gradient descent only converges to minimizers: Non-isolated critical points and invariant regions},\n\tauthor       = {Panageas, Ioannis and Piliouras, Georgios},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.00405}\n}\n@inproceedings{panait2005cooperative,\n\ttitle        = {Cooperative Multi-Agent Learning: The State of the Art},\n\tauthor       = {Liviu Panait and Sean Luke},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Autonomous Agents and Multiagent Systems (AAMAS)},\n\tpages        = {387--434}\n}\n@inproceedings{panayotov2015librispeech,\n\ttitle        = {Librispeech: an asr corpus based on public domain audio books},\n\tauthor       = {Vassil Panayotov 
and Guoguo Chen and Daniel Povey and Sanjeev Khudanpur},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},\n\tpages        = {5206--5210}\n}\n@article{pang2017robust,\n\ttitle        = {Robust Deep Learning via Reverse Cross-Entropy Training and Thresholding Test},\n\tauthor       = {Tianyu Pang and Chao Du and Jun Zhu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.00633}\n}\n@article{paninski2003entropy,\n\ttitle        = {Estimation of Entropy and Mutual Information},\n\tauthor       = {Liam Paninski},\n\tyear         = 2003,\n\tjournal      = {Neural Computation},\n\tvolume       = 15,\n\tpages        = {1191--1253}\n}\n@inproceedings{papachristodoulou2002construction,\n\ttitle        = {On the construction of Lyapunov functions using the sum of squares decomposition},\n\tauthor       = {Antonis Papachristodoulou and Stephen Prajna},\n\tyear         = 2002,\n\tbooktitle    = {IEEE Conference on Decision and Control}\n}\n@article{papachristodoulou2005analysis,\n\ttitle        = {Analysis of non-polynomial systems using the sum of squares decomposition},\n\tauthor       = {Antonis Papachristodoulou and Stephen Prajna},\n\tyear         = 2005,\n\tjournal      = {Positive polynomials in control}\n}\n@article{papadimitriou1987complexity,\n\ttitle        = {The complexity of {M}arkov decision processes},\n\tauthor       = {Papadimitriou, Christos H and Tsitsiklis, John N},\n\tyear         = 1987,\n\tjournal      = {Mathematics of operations research},\n\tpublisher    = {INFORMS},\n\tvolume       = 12,\n\tnumber       = 3,\n\tpages        = {441--450}\n}\n@inproceedings{papadimitriou1998latent,\n\ttitle        = {Latent semantic indexing: A probabilistic analysis},\n\tauthor       = {Papadimitriou, Christos H and Tamaki, Hisao and Raghavan, Prabhakar and Vempala, Santosh},\n\tyear         = 1998,\n\tbooktitle    = {Proceedings of the seventeenth ACM SIGACT-SIGMOD-SIGART 
symposium on Principles of database systems},\n\tpages        = {159--168},\n\torganization = {ACM}\n}\n@inproceedings{papadimitriou2003adaptive,\n\ttitle        = {Adaptive, hands-off stream mining},\n\tauthor       = {Papadimitriou, Spiros and Brockwell, Anthony and Faloutsos, Christos},\n\tyear         = 2003,\n\tbooktitle    = {\n\t\tProceedings of the 29th international conference on Very large data\n\n\t\tbases - Volume 29\n\t},\n\tlocation     = {Berlin, Germany},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '2003},\n\tpages        = {560--571},\n\tisbn         = {0-12-722442-4},\n\tacmid        = 1315500,\n\tnumpages     = 12\n}\n@inbook{papadimitriou2003computational,\n\ttitle        = {Computational Complexity},\n\tauthor       = {Papadimitriou, Christos H.},\n\tyear         = 2003,\n\tbooktitle    = {Encyclopedia of Computer Science},\n\tpublisher    = {John Wiley and Sons Ltd.},\n\taddress      = {GBR},\n\tpages        = {260–265},\n\tisbn         = {0470864125},\n\tnumpages     = 6\n}\n@article{papadimitriou2005streaming,\n\ttitle        = {Streaming pattern discovery in multiple time-series},\n\tauthor       = {Papadimitriou, Spiros and Sun, Jimeng and Faloutsos, Christos},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the 31st international conference on Very large data\n\n\t\tbases\n\t},\n\tlocation     = {Trondheim, Norway},\n\tpublisher    = {VLDB Endowment},\n\tseries       = {VLDB '05},\n\tpages        = {697--708},\n\tisbn         = {1-59593-154-6},\n\tacmid        = 1083674,\n\tnumpages     = 12\n}\n@inproceedings{papadimitriou2006optimal,\n\ttitle        = {Optimal multi-scale patterns in time series streams},\n\tauthor       = {Papadimitriou, Spiros and Yu, Philip},\n\tyear         = 2006,\n\tbooktitle    = {\n\t\tProceedings of the 2006 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Chicago, IL, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, 
USA},\n\tseries       = {SIGMOD '06},\n\tpages        = {647--658},\n\tdoi          = {http://doi.acm.org/10.1145/1142473.1142545},\n\tisbn         = {1-59593-434-0},\n\tacmid        = 1142545,\n\tkeywords     = {\n\t\tSVD, empirical orthogonal functions, local patterns, multi-scale,\n\n\t\tsingular spectrum, stream\n\t},\n\tnumpages     = 12\n}\n@article{Papadimitriou98latentsemantic,\n\ttitle        = {Latent Semantic Indexing: A Probabilistic Analysis},\n\tauthor       = {C. H. Papadimitriou and P. Raghavan and H. Tamaki and S. Vempala},\n\tyear         = 2000,\n\tjournal      = {J. Comput. Syst. Sci.},\n\tvolume       = 61,\n\tnumber       = 2\n}\n@article{papaspiliopoulos08retro,\n\ttitle        = {Retrospective {MCMC} for {D}irichlet process hierarchical models},\n\tauthor       = {Omiros Papaspiliopoulos and Gareth O. Roberts},\n\tyear         = 2008,\n\tjournal      = {Biometrika},\n\tvolume       = 95,\n\tpages        = {169--186}\n}\n@inproceedings{paperno2016lambada,\n\ttitle        = {The {LAMBADA} dataset: Word prediction requiring a broad discourse context},\n\tauthor       = {Denis Paperno and German Kruszewski and Angeliki Lazaridou and Quan Ngoc Pham and Raffaella Bernardi and Sandro Pezzelle and Marco Baroni and Gemma Boleda and Raquel Fernandez},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{papernot2016distillation,\n\ttitle        = {Distillation as a defense to adversarial perturbations against deep neural networks},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Xi Wu and Somesh Jha and Ananthram Swami},\n\tyear         = 2016,\n\tbooktitle    = {IEEE Symposium on Security and Privacy},\n\tpages        = {582--597}\n}\n@inproceedings{papernot2016limitations,\n\ttitle        = {The limitations of deep learning in adversarial settings},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Somesh Jha and Matt Fredrikson and Z Berkay Celik and Ananthram 
Swami},\n\tyear         = 2016,\n\tbooktitle    = {Security and Privacy (EuroS\\&P), 2016 IEEE European Symposium on},\n\tpages        = {372--387}\n}\n@article{papernot2016towards,\n\ttitle        = {Towards the Science of Security and Privacy in Machine Learning},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Arunesh Sinha and Michael Wellman},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{papernot2016transferability,\n\ttitle        = {Transferability in machine learning: from phenomena to black-box attacks using adversarial samples},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Ian Goodfellow},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{papernot2017blackbox,\n\ttitle        = {Practical Black-Box Attacks against Deep Learning Systems using Adversarial Examples},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Ian Goodfellow and Somesh Jha and Z.Berkay Celik and Ananthram Swami},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the ACM Asia Conference on Computer and Communications Security}\n}\n@article{papernot2017extending,\n\ttitle        = {Extending Defensive Distillation},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.05264}\n}\n@inproceedings{papernot2017practical,\n\ttitle        = {Practical black-box attacks against machine learning},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel and Ian Goodfellow and Somesh Jha and Berkay Z. 
Celik and Ananthram Swami},\n\tyear         = 2017,\n\tbooktitle    = {Asia Conference on Computer and Communications Security},\n\tpages        = {506--519}\n}\n@article{papernot2018cleverhans,\n\ttitle        = {Technical Report on the CleverHans v2.1.0 Adversarial Examples Library},\n\tauthor       = {Nicolas Papernot and Fartash Faghri and Nicholas Carlini and Ian Goodfellow and Reuben Feinman and Alexey Kurakin and Cihang Xie and Yash Sharma and Tom Brown and Aurko Roy and Alexander Matyasko and Vahid Behzadan and Karen Hambardzumyan and Zhishuai Zhang and Yi-Lin Juang and Zhi Li and Ryan Sheatsley and Abhibhav Garg and Jonathan Uesato and Willi Gierke and Yinpeng Dong and David Berthelot and Paul Hendricks and Jonas Rauber and Rujun Long},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1610.00768}\n}\n@article{papernot2018deep,\n\ttitle        = {Deep k-nearest neighbors: Towards confident, interpretable and robust deep learning},\n\tauthor       = {Nicolas Papernot and Patrick McDaniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.04765}\n}\n@inproceedings{papineni02bleu,\n\ttitle        = {{BLEU}: A Method for Automatic Evaluation of Machine Translation},\n\tauthor       = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-Jing Zhu},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pappu2013predicting,\n\ttitle        = {Predicting tasks in goal-oriented spoken dialog systems using semantic knowledge bases},\n\tauthor       = {Aasish Pappu and Alexander Rudnicky},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the SIGDIAL 2013 Conference},\n\tpages        = {242--250}\n}\n@article{parhi2019minnorm,\n\ttitle        = {Minimum \"Norm\" Neural Networks are Splines},\n\tauthor       = {R. Parhi and R. D. 
Nowak},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{parikh12spectral,\n\ttitle        = {A Spectral Algorithm for Latent Junction Trees},\n\tauthor       = {A. Parikh and L. Song and M. Ishteva and G. Teodoru and E. Xing},\n\tyear         = 2012,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@article{parikh2014proximal,\n\ttitle        = {Proximal Algorithms},\n\tauthor       = {Neal Parikh and Stephen Boyd},\n\tyear         = 2014,\n\tjournal      = {Foundations and Trends in Optimization},\n\tvolume       = 1,\n\tnumber       = 3,\n\tpages        = {123--231}\n}\n@inproceedings{parikh2016decomposable,\n\ttitle        = {A Decomposable Attention Model for Natural Language Inference},\n\tauthor       = {Ankur Parikh and Oscar T\\\"{a}ckstr\\\"{o}m and Dipanjan Das and Jakob Uszkoreit},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{parisotto2015actor,\n\ttitle        = {Actor-mimic: Deep multitask and transfer reinforcement learning},\n\tauthor       = {Emilio Parisotto and Jimmy Lei Ba and Ruslan Salakhutdinov},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.06342}\n}\n@inproceedings{parisotto2017sql,\n\ttitle        = {Neuro-symbolic Program Synthesis},\n\tauthor       = {Emilio Parisotto and Abdel-rahman Mohamed and Rishabh Singh and Lihong Li and Dengyong Zhou and Pushmeet Kohli},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{park1991universal,\n\ttitle        = {Universal approximation using radial-basis-function networks},\n\tauthor       = {Park, Jooyoung and Sandberg, Irwin W},\n\tyear         = 1991,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 3,\n\tnumber       = 2,\n\tpages        = {246--257}\n}\n@inproceedings{park2006capturing,\n\ttitle        = {Capturing and animating skin deformation in human 
motion},\n\tauthor       = {Park, Sang Il and Hodgins, Jessica K.},\n\tyear         = 2006,\n\tbooktitle    = {ACM SIGGRAPH 2006 Papers},\n\tlocation     = {Boston, Massachusetts},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGGRAPH '06},\n\tpages        = {881--889},\n\tdoi          = {http://doi.acm.org/10.1145/1179352.1141970},\n\tisbn         = {1-59593-364-6},\n\tacmid        = 1141970,\n\tkeywords     = {human animation, motion capture, skin deformation},\n\tnumpages     = 9\n}\n@inproceedings{park2017non,\n\ttitle        = {Non-square matrix sensing without spurious local minima via the {B}urer-{M}onteiro approach},\n\tauthor       = {Park, Dohyung and Kyrillidis, Anastasios and Carmanis, Constantine and Sanghavi, Sujay},\n\tyear         = 2017,\n\tbooktitle    = {Artificial Intelligence and Statistics},\n\tpages        = {65--74}\n}\n@inproceedings{park2017resilient,\n\ttitle        = {Resilient linear classification: an approach to deal with attacks on training data},\n\tauthor       = {Sangdon Park and James Weimer and Insup Lee},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Cyber-Physical Systems},\n\tpages        = {155--164}\n}\n@inproceedings{park2018reducing,\n\ttitle        = {Reducing Gender Bias in Abusive Language Detection},\n\tauthor       = {Ji Ho Park and Jamin Shin and Pascale Fung},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2799--2804}\n}\n@inproceedings{park2019ai,\n\ttitle        = {{AI}-based request augmentation to increase crowdsourcing participation},\n\tauthor       = {Junwon Park and Ranjay Krishna and Pranav Khadpe and Li Fei-Fei and Michael Bernstein},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 7,\n\tpages        = {115--124}\n}\n@article{park2020provable,\n\ttitle        = {Provable Memorization via 
Deep Neural Networks using Sub-linear Parameters},\n\tauthor       = {Park, Sejun and Lee, Jaeho and Yun, Chulhee and Shin, Jinwoo},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.13363}\n}\n@article{park2020toward,\n\ttitle        = {Toward Active Robot-Assisted Feeding with a General-Purpose Mobile Manipulator: Design, Evaluation, and Lessons Learned},\n\tauthor       = {Daehyung Park and Yuuna Hoshi and Harshal P. Mahajan and W. Rogers and Charles C. Kemp},\n\tyear         = 2020,\n\tjournal      = {Robotics and Autonomous Systems},\n\tvolume       = 124\n}\n@inproceedings{parkash2012attributes,\n\ttitle        = {Attributes for classifier feedback},\n\tauthor       = {Amar Parkash and Devi Parikh},\n\tyear         = 2012,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {354--368}\n}\n@inproceedings{parkhi2012cats,\n\ttitle        = {Cats and dogs},\n\tauthor       = {Parkhi, Omkar M and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, CV},\n\tyear         = 2012,\n\tbooktitle    = {2012 IEEE conference on computer vision and pattern recognition},\n\tpages        = {3498--3505}\n}\n@inproceedings{parr1998reinforcement,\n\ttitle        = {Reinforcement learning with hierarchies of machines},\n\tauthor       = {R. Parr and S. J. 
Russell},\n\tyear         = 1998,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1043--1049}\n}\n@inproceedings{parr2007analyzing,\n\ttitle        = {Analyzing feature generation for value-function approximation},\n\tauthor       = {Parr, Ronald and Painter-Wakefield, Christopher and Li, Lihong and Littman, Michael},\n\tyear         = 2007,\n\tbooktitle    = {Proceedings of the 24th international conference on Machine learning},\n\tpages        = {737--744},\n\torganization = {ACM}\n}\n@inproceedings{parr2008analysis,\n\ttitle        = {An analysis of linear models, linear value-function approximation, and feature selection for reinforcement learning},\n\tauthor       = {Parr, Ronald and Li, Lihong and Taylor, Gavin and Painter-Wakefield, Christopher and Littman, Michael L},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 25th international conference on Machine learning},\n\tpages        = {752--759},\n\torganization = {ACM}\n}\n@article{parrilo2003minimizing,\n\ttitle        = {Minimizing polynomial functions},\n\tauthor       = {Pablo A Parrilo and Bernd Sturmfels},\n\tyear         = 2003,\n\tjournal      = {Algorithmic and quantitative real algebraic geometry, DIMACS Series in Discrete Mathematics and Theoretical Computer Science},\n\tvolume       = 60,\n\tpages        = {83--99}\n}\n@article{parrilo2003semidefinite,\n\ttitle        = {Semidefinite programming relaxations for semialgebraic problems},\n\tauthor       = {Pablo A Parrilo},\n\tyear         = 2003,\n\tjournal      = {Mathematical programming},\n\tvolume       = 96,\n\tnumber       = 2,\n\tpages        = {293--320}\n}\n@article{parse,\n\ttitle        = {Head-Driven Statistical Models for Natural Language Parsing},\n\tauthor       = {Collins, Michael},\n\tyear         = 2003,\n\tmonth        = dec,\n\tjournal      = {Comput. 
Linguist.},\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA, USA},\n\tvolume       = 29,\n\tnumber       = 4,\n\tpages        = {589--637},\n\tdoi          = {10.1162/089120103322753356},\n\tissn         = {0891-2017},\n\turl          = {http://dx.doi.org/10.1162/089120103322753356},\n\tissue_date   = {December 2003},\n\tnumpages     = 49,\n\tacmid        = 1105706\n}\n@article{partee1995lexical,\n\ttitle        = {Lexical semantics and compositionality},\n\tauthor       = {Partee, Barbara},\n\tyear         = 1995,\n\tjournal      = {An Invitation to Cognitive Science},\n\tvolume       = {0}\n}\n@article{partee2007compositionality,\n\ttitle        = {Compositionality and coercion in semantics: The dynamics of adjective meaning},\n\tauthor       = {Barbara H. Partee},\n\tyear         = 2007,\n\tjournal      = {Cognitive Foundations of Interpretation}\n}\n@article{partee2011origins,\n\ttitle        = {Formal Semantics: Origins, Issues, Early Impact},\n\tauthor       = {Barbara H. 
Partee},\n\tyear         = 2011,\n\tjournal      = {Baltic International Yearbook of Cognition, Logic and Communication},\n\tvolume       = 6\n}\n@article{parzen1962,\n\ttitle        = {On Estimation of a Probability Density Function and Mode},\n\tauthor       = {Emanuel Parzen},\n\tyear         = 1962,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 33,\n\tpages        = {1065--1076}\n}\n@article{pasca2003open,\n\ttitle        = {Open-domain question answering from large text collections},\n\tauthor       = {Marius Pa{\\c{s}}ca},\n\tyear         = 2003,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 29\n}\n@inproceedings{pascanu2013difficulty,\n\ttitle        = {On the difficulty of training recurrent neural networks},\n\tauthor       = {Razvan Pascanu and Tomas Mikolov and Yoshua Bengio},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{paschou2007intra,\n\ttitle        = {Intra-and interpopulation genotype reconstruction from tagging SNPs},\n\tauthor       = {Paschou, Peristera and Mahoney, Michael W and Javed, Asif and Kidd, Judith R and Pakstis, Andrew J and Gu, Sheng and Kidd, Kenneth K and Drineas, Petros},\n\tyear         = 2007,\n\tjournal      = {Genome Research},\n\tpublisher    = {Cold Spring Harbor Lab},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {96--107}\n}\n@inproceedings{paskin02bigrams,\n\ttitle        = {Grammatical Bigrams},\n\tauthor       = {Mark A. Paskin},\n\tyear         = 2002,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{passonneau2005applying,\n\ttitle        = {Applying the pyramid method in {DUC} 2005},\n\tauthor       = {Rebecca J. 
Passonneau and Ani Nenkova and Kathleen McKeown and Sergey Sigelman},\n\tyear         = 2005,\n\tbooktitle    = {Document Understanding Conference}\n}\n@inproceedings{passonneau2014benefits,\n\ttitle        = {The Benefits of a Model of Annotation},\n\tauthor       = {Rebecca J. Passonneau and Bob Carpenter},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pasunuru2017reinforced,\n\ttitle        = {Reinforced Video Captioning with Entailment Rewards},\n\tauthor       = {Ramakanth Pasunuru and Mohit Bansal},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{pasupat2014extraction,\n\ttitle        = {Zero-shot Entity Extraction from Web Pages},\n\tauthor       = {Panupong Pasupat and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pasupat2015compositional,\n\ttitle        = {Compositional Semantic Parsing on Semi-Structured Tables},\n\tauthor       = {Panupong Pasupat and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pasupat2016inferring,\n\ttitle        = {Inferring Logical Forms From Denotations},\n\tauthor       = {Panupong Pasupat and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pasupat2018elements,\n\ttitle        = {Mapping Natural Language Commands to Web Elements},\n\tauthor       = {Panupong Pasupat and Tian-Shun Jiang and Evan Zheran Liu and Kelvin Guu and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@misc{paszke2017automatic,\n\ttitle        = {Automatic differentiation in PyTorch},\n\tauthor       = {Adam Paszke and Sam Gross and Soumith Chintala and Gregory Chanan and Edward Yang and Zachary DeVito and Zeming Lin and 
Alban Desmaison and Luca Antiga and Adam Lerer},\n\tyear         = 2017\n}\n@inproceedings{paszke2019pytorch,\n\ttitle        = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},\n\tauthor       = {Adam Paszke and Sam Gross and Francisco Massa and Adam Lerer and James Bradbury and Gregory Chanan and Trevor Killeen and Zeming Lin and Natalia Gimelshein and Luca Antiga and Alban Desmaison and Andreas K{\\\"o}pf and Edward Yang and Zach DeVito and Martin Raison and Alykhan Tejani and Sasank Chilamkurthy and Benoit Steiner and Lu Fang and Junjie Bai and Soumith Chintala},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{pataki1998rank,\n\ttitle        = {On the rank of extreme matrices in semidefinite programs and the multiplicity of optimal eigenvalues},\n\tauthor       = {G. Pataki},\n\tyear         = 1998,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 23,\n\tnumber       = 2,\n\tpages        = {339--358}\n}\n@inproceedings{patel2003energy,\n\ttitle        = {Energy Aware Grid: Global Workload Placement based on Energy Efficiency},\n\tauthor       = {Chandrakant Patel and Ratnesh Sharma and Cullen Bash and Sven Graupner},\n\tyear         = 2003,\n\tbooktitle    = {ASME International Mechanical Engineering Congress and R\\&D Expo}\n}\n@inproceedings{patel2003smart,\n\ttitle        = {Smart Cooling of Data Centers},\n\tauthor       = {Patel, C.D. and Bash, C.E. and Sharma, R. 
and Friedrich, R.},\n\tyear         = 2003,\n\tbooktitle    = {ASME Interpack}\n}\n@inproceedings{pathak2017curiosity,\n\ttitle        = {Curiosity-driven exploration by self-supervised prediction},\n\tauthor       = {Pathak, Deepak and Agrawal, Pulkit and Efros, Alexei A and Darrell, Trevor},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tvolume       = 2017,\n\tpages        = {16--17}\n}\n@inproceedings{patnaik2009sustainable,\n\ttitle        = {\n\t\tSustainable operation and management of data center chillers using\n\n\t\ttemporal data mining\n\t},\n\tauthor       = {\n\t\tPatnaik, Debprakash and Marwah, Manish and Sharma, Ratnesh and Ramakrishnan,\n\n\t\tNaren\n\t},\n\tyear         = 2009,\n\tbooktitle    = {\n\t\tProceedings of the 15th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tlocation     = {Paris, France},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '09},\n\tpages        = {1305--1314},\n\tdoi          = {http://doi.acm.org/10.1145/1557019.1557159},\n\tisbn         = {978-1-60558-495-9},\n\tacmid        = 1557159,\n\tkeywords     = {chillers, clustering, data centers, frequent episodes, motifs, sustainability},\n\tnumpages     = 10\n}\n@inproceedings{patro2020fairrec,\n\ttitle        = {FairRec: Two-Sided Fairness for Personalized Recommendations in Two-Sided Platforms},\n\tauthor       = {Gourab K Patro and Arpita Biswas and Niloy Ganguly and Krishna P Gummadi and Abhijnan Chakraborty},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of The Web Conference 2020},\n\tpages        = {1194--1204}\n}\n@article{paudice2018detection,\n\ttitle        = {Detection of Adversarial Training Examples in Poisoning Attacks through Anomaly Detection},\n\tauthor       = {Andrea Paudice and Luis Mu{\\~n}oz-Gonz{\\'a}lez and Andras Gyorgy and Emil C Lupu},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1802.03041}\n}\n@inproceedings{paul2015column,\n\ttitle        = {Column Selection via Adaptive Sampling},\n\tauthor       = {Paul, Saurabh and Magdon-Ismail, Malik and Drineas, Petros},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {406--414}\n}\n@book{paulos1988innumeracy,\n\ttitle        = {Innumeracy: Mathematical illiteracy and its consequences},\n\tauthor       = {John Allen Paulos},\n\tyear         = 1988,\n\tpublisher    = {Macmillan}\n}\n@inproceedings{pauls2009kbest,\n\ttitle        = {{K}-best {A*} parsing},\n\tauthor       = {Adam Pauls and Dan Klein},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {958--966}\n}\n@inproceedings{pauls2012treelets,\n\ttitle        = {Large-Scale Syntactic Language Modeling with Treelets},\n\tauthor       = {Adam Pauls and Dan Klein},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{paulus2018deep,\n\ttitle        = {A Deep Reinforced Model for Abstractive Summarization},\n\tauthor       = {Romain Paulus and Caiming Xiong and Richard Socher},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{pavlick2015domain,\n\ttitle        = {Domain-Specific Paraphrase Extraction},\n\tauthor       = {Ellie  Pavlick and Juri   Ganitkevitch and Tsz Ping   Chan and Xuchen   Yao and Benjamin   Van Durme and Chris  Callison-Burch},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pavlick2015ppdb,\n\ttitle        = {{PPDB} 2.0: Better paraphrase ranking, fine-grained entailment relations, word embeddings, and style classification},\n\tauthor       = {Ellie Pavlick and Pushpendre Rastogi and Juri Ganitkevitch and Benjamin Van Durme and Chris Callison-Burch},\n\tyear         = 2015,\n\tbooktitle    = {Association for 
Computational Linguistics (ACL)}\n}\n@inproceedings{pavlick2015semantics,\n\ttitle        = {Adding semantics to data-driven paraphrasing},\n\tauthor       = {Ellie Pavlick and Johan Bos and Malvina Nissim and Charley Beller and Benjamin Van Durme and Chris Callison-Burch},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pavlick2016gun,\n\ttitle        = {The Gun Violence Database: A new task and data set for {NLP}},\n\tauthor       = {Ellie Pavlick and Heng Ji and Xiaoman Pan and Chris Callison-Burch},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1018--1024}\n}\n@inproceedings{pavlick2016most,\n\ttitle        = {Most \"babies\" are \"little\" and most \"problems\" are \"huge\": Compositional Entailment in Adjective-Nouns},\n\tauthor       = {Ellie Pavlick and Chris Callison-Burch},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tvolume       = 1,\n\tpages        = {2164--2173}\n}\n@article{pavlick2017style,\n\ttitle        = {An Empirical Analysis of Formality in Online Communication},\n\tauthor       = {Ellie Pavlick and Joel Tetreault},\n\tyear         = 2017,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 4\n}\n@article{pavlick2019inherent,\n\ttitle        = {Inherent Disagreements in Human Textual Inferences},\n\tauthor       = {Ellie Pavlick and Tom Kwiatkowski},\n\tyear         = 2019,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 7\n}\n@inproceedings{pb17,\n\ttitle        = {Geometry of Neural Network Loss Surfaces via Random Matrix Theory},\n\tauthor       = {Pennington, Jeffrey and Bahri, Yasaman},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\taddress      = {International Convention Centre, 
Sydney, Australia}\n}\n@article{pearce1984rationalizable,\n\ttitle        = {Rationalizable strategic behavior and the problem of perfection},\n\tauthor       = {David G Pearce},\n\tyear         = 1984,\n\tjournal      = {Econometrica: Journal of the Econometric Society},\n\tpages        = {1029--1050}\n}\n@article{pearl1986fusion,\n\ttitle        = {Fusion, Propagation, and Structuring in Belief Networks},\n\tauthor       = {Judea Pearl},\n\tyear         = 1986,\n\tjournal      = {Artif. Intell.},\n\tvolume       = 29,\n\tnumber       = 3,\n\tpages        = {241--288},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://dx.doi.org/10.1016/0004-3702(86)90072-X}\n}\n@article{pearl1987evidential,\n\ttitle        = {Evidential reasoning using stochastic simulation of causal models},\n\tauthor       = {Pearl, J.},\n\tyear         = 1987,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 32,\n\tpages        = {247--257}\n}\n@book{pearl2000causality,\n\ttitle        = {{Causality: Models, Reasoning and Inference}},\n\tauthor       = {Judea Pearl},\n\tyear         = 2000,\n\tpublisher    = {Springer},\n\tvolume       = 29\n}\n@book{pearl2009causality,\n\ttitle        = {Causality: Models, Reasoning and Inference},\n\tauthor       = {Pearl, Judea},\n\tyear         = 2009,\n\tpublisher    = {Cambridge University Press},\n\taddress      = {USA},\n\tisbn         = {052189560X},\n\tedition      = {2nd}\n}\n@article{pearlmutter1994fast,\n\ttitle        = {Fast exact multiplication by the {Hessian}},\n\tauthor       = {Barak A Pearlmutter},\n\tyear         = 1994,\n\tjournal      = {Neural Computation},\n\tvolume       = 6,\n\tnumber       = 1,\n\tpages        = {147--160}\n}\n@article{pearson1894,\n\ttitle        = {Contributions to the Mathematical Theory of Evolution},\n\tauthor       = {Karl Pearson},\n\tyear         = 1894,\n\tjournal      = {Philosophical Transactions of the Royal Society of London. 
A},\n\tvolume       = 185,\n\tpages        = {71--110}\n}\n@article{Pearson94,\n\ttitle        = {Contributions to the mathematical theory of evolution},\n\tauthor       = {K. Pearson},\n\tyear         = 1894,\n\tjournal      = {Philosophical Transactions of the Royal Society, London, A.},\n\tpages        = 71\n}\n@article{pedregosa2011sklearn,\n\ttitle        = {Scikit-learn: Machine Learning in {P}ython},\n\tauthor       = {F. Pedregosa and G. Varoquaux and A. Gramfort and V. Michel and B. Thirion and O. Grisel and M. Blondel and P. Prettenhofer and R. Weiss and V. Dubourg and J. Vanderplas and A. Passos and D. Cournapeau and M. Brucher and M. Perrot and E. Duchesnay},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 12\n}\n@article{pegasos,\n\ttitle        = {Pegasos: Primal estimated sub-gradient solver for svm},\n\tauthor       = {Shalev-Shwartz, Shai and Singer, Yoram and Srebro, Nathan and Cotter, Andrew},\n\tyear         = 2011,\n\tjournal      = {Mathematical programming},\n\tpublisher    = {Springer},\n\tvolume       = 127,\n\tnumber       = 1,\n\tpages        = {3--30}\n}\n@article{peikari2018cluster,\n\ttitle        = {A cluster-then-label semi-supervised learning approach for pathology image classification},\n\tauthor       = {Mohammad Peikari and Sherine Salama and Sharon Nofech-Mozes and Anne L Martel},\n\tyear         = 2018,\n\tjournal      = {Scientific reports},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {1--13}\n}\n@article{pemantle1990nonconvergence,\n\ttitle        = {Nonconvergence to unstable points in urn models and stochastic approximations},\n\tauthor       = {Pemantle, Robin and others},\n\tyear         = 1990,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 18,\n\tnumber       = 2,\n\tpages        = {698--712}\n}\n@inproceedings{penas2009respubliqa,\n\ttitle        = {Overview of 
{ResPubliQA} 2009: Question Answering Evaluation over European Legislation},\n\tauthor       = {Anselmo Pe{\\~n}as and Pamela Forner and Richard Sutcliffe and \\'{A}lvaro Rodrigo and Corina For\\u{a}scu and I\\={n}aki Alegria and Danilo Giampiccolo and Nicolas Moreau and Petya Osenova},\n\tyear         = 2009,\n\tbooktitle    = {Cross Language Evaluation Forum}\n}\n@inproceedings{penas2013mre,\n\ttitle        = {{QA4MRE} 2011-2013: Overview of Question Answering for Machine Reading Evaluation},\n\tauthor       = {Anselmo Pe{\\~n}as and Eduard Hovy and Pamela Forner and \\'{A}lvaro Rodrigo and Richard Sutcliffe and Roser Morante},\n\tyear         = 2013,\n\tbooktitle    = {Cross Language Evaluation Forum}\n}\n@inproceedings{peng2018towards,\n\ttitle        = {Towards Controllable Story Generation},\n\tauthor       = {Nanyun Peng and Marjan Ghazvininejad and Jonathan May and Kevin Knight},\n\tyear         = 2018,\n\tbooktitle    = {NAACL Workshop}\n}\n@inproceedings{peng2018visda,\n\ttitle        = {Visda: A synthetic-to-real benchmark for visual domain adaptation},\n\tauthor       = {Xingchao Peng and Ben Usman and Neela Kaushik and Dequan Wang and Judy Hoffman and Kate Saenko},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2021--2026}\n}\n@inproceedings{peng2019moment,\n\ttitle        = {Moment matching for multi-source domain adaptation},\n\tauthor       = {Peng, Xingchao and Bai, Qinxun and Xia, Xide and Huang, Zijun and Saenko, Kate and Wang, Bo},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the IEEE International Conference on Computer Vision},\n\tpages        = {1406--1415}\n}\n@inproceedings{peng2019plan,\n\ttitle        = {Plan-And-Write: Towards Better Automatic Storytelling},\n\tauthor       = {Lili Yao and Nanyun Peng and Ralph Weischedel and Kevin Knight and Dongyan Zhao and Rui Yan},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial 
Intelligence (AAAI)}\n}\n@inproceedings{peng2019pun,\n\ttitle        = {Pun Generation with Surprise},\n\tauthor       = {Nanyun Peng and He He and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{peng2020facmac,\n\ttitle        = {FACMAC: Factored Multi-Agent Centralised Policy Gradients},\n\tauthor       = {Peng, Bei and Rashid, Tabish and de Witt, Christian A Schroeder and Kamienny, Pierre-Alexandre and Torr, Philip HS and B{\\\"o}hmer, Wendelin and Whiteson, Shimon},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.06709}\n}\n@inproceedings{PengRowSampling,\n\ttitle        = {Iterative Row Sampling},\n\tauthor       = {Mu Li and Gary L. Miller and Richard Peng},\n\tyear         = 2013,\n\tbooktitle    = {54th Annual {IEEE} Symposium on Foundations of Computer Science, {FOCS} 2013, 26-29 October, 2013, Berkeley, CA, {USA}},\n\tpages        = {127--136},\n\tdoi          = {10.1109/FOCS.2013.22},\n\turl          = {http://dx.doi.org/10.1109/FOCS.2013.22},\n\tcrossref     = {DBLP:conf/focs/2013},\n\ttimestamp    = {Tue, 16 Dec 2014 09:57:25 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/focs/LiMP13},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{PengS14,\n\ttitle        = {An efficient parallel solver for {SDD} linear systems},\n\tauthor       = {Richard Peng and Daniel A. 
Spielman},\n\tyear         = 2014,\n\tbooktitle    = {Symposium on Theory of Computing, {STOC} 2014, New York, NY, USA, May 31 - June 03, 2014},\n\tpages        = {333--342},\n\tdoi          = {10.1145/2591796.2591832},\n\turl          = {http://doi.acm.org/10.1145/2591796.2591832},\n\ttimestamp    = {Mon, 03 Nov 2014 22:25:46 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/stoc/PengS14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{PengTangwongsan2012,\n\ttitle        = {{Faster and simpler width-independent parallel algorithms for positive semidefinite programming}},\n\tauthor       = {Peng, Richard and Tangwongsan, Kanat},\n\tyear         = 2012,\n\tmonth        = jan,\n\tbooktitle    = {Proceedings of the 24th ACM symposium on Parallelism in algorithms and architectures - SPAA '12},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 101,\n\tdoi          = {10.1145/2312005.2312026},\n\tisbn         = 9781450312134,\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:1201.5135v1},\n\teprint       = {arXiv:1201.5135v1},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Peng, Tangwongsan - 2012 - Faster and simpler width-independent parallel algorithms for positive semidefinite programming.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/SDP}\n}\n@inproceedings{pennington2014glove,\n\ttitle        = {Glove: Global vectors for word representation},\n\tauthor       = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},\n\tyear         = 2014,\n\tjournal      = {Proceedings of the Empirical Methods in Natural Language Processing},\n\tbooktitle    = {Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},\n\tpages        = {1532--1543},\n\turl          = {http://aclweb.org/anthology/D/D14/D14-1162.pdf},\n\tcrossref     = {DBLP:conf/emnlp/2014},\n\ttimestamp    = 
{Sat, 15 Nov 2014 14:45:18 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/emnlp/PenningtonSM14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{pennington2017resurrecting,\n\ttitle        = {Resurrecting the sigmoid in deep learning through dynamical isometry: theory and practice},\n\tauthor       = {Pennington, Jeffrey and Schoenholz, Samuel and Ganguli, Surya},\n\tyear         = 2017,\n\tmonth        = nov,\n\tjournal      = {arXiv:1711.04735},\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {4785--4795},\n\turl          = {http://arxiv.org/abs/1711.04735}\n}\n@article{pennington2018emergence,\n\ttitle        = {The emergence of spectral universality in deep networks},\n\tauthor       = {Pennington, Jeffrey and Schoenholz, Samuel S and Ganguli, Surya},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.09979}\n}\n@article{perdomo2020performative,\n\ttitle        = {Performative Prediction},\n\tauthor       = {Perdomo, Juan C and Zrnic, Tijana and Mendler-D{\\\"u}nner, Celestine and Hardt, Moritz},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.06673}\n}\n@book{perea2012epistemic,\n\ttitle        = {Epistemic game theory: reasoning and choice},\n\tauthor       = {Andr{\\'e}s Perea},\n\tyear         = 2012,\n\tpublisher    = {Cambridge University Press}\n}\n@inproceedings{pereira92bracket,\n\ttitle        = {Inside-outside reestimation from partially bracketed corpora},\n\tauthor       = {Fernando Pereira and Yves Schabes},\n\tyear         = 1992,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {128--135}\n}\n@article{perelman2014state,\n\ttitle        = {When “the state of the art” is counting words},\n\tauthor       = {Les Perelman},\n\tyear         = 2014,\n\tjournal      = {Assessing Writing},\n\tvolume       = 21,\n\tpages        = {104--111}\n}\n@inproceedings{perez2017c,\n\ttitle    
    = {{C}-LEARN: Learning geometric constraints from demonstrations for multi-step manipulation in shared autonomy},\n\tauthor       = {C. Perez-D'Arpino and J. A. Shah},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {4058--4065}\n}\n@inproceedings{perez2018film,\n\ttitle        = {FiLM: Visual Reasoning with a General Conditioning Layer},\n\tauthor       = {Ethan Perez and Florian Strub and Harm D. Vries and Vincent Dumoulin and Aaron C. Courville},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{perez2019turing,\n\ttitle        = {On the turing completeness of modern neural network architectures},\n\tauthor       = {P{\\'e}rez, Jorge and Marinkovi{\\'c}, Javier and Barcel{\\'o}, Pablo},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.03429}\n}\n@inproceedings{perkins2003convergent,\n\ttitle        = {A convergent form of approximate policy iteration},\n\tauthor       = {Perkins, Theodore J and Precup, Doina},\n\tyear         = 2003,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1627--1634}\n}\n@inproceedings{perozzi2014deepwalk,\n\ttitle        = {Deepwalk: Online learning of social representations},\n\tauthor       = {Bryan Perozzi and Rami Al-Rfou and Steven Skiena},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {701--710}\n}\n@article{perrault1980plan,\n\ttitle        = {A plan-based analysis of indirect speech acts},\n\tauthor       = {C Raymond Perrault and James F Allen},\n\tyear         = 1980,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 6,\n\tnumber       = 3,\n\tpages        = {167--182}\n}\n@techreport{PertDJ,\n\ttitle        = {Perturbation of joint diagonalizers. Ref\\# 94D027},\n\tauthor       = {J.-F. 
Cardoso},\n\tyear         = 1994,\n\tinstitution  = {T\\'{e}l\\'{e}com {P}aris}\n}\n@inproceedings{peshkin2000learning,\n\ttitle        = {Learning to cooperate via policy search},\n\tauthor       = {Leonid Peshkin and Kee-Eung Kim and Nicolas Meuleau and Leslie Pack Kaelbling},\n\tyear         = 2000,\n\tbooktitle    = {Proceedings of the Sixteenth conference on Uncertainty in artificial intelligence},\n\tpages        = {489--496}\n}\n@article{peters06bindingprediction,\n\ttitle        = {A community resource benchmarking predictions of peptide binding to {MHC-I} molecules},\n\tauthor       = {B. Peters and H.-H Bui and S. Frankild and M. Nielson and C. Lundegaard and E. Kostem and D. Basch and K. Lamberth and M. Harndahl and W. Fleri and S. S Wilson and J. Sidney and O. Lund and S. Buus and A. Sette},\n\tyear         = 2006,\n\tjournal      = {PLoS Computational Biology},\n\tvolume       = 2\n}\n@inproceedings{peters2006policy,\n\ttitle        = {Policy gradient methods for robotics},\n\tauthor       = {Peters, Jan and Schaal, Stefan},\n\tyear         = 2006,\n\tbooktitle    = {2006 IEEE/RSJ International Conference on Intelligent Robots and Systems},\n\tpages        = {2219--2225},\n\torganization = {IEEE}\n}\n@article{peters2016causal,\n\ttitle        = {Causal inference by using invariant prediction: identification and confidence intervals},\n\tauthor       = {Peters, Jonas and B{\\\"u}hlmann, Peter and Meinshausen, Nicolai},\n\tyear         = 2016,\n\tjournal      = {Journal of the Royal Statistical Society. 
Series B (Statistical Methodology)},\n\tpublisher    = {JSTOR},\n\tvolume       = 78,\n\tpages        = {947--1012}\n}\n@article{peters2018deep,\n\ttitle        = {Deep contextualized word representations},\n\tauthor       = {Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.05365}\n}\n@inproceedings{peters2018elmo,\n\ttitle        = {Deep contextualized word representations},\n\tauthor       = {Matthew E. Peters and Mark Neumann and Mohit Iyyer and Matt Gardner and Christopher Clark and Kenton Lee and Luke Zettlemoyer},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{petrik2007analysis,\n\ttitle        = {An Analysis of Laplacian Methods for Value Function Approximation in {MDP}s.},\n\tauthor       = {Petrik, Marek},\n\tyear         = 2007,\n\tbooktitle    = {IJCAI},\n\tpages        = {2574--2579}\n}\n@inproceedings{petrov06latent,\n\ttitle        = {Learning Accurate, Compact, and Interpretable Tree Annotation},\n\tauthor       = {S. Petrov and L. Barrett and R. Thibaux and D. Klein},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {433--440}\n}\n@inproceedings{petrov07split,\n\ttitle        = {Learning and Inference for Hierarchically Split {PCFG}s},\n\tauthor       = {S. Petrov and D. 
Klein},\n\tyear         = 2007,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {404--411}\n}\n@inproceedings{petrov08discriminative,\n\ttitle        = {Discriminative Log-Linear Grammars with Latent Variables},\n\tauthor       = {Slav Petrov and Dan Klein},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{petrov2010uptraining,\n\ttitle        = {Uptraining for accurate deterministic question parsing},\n\tauthor       = {Slav Petrov and Pi-Chuan Chang and Michael Ringgaard and Hiyan Alshawi},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@book{petrov2011coarse,\n\ttitle        = {Coarse-to-fine natural language processing},\n\tauthor       = {Slav Petrov and Eugene Charniak},\n\tyear         = 2011,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{petrovic2013unsupervised,\n\ttitle        = {Unsupervised Joke Generation from Big Data},\n\tauthor       = {Sasa Petrovic and David Matthews},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{pezeshki2020gradient,\n\ttitle        = {Gradient Starvation: A Learning Proclivity in Neural Networks},\n\tauthor       = {Mohammad Pezeshki and S{\\'e}kou-Oumar Kaba and Yoshua Bengio and Aaron Courville and Doina Precup and Guillaume Lajoie},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.09468}\n}\n@inproceedings{pfohl2019creating,\n\ttitle        = {Creating fair models of atherosclerotic cardiovascular disease risk},\n\tauthor       = {Pfohl, Stephen and Marafino, Ben and Coulet, Adrien and Rodriguez, Fatima and Palaniappan, Latha and Shah, Nigam H},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 2019 AAAI/ACM Conference on AI, Ethics, and Society},\n\tpages        = 
{271--278}\n}\n@inproceedings{pham2017column,\n\ttitle        = {Column Networks for Collective Classification},\n\tauthor       = {Trang Pham and Truyen Tran and Dinh Phung and Svetha Venkatesh},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{phang2018sentence,\n\ttitle        = {Sentence encoders on stilts: Supplementary training on intermediate labeled-data tasks},\n\tauthor       = {Jason Phang and Thibault Fevry and Samuel R Bowman},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.01088}\n}\n@article{phelps1972statistical,\n\ttitle        = {The statistical theory of racism and sexism},\n\tauthor       = {Edmund S Phelps},\n\tyear         = 1972,\n\tjournal      = {The american economic review},\n\tvolume       = 62,\n\tnumber       = 4,\n\tpages        = {659--661}\n}\n@article{phillips2016learning,\n\ttitle        = {Learning to plan for constrained manipulation from demonstrations},\n\tauthor       = {M. Phillips and V. Hwang and S. Chitta and M. Likhachev},\n\tyear         = 2016,\n\tjournal      = {Autonomous Robots},\n\tvolume       = 40,\n\tnumber       = 1,\n\tpages        = {109--124}\n}\n@article{phillips2020chexphoto,\n\ttitle        = {Chexphoto: 10,000+ smartphone photos and synthetic photographic transformations of chest x-rays for benchmarking deep learning robustness},\n\tauthor       = {Nick A Phillips and Pranav Rajpurkar and Mark Sabini and Rayan Krishnan and Sharon Zhou and Anuj Pareek and Nguyet Minh Phu and Chris Wang and Andrew Y Ng and Matthew P Lungren},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.06199}\n}\n@inproceedings{piantadosi08compositional,\n\ttitle        = {A {B}ayesian Model of the Acquisition of Compositional Semantics},\n\tauthor       = {S. T. Piantadosi and N. D. Goodman and B. A. Ellis and J. B. 
Tenenbaum},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the Thirtieth Annual Conference of the Cognitive Science Society},\n\tpages        = {1620--1625}\n}\n@article{piech2013tuned,\n\ttitle        = {Tuned models of peer assessment in MOOCs},\n\tauthor       = {Chris Piech and Jonathan Huang and Zhenghao Chen and Chuong Do and Andrew Ng and Daphne Koller},\n\tyear         = 2013,\n\tjournal      = {Educational Data Mining}\n}\n@inproceedings{pieraccini1991stochastic,\n\ttitle        = {Stochastic Representation of Conceptual Structure in the {ATIS} Task},\n\tauthor       = {Roberto Pieraccini and Esther Levin and Chin-Hui Lee},\n\tyear         = 1991,\n\tbooktitle    = {Human Language Technology (HLT)}\n}\n@article{pierce1970whither,\n\ttitle        = {Whither Speech Recognition?},\n\tauthor       = {J. R. Pierce},\n\tyear         = 1970,\n\tjournal      = {Journal of the Acoustical Society of America},\n\tvolume       = 47,\n\tpages        = {1616--1617}\n}\n@article{pierson2017fast,\n\ttitle        = {Fast threshold tests for detecting discrimination},\n\tauthor       = {Emma Pierson and Sam Corbett-Davies and Sharad Goel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.08536}\n}\n@inproceedings{pierson2019aging,\n\ttitle        = {Inferring Multidimensional Rates of Aging from Cross-Sectional Data},\n\tauthor       = {Emma Pierson and Pang Wei Koh and Tatsunori Hashimoto and Daphne Koller and Jure Leskovec and Nick Eriksson and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{pierson2019using,\n\ttitle        = {Using machine learning to understand racial and socioeconomic differences in knee pain},\n\tauthor       = {Emma Pierson and David Cutler and Jure Leskovec and Sendhil Mullainathan and Ziad Obermeyer},\n\tyear         = 2019,\n\tjournal      = {NBER Machine Learning and Healthcare Conference}\n}\n@article{pimentel2014review,\n\ttitle        = {A 
review of novelty detection},\n\tauthor       = {Marco AF Pimentel and David A Clifton and Lei Clifton and Lionel Tarassenko},\n\tyear         = 2014,\n\tjournal      = {Signal Processing},\n\tvolume       = 99,\n\tpages        = {215--249}\n}\n@inproceedings{pimplikar2012answering,\n\ttitle        = {Answering table queries on the web using column keywords},\n\tauthor       = {Rakesh Pimplikar and Sunita Sarawagi},\n\tyear         = 2012,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tvolume       = 5,\n\tnumber       = 10,\n\tpages        = {908--919}\n}\n@article{pinelis1994optimum,\n\ttitle        = {Optimum bounds for the distributions of martingales in Banach spaces},\n\tauthor       = {Pinelis, Iosif},\n\tyear         = 1994,\n\tjournal      = {The Annals of Probability},\n\tpublisher    = {JSTOR},\n\tpages        = {1679--1706}\n}\n@article{pintore2006spatially,\n\ttitle        = {Spatially adaptive smoothing splines},\n\tauthor       = {Pintore, Alexandre and Speckman, Paul and Holmes, Chris C},\n\tyear         = 2006,\n\tjournal      = {Biometrika},\n\tpublisher    = {Oxford University Press},\n\tvolume       = 93,\n\tnumber       = 1,\n\tpages        = {113--125}\n}\n@article{pires2016multiclass,\n\ttitle        = {Multiclass Classification Calibration Functions},\n\tauthor       = {Bernardo Ávila Pires and Csaba Szepesvári},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{pisier12grothendieck,\n\ttitle        = {{G}rothendieck's Theorem, past and present},\n\tauthor       = {G. Pisier},\n\tyear         = 2012,\n\tjournal      = {Bulletin of the American Mathematical Society},\n\tvolume       = 49,\n\tpages        = {237--323}\n}\n@article{pisier1981remarques,\n\ttitle        = {Remarques sur un r{\\'e}sultat non publi{\\'e} de B. 
Maurey},\n\tauthor       = {Pisier, Gilles},\n\tyear         = 1981,\n\tjournal      = {S{\\'e}minaire Analyse fonctionnelle (dit \"Maurey-Schwartz\")},\n\tpages        = {1--12}\n}\n@techreport{pitman02process,\n\ttitle        = {Combinatorial Stochastic Processes},\n\tauthor       = {J. Pitman},\n\tyear         = 2002,\n\tnumber       = 621,\n\tinstitution  = {Department of Statistics, University of California at Berkeley (UC Berkeley)}\n}\n@article{pitman97yor,\n\ttitle        = {The two-parameter {P}oisson-{D}irichlet distribution derived from a stable subordinator},\n\tauthor       = {J. Pitman and M. Yor},\n\tyear         = 1997,\n\tjournal      = {Annals of Probability},\n\tvolume       = 25,\n\tpages        = {855--900}\n}\n@article{piwek2012varieties,\n\ttitle        = {Varieties of Question Generation: Introduction to this Special Issue},\n\tauthor       = {Paul Piwek and Kristy Elizabeth Boyer},\n\tyear         = 2012,\n\tjournal      = {Dialogue and Discourse},\n\tvolume       = 3,\n\tpages        = {1--9}\n}\n@article{plank16nonstandard,\n\ttitle        = {What to do about non-standard (or non-canonical) language in {NLP}},\n\tauthor       = {Barbara Plank},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@phdthesis{plank2011domain,\n\ttitle        = {Domain adaptation for parsing},\n\tauthor       = {Barbara Plank},\n\tyear         = 2011,\n\tschool       = {University of Groningen}\n}\n@mastersthesis{platanios2015estimating,\n\ttitle        = {Estimating Accuracy from Unlabeled Data},\n\tauthor       = {Emmanouil Antonios Platanios},\n\tyear         = 2015,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tschool       = {Carnegie Mellon University}\n}\n@article{platt1999probabilistic,\n\ttitle        = {Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods},\n\tauthor       = {Platt, John and others},\n\tyear         = 1999,\n\tjournal      = {Advances in large margin 
classifiers},\n\tpublisher    = {Cambridge, MA},\n\tvolume       = 10,\n\tnumber       = 3,\n\tpages        = {61--74}\n}\n@inproceedings{pleiss2017,\n\ttitle        = {On Fairness and Calibration},\n\tauthor       = {Geoff Pleiss and Manish Raghavan and Felix Wu and Jon Kleinberg and Kilian Q. Weinberger},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {5684--5693}\n}\n@inproceedings{pleiss2017fairness,\n\ttitle        = {On fairness and calibration},\n\tauthor       = {Pleiss, Geoff and Raghavan, Manish and Wu, Felix and Kleinberg, Jon and Weinberger, Kilian Q},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {5680--5689}\n}\n@article{PlotkinST1995,\n\ttitle        = {{Fast Approximation Algorithms for Fractional Packing and Covering Problems}},\n\tauthor       = {Plotkin, Serge A. and Shmoys, David B. and Tardos, \\'{E}va},\n\tyear         = 1995,\n\tmonth        = may,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = {257--301},\n\tdoi          = {10.1287/moor.20.2.257},\n\tissn         = {0364-765X},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Plotkin, Shmoys, Tardos - 1995 - Fast Approximation Algorithms for Fractional Packing and Covering Problems.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@inproceedings{plrsg16,\n\ttitle        = {Exponential expressivity in deep neural networks through transient chaos},\n\tauthor       = {Poole, Ben and Lahiri, Subhaneil and Raghu, Maithreyi and Sohl-Dickstein, Jascha and Ganguli, Surya},\n\tyear         = 2016,\n\tbooktitle    = {Advances In Neural Information Processing Systems},\n\tpages        = {3360--3368},\n\tnote         = {00047}\n}\n@inproceedings{pmlr-v37-long15,\n\ttitle        = {Learning Transferable Features with Deep Adaptation 
Networks},\n\tauthor       = {Long, Mingsheng and Cao, Yue and Wang, Jianmin and Jordan, Michael},\n\tyear         = 2015,\n\tmonth        = {07--09 Jul},\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\taddress      = {Lille, France},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 37,\n\tpages        = {97--105},\n\turl          = {http://proceedings.mlr.press/v37/long15.html},\n\teditor       = {Bach, Francis and Blei, David},\n\tpdf          = {http://proceedings.mlr.press/v37/long15.pdf}\n}\n@inproceedings{pmlr-v48-hardt16,\n\ttitle        = {Train faster, generalize better: Stability of stochastic gradient descent},\n\tauthor       = {Moritz Hardt and Ben Recht and Yoram Singer},\n\tyear         = 2016,\n\tmonth        = {20--22 Jun},\n\tbooktitle    = {Proceedings of The 33rd International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\taddress      = {New York, New York, USA},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 48,\n\tpages        = {1225--1234},\n\turl          = {http://proceedings.mlr.press/v48/hardt16.html},\n\teditor       = {Maria Florina Balcan and Kilian Q. Weinberger},\n\tpdf          = {http://proceedings.mlr.press/v48/hardt16.pdf},\n\tabstract     = {We show that parametric models trained by a stochastic gradient method (SGM) with few iterations have vanishing generalization error. We prove our results by arguing that SGM is algorithmically stable in the sense of Bousquet and Elisseeff. Our analysis only employs elementary tools from convex and continuous optimization. We derive stability bounds for both convex and non-convex optimization under standard Lipschitz and smoothness assumptions. Applying our results to the convex case, we provide new insights for why multiple epochs of stochastic gradient methods generalize well in practice. 
In the non-convex case, we give a new interpretation of common practices in neural networks, and formally show that popular techniques for training large deep models are indeed stability-promoting. Our findings conceptually underscore the importance of reducing training time beyond its obvious benefit.}\n}\n@inproceedings{pmlr-v49-telgarsky16,\n\ttitle        = {benefits of depth in neural networks},\n\tauthor       = {Matus Telgarsky},\n\tyear         = 2016,\n\tmonth        = {23--26 Jun},\n\tbooktitle    = {29th Annual Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\taddress      = {Columbia University, New York, New York, USA},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 49,\n\tpages        = {1517--1539},\n\turl          = {http://proceedings.mlr.press/v49/telgarsky16.html},\n\teditor       = {Vitaly Feldman and Alexander Rakhlin and Ohad Shamir},\n\tpdf          = {http://proceedings.mlr.press/v49/telgarsky16.pdf}\n}\n@inproceedings{pmlr-v65-daniely17a,\n\ttitle        = {Depth Separation for Neural Networks},\n\tauthor       = {Daniely, Amit},\n\tyear         = 2017,\n\tmonth        = {07--10 Jul},\n\tbooktitle    = {Proceedings of the 2017 Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 65,\n\tpages        = {690--696},\n\turl          = {http://proceedings.mlr.press/v65/daniely17a.html},\n\teditor       = {Kale, Satyen and Shamir, Ohad},\n\tpdf          = {http://proceedings.mlr.press/v65/daniely17a/daniely17a.pdf}\n}\n@inproceedings{pmlr-v65-lee17a,\n\ttitle        = {On the Ability of Neural Nets to Express Distributions},\n\tauthor       = {Lee, Holden and Ge, Rong and Ma, Tengyu and Risteski, Andrej and Arora, Sanjeev},\n\tyear         = 2017,\n\tmonth        = {07--10 Jul},\n\tbooktitle    = {Proceedings of the 2017 Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine 
Learning Research},\n\tvolume       = 65,\n\tpages        = {1271--1296},\n\turl          = {http://proceedings.mlr.press/v65/lee17a.html},\n\teditor       = {Kale, Satyen and Shamir, Ohad},\n\tpdf          = {http://proceedings.mlr.press/v65/lee17a/lee17a.pdf},\n\torganization = {PMLR}\n}\n@inproceedings{poesio2013phrase,\n\ttitle        = {Phrase {D}etectives: Utilizing collective intelligence for internet-scale language resource creation},\n\tauthor       = {Massimo Poesio and Jon Chamberlain and Udo Kruschwitz and Livio Robaldo and Luca Ducceschi},\n\tyear         = 2013,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{poggio2017theory,\n\ttitle        = {Theory of deep learning III: explaining the non-overfitting puzzle},\n\tauthor       = {Poggio, Tomaso and Kawaguchi, Kenji and Liao, Qianli and Miranda, Brando and Rosasco, Lorenzo and Boix, Xavier and Hidary, Jack and Mhaskar, Hrushikesh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1801.00173}\n}\n@article{pohlen2018observe,\n\ttitle        = {Observe and Look Further: Achieving Consistent Performance on {ATARI}},\n\tauthor       = {T. Pohlen and B. Piot and T. Hester and M. G. Azar and D. Horgan and D. Budden and G. Barth-Maron  and H. van Hasselt and J. Quan and M. 
Ve{\\v{c}}er{\\'\\i}k and others},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.11593}\n}\n@inproceedings{poliak2018hypothesis,\n\ttitle        = {Hypothesis Only Baselines in Natural Language Inference},\n\tauthor       = {Adam Poliak and Jason Naradowsky and Aparajita Haldar and Rachel Rudinger and Benjamin Van Durme},\n\tyear         = 2018,\n\tbooktitle    = {Joint Conference on Lexical and Computational Semantics}\n}\n@book{pollard84convergence,\n\ttitle        = {Convergence of Stochastic Processes},\n\tauthor       = {David Pollard},\n\tyear         = 1984,\n\tpublisher    = {Springer-Verlag}\n}\n@inproceedings{polosukhin2018neural,\n\ttitle        = {Neural Program Search: Solving Programming Tasks from Description and Examples},\n\tauthor       = {Illia Polosukhin and Alexander Skidanov},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{polson2008practical,\n\ttitle        = {Practical filtering with sequential parameter learning},\n\tauthor       = {Polson, Nicholas G. and Stroud, Jonathan R. 
and M\\\"uller, Peter},\n\tyear         = 2008,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tpublisher    = {Blackwell Publishing Ltd},\n\tvolume       = 70,\n\tnumber       = 2,\n\tpages        = {413--428},\n\tkeywords     = {Filtering, Markov chain Monte Carlo methods, Particle filtering, Sequential parameter learning, Spatiotemporal models, State space models}\n}\n@article{polyak1964some,\n\ttitle        = {Some methods of speeding up the convergence of iteration methods},\n\tauthor       = {Polyak, Boris T},\n\tyear         = 1964,\n\tjournal      = {USSR Computational Mathematics and Mathematical Physics},\n\tpublisher    = {Elsevier},\n\tvolume       = 4,\n\tnumber       = 5,\n\tpages        = {1--17}\n}\n@techreport{pomerleau1989alvinn,\n\ttitle        = {Alvinn: An autonomous land vehicle in a neural network},\n\tauthor       = {Pomerleau, Dean A},\n\tyear         = 1989,\n\tinstitution  = {CARNEGIE-MELLON UNIV PITTSBURGH PA ARTIFICIAL INTELLIGENCE AND PSYCHOLOGY~…}\n}\n@article{pomerleau1991efficient,\n\ttitle        = {Efficient training of artificial neural networks for autonomous navigation},\n\tauthor       = {Dean A Pomerleau},\n\tyear         = 1991,\n\tjournal      = {Neural Computation},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {88--97}\n}\n@inproceedings{poon09semantic,\n\ttitle        = {Unsupervised Semantic Parsing},\n\tauthor       = {Hoifung Poon and Pedro Domingos},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{poon10ontology,\n\ttitle        = {Unsupervised Ontology Induction from Text},\n\tauthor       = {Hoifung Poon and Pedro Domingos},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{poon2011sum,\n\ttitle        = {Sum-product networks: A new deep architecture},\n\tauthor       = {Hoifung Poon and Pedro Domingos},\n\tyear        
 = 2011,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {337--346}\n}\n@inproceedings{poon2013gusp,\n\ttitle        = {Grounded Unsupervised Semantic Parsing},\n\tauthor       = {Hoifung Poon},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{popescu03precise,\n\ttitle        = {Towards a Theory of Natural Language Interfaces to Databases},\n\tauthor       = {Ana-Maria Popescu and Oren Etzioni and Henry Kautz},\n\tyear         = 2003,\n\tbooktitle    = {International Conference on Intelligent User Interfaces (IUI)},\n\tpages        = {149--157}\n}\n@article{porcel2018chest,\n\ttitle        = {Chest tube drainage of the pleural space: a concise review for pulmonologists},\n\tauthor       = {Jos{\\'e} M Porcel},\n\tyear         = 2018,\n\tjournal      = {Tuberculosis and Respiratory Diseases},\n\tvolume       = 81,\n\tnumber       = 2,\n\tpages        = {106--115}\n}\n@article{porter80stem,\n\ttitle        = {An algorithm for suffix stripping},\n\tauthor       = {M. F. 
Porter},\n\tyear         = 1980,\n\tjournal      = {Program: electronic library and information systems},\n\tvolume       = 14,\n\tpages        = {130--137}\n}\n@inproceedings{post09ptsg,\n\ttitle        = {{B}ayesian learning of a tree substitution grammar},\n\tauthor       = {Matt Post and Daniel Gildea},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)}\n}\n@article{post2015simplex,\n\ttitle        = {The simplex method is strongly polynomial for deterministic {M}arkov decision processes},\n\tauthor       = {Post, Ian and Ye, Yinyu},\n\tyear         = 2015,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 40,\n\tnumber       = 4,\n\tpages        = {859--868}\n}\n@techreport{pottier05modern,\n\ttitle        = {A Modern Eye on {ML} Type Inference: Old Techniques and Recent Developments},\n\tauthor       = {François Pottier},\n\tyear         = 2005,\n\tinstitution  = {INRIA}\n}\n@phdthesis{potts03thesis,\n\ttitle        = {The Logic of Conventional Implicatures},\n\tauthor       = {Christopher Potts},\n\tyear         = 2003,\n\tschool       = {UC Santa Cruz}\n}\n@inproceedings{potts2012cards,\n\ttitle        = {Goal-Driven Answers in the {C}ards Dialogue Corpus},\n\tauthor       = {Christopher Potts},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 30th West Coast Conference on Formal Linguistics},\n\tpages        = {1--20}\n}\n@inproceedings{pourchot2018cem,\n\ttitle        = {{CEM}-{RL}: Combining evolutionary and gradient-based methods for policy search},\n\tauthor       = {Pourchot and Sigaud},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=BkeU5j0ctQ}\n}\n@incollection{powell1994estimation,\n\ttitle        = {Estimation of semiparametric models},\n\tauthor       = {James L. 
Powell},\n\tyear         = 1994,\n\tbooktitle    = {Handbook of Econometrics},\n\tvolume       = 4,\n\tpages        = {2443--2521}\n}\n@inproceedings{prabhu2019understanding,\n\ttitle        = {Understanding Adversarial Robustness Through Loss Landscape Geometries},\n\tauthor       = {Vinay Uday Prabhu and Joyce Xu and Dian Ang Yap and John Whaley},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@inproceedings{prabhu2021sentry,\n\ttitle        = {Selective Entropy Optimization via Committee Consistency for Unsupervised Domain Adaptation},\n\tauthor       = {Viraj Prabhu and Shivam Khare and Deeksha Karthik and Judy Hoffman},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{pradel2018deepbugs,\n\ttitle        = {Deepbugs: A learning approach to name-based bug detection},\n\tauthor       = {Michael Pradel and Koushik Sen},\n\tyear         = 2018,\n\tbooktitle    = {Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA)}\n}\n@inproceedings{prajna2004safety,\n\ttitle        = {Safety verification of hybrid systems using barrier certificates},\n\tauthor       = {Prajna, Stephen and Jadbabaie, Ali},\n\tyear         = 2004,\n\tbooktitle    = {International Workshop on Hybrid Systems: Computation and Control},\n\tpages        = {477--492},\n\torganization = {Springer}\n}\n@article{prajna2005necessity,\n\ttitle        = {On the necessity of barrier certificates},\n\tauthor       = {Prajna, Stephen and Rantzer, Anders},\n\tyear         = 2005,\n\tjournal      = {IFAC Proceedings Volumes},\n\tpublisher    = {Elsevier},\n\tvolume       = 38,\n\tnumber       = 1,\n\tpages        = {526--531}\n}\n@inproceedings{prasad2008penn,\n\ttitle        = {The {P}enn Discourse TreeBank 2.0},\n\tauthor       = {Rashmi Prasad and Nikhil Dinesh and Alan Lee and Eleni Miltsakaki and Livio Robaldo and Aravind K Joshi and Bonnie L 
Webber},\n\tyear         = 2008,\n\tbooktitle    = {LREC}\n}\n@article{prasad2018robust,\n\ttitle        = {Robust estimation via robust gradient estimation},\n\tauthor       = {Adarsh Prasad and Arun Sai Suggala and Sivaram Balakrishnan and Pradeep Ravikumar},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.06485}\n}\n@inproceedings{prashanth2013actor,\n\ttitle        = {Actor-critic algorithms for risk-sensitive MDPs},\n\tauthor       = {Prashanth, LA and Ghavamzadeh, Mohammad},\n\tyear         = 2013,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {252--260}\n}\n@inproceedings{prashanth2014fast,\n\ttitle        = {Fast LSTD using stochastic approximation: Finite time analysis and application to traffic control},\n\tauthor       = {Prashanth, LA and Korda, Nathaniel and Munos, R{\\'e}mi},\n\tyear         = 2014,\n\tbooktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},\n\tpages        = {66--81},\n\torganization = {Springer}\n}\n@inproceedings{prashanth2014policy,\n\ttitle        = {Policy gradients for CVaR-constrained MDPs},\n\tauthor       = {Prashanth, LA},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Algorithmic Learning Theory},\n\tpages        = {155--169},\n\torganization = {Springer}\n}\n@article{pregibon1981logistic,\n\ttitle        = {Logistic regression diagnostics},\n\tauthor       = {Daryl Pregibon and others},\n\tyear         = 1981,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 9,\n\tnumber       = 4,\n\tpages        = {705--724}\n}\n@inproceedings{press2017language,\n\ttitle        = {Language Generation with Recurrent Generative Adversarial Networks without Pre-training},\n\tauthor       = {Ofir Press and Amir Bar and Ben Bogin and Jonathan Berant and Lior Wolf},\n\tyear         = 2017,\n\tbooktitle    = {First Workshop on Learning to Generate Natural 
Language@ICML}\n}\n@article{priamuthu2012online,\n\ttitle        = {Input online review data and related bias in recommender systems},\n\tauthor       = {Selwyn Piramuthu and Gaurav Kapoor and Wei Zhou and Sjouke Mauw},\n\tyear         = 2012,\n\tjournal      = {Decision Support Systems},\n\tvolume       = 53,\n\tpages        = {418--424}\n}\n@inproceedings{price1990atis,\n\ttitle        = {Evaluation of spoken language systems: The {ATIS} domain},\n\tauthor       = {Patti Price},\n\tyear         = 1990,\n\tbooktitle    = {Proceedings of the Third DARPA Speech and Natural Language Workshop},\n\tpages        = {91--95}\n}\n@inproceedings{priedhorsky2007creating,\n\ttitle        = {Creating, destroying, and restoring value in {W}ikipedia},\n\tauthor       = {Reid Priedhorsky and Jilin Chen and Shyong T. K. Lam and Katherine Panciera and Loren Terveen and John Riedl},\n\tyear         = 2007,\n\tbooktitle    = {International {ACM} Conference on Supporting Group Work},\n\tpages        = {259--268}\n}\n@inproceedings{prolat2015approximateDP,\n\ttitle        = {Approximate Dynamic Programming for Two-Player Zero-Sum Markov Games},\n\tauthor       = {Julien P{\\'e}rolat and Bruno Scherrer and Bilal Piot and Olivier Pietquin},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32th International Conference on International Conference on Machine Learning},\n\tseries       = {ICML'15}\n}\n@article{propp1996exact,\n\ttitle        = {Exact sampling with coupled {M}arkov chains and applications to statistical mechanics},\n\tauthor       = {JG Propp and DB Wilson},\n\tyear         = 1996,\n\tjournal      = {Random structures and Algorithms},\n\tvolume       = 9,\n\tpages        = {223--252}\n}\n@phdthesis{pross09grounded,\n\ttitle        = {Grounded Discourse Representation Theory: Towards a Semantics-Pragmatics Interface for Human-Machine Collaboration},\n\tauthor       = {Tillmann Pross},\n\tyear         = 2009,\n\tschool       = {University of 
Stuttgart}\n}\n@inproceedings{prsz18,\n\ttitle        = {Convergence Results for Neural Networks via Electrodynamics},\n\tauthor       = {Panigrahy, Rina and Rahimi, Ali and Sachdeva, Sushant and Zhang, Qiuyi},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1702.00458},\n\tbooktitle    = {ITCS}\n}\n@article{PRTV,\n\ttitle        = {Latent semantic indexing: a probabilistic analysis},\n\tauthor       = {C. Papadimitriou and P. Raghavan and H. Tamaki and S. Vempala},\n\tyear         = 2000,\n\tjournal      = {JCSS},\n\tpages        = {217--235},\n\tnote         = {Preliminary version in {\\em PODS} 1998}\n}\n@inproceedings{pruthi2019misspellings,\n\ttitle        = {Combating Adversarial Misspellings with Robust Word Recognition},\n\tauthor       = {Danish Pruthi and Bhuwan Dhingra and Zachary C. Lipton},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{pryzant2017domainmix,\n\ttitle        = {Effective Domain Mixing for Neural Machine Translation},\n\tauthor       = {Reid Pryzant and Denny Britz and Quoc V. Le},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the Second Conference on Machine Translation},\n\tpages        = {118--126}\n}\n@inproceedings{PSX11,\n\ttitle        = {A Spectral Algorithm for Latent Tree Graphical Models},\n\tauthor       = {A. Parikh and L. Song and E. P. 
Xing},\n\tyear         = 2011,\n\tbooktitle    = {ICML}\n}\n@inproceedings{pulina2010abstraction,\n\ttitle        = {An abstraction-refinement approach to verification of artificial neural networks},\n\tauthor       = {Luca Pulina and Armando Tacchella},\n\tyear         = 2010,\n\tbooktitle    = {Computer Aided Verification (CAV)},\n\tpages        = {243--257}\n}\n@article{pulley2012operational,\n\ttitle        = {Operational implementation of prospective genotyping for personalized medicine: the design of the Vanderbilt PREDICT project},\n\tauthor       = {Pulley, Jill M and Denny, Joshua C and Peterson, Josh F and Bernard, Gordon R and Vnencak-Jones, Cindy L and Ramirez, Andrea H and Delaney, Jessica T and Bowton, Erica and Brothers, Kyle and Johnson, Kevin and others},\n\tyear         = 2012,\n\tjournal      = {Clinical Pharmacology \\& Therapeutics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 92,\n\tnumber       = 1,\n\tpages        = {87--95}\n}\n@inproceedings{punyakanok05constrained,\n\ttitle        = {Learning and Inference over Constrained Output},\n\tauthor       = {Vasin Punyakanok and Dan Roth and Wen-tau Yih and Dav Zimak},\n\tyear         = 2005,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{purushotham2017variational,\n\ttitle        = {Variational Recurrent Adversarial Deep Domain Adaptation},\n\tauthor       = {Sanjay Purushotham and Wilka Carvalho and Tanachat Nilanon and Yan Liu},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{pust2015using,\n\ttitle        = {Using syntax-based machine translation to parse {E}nglish into abstract meaning representation},\n\tauthor       = {Michael Pust and Ulf Hermjakob and Kevin Knight and Daniel Marcu and Jonathan May},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@book{puterman2014markov,\n\ttitle     
   = {{M}arkov decision processes: discrete stochastic dynamic programming},\n\tauthor       = {Puterman, Martin L},\n\tyear         = 2014,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@article{putin2016deep,\n\ttitle        = {Deep biomarkers of human aging: application of deep neural networks to biomarker development},\n\tauthor       = {Evgeny Putin and Polina Mamoshina and Alexander Aliper and Mikhail Korzinkin and Alexey Moskalev and Alexey Kolosov and Alexander Ostrovskiy and Charles Cantor and Jan Vijg and Alex Zhavoronkov},\n\tyear         = 2016,\n\tjournal      = {Aging},\n\tvolume       = 8,\n\tnumber       = 5\n}\n@inproceedings{pw17,\n\ttitle        = {Nonlinear random matrix theory for deep learning},\n\tauthor       = {Pennington, Jeffrey and Worah, Pratik},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)}\n}\n@article{pyrkov2018extracting,\n\ttitle        = {Extracting biological age from biomedical data via deep learning: too much of a good thing?},\n\tauthor       = {Timothy V Pyrkov and Konstantin Slipensky and Mikhail Barg and Alexey Kondrashin and Boris Zhurov and Alexander Zenin and Mikhail Pyatnitskiy and Leonid Menshikov and Sergei Markov and Peter O Fedichev},\n\tyear         = 2018,\n\tjournal      = {Scientific Reports},\n\tvolume       = 8,\n\tnumber       = 1\n}\n@article{pytorchlightning2019,\n\ttitle        = {PyTorch Lightning},\n\tauthor       = {William {Falcon et al.}},\n\tyear         = 2019,\n\tjournal      = {GitHub. 
Note: https://github.com/PyTorchLightning/pytorch-lightning},\n\tvolume       = 3\n}\n@article{qi2005eigenvalues,\n\ttitle        = {Eigenvalues of a real supersymmetric tensor},\n\tauthor       = {Qi, L.},\n\tyear         = 2005,\n\tjournal      = {Journal of Symbolic Computation},\n\tpublisher    = {Elsevier},\n\tvolume       = 40,\n\tnumber       = 6,\n\tpages        = {1302--1324}\n}\n@article{qi2020stanza,\n\ttitle        = {Stanza: A Python Natural Language Processing Toolkit for Many Human Languages},\n\tauthor       = {Peng Qi and Yuhao Zhang and Yuhui Zhang and Jason Bolton and Christopher D. Manning},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{qing2014gradable,\n\ttitle        = {Gradable adjectives, vagueness, and optimal language use: A speaker-oriented model},\n\tauthor       = {Ciyang Qing and Michael Franke},\n\tyear         = 2014,\n\tbooktitle    = {Semantics and Linguistic Theory},\n\tvolume       = 24,\n\tpages        = {23--41}\n}\n@article{qiu2011discovering,\n\ttitle        = {Discovering biological progression underlying microarray samples},\n\tauthor       = {Peng Qiu and Andrew J Gentles and Sylvia K Plevritis},\n\tyear         = 2011,\n\tjournal      = {PLoS Computational Biology},\n\tvolume       = 7,\n\tnumber       = 4\n}\n@article{QRZ-arbitrary-sampling,\n\ttitle        = {Randomized Dual Coordinate Ascent with Arbitrary Sampling},\n\tauthor       = {Zheng Qu and Peter Richt{\\'{a}}rik and Tong Zhang},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1411.5873}\n}\n@article{qu2011generalized,\n\ttitle        = {Generalized constraint neural network regression model subject to linear priors},\n\tauthor       = {Ya-Jun Qu and Bao-Gang Hu},\n\tyear         = 2011,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 22,\n\tnumber       = 12,\n\tpages        = {2447--2459}\n}\n@inproceedings{quadrianto08labels,\n\ttitle        = {Estimating labels from label 
proportions},\n\tauthor       = {Novi Quadrianto and Alex J. Smola and Tiberio S. Caetano and Quoc V. Le},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {776--783}\n}\n@inproceedings{quadrianto2019discovering,\n\ttitle        = {Discovering fair representations in the data domain},\n\tauthor       = {Novi Quadrianto and Viktoriia Sharmanska and Oliver Thomas},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {8227--8236}\n}\n@article{quah2006maximum,\n\ttitle        = {Maximum reward reinforcement learning: A non-cumulative reward criterion},\n\tauthor       = {Quah, Kian Hong and Quek, Chai},\n\tyear         = 2006,\n\tjournal      = {Expert Systems with Applications},\n\tpublisher    = {Elsevier},\n\tvolume       = 31,\n\tnumber       = 2,\n\tpages        = {351--359}\n}\n@article{quang2019factornet,\n\ttitle        = {FactorNet: a deep learning framework for predicting cell type specific transcription factor binding from nucleotide-resolution sequential data},\n\tauthor       = {Daniel Quang and Xiaohui Xie},\n\tyear         = 2019,\n\tjournal      = {Methods},\n\tvolume       = 166,\n\tpages        = {40--47}\n}\n@inproceedings{quattoni04crf,\n\ttitle        = {Conditional Random Fields for Object Recognition},\n\tauthor       = {A. Quattoni and M. Collins and T. 
Darrell},\n\tyear         = 2004,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{quattoni2007hidden,\n\ttitle        = {Hidden-state conditional random fields},\n\tauthor       = {Ariadna Quattoni and Sybor Wang and Louis-Philippe Morency and Michael Collins and Trevor Darrell},\n\tyear         = 2007,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 29,\n\tnumber       = 10,\n\tpages        = {1848--1852}\n}\n@book{quinonero2009dataset,\n\ttitle        = {Dataset shift in machine learning},\n\tauthor       = {Joaquin {Qui\\~nonero-Candela} and Masashi Sugiyama and Anton Schwaighofer and Neil D. Lawrence},\n\tyear         = 2009,\n\tpublisher    = {The MIT Press}\n}\n@inproceedings{quirk2015language,\n\ttitle        = {Language to Code: Learning Semantic Parsers for If-This-Then-That Recipes},\n\tauthor       = {Chris Quirk and Raymond J. Mooney and Michel Galley},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{rabanser2019failing,\n\ttitle        = {Failing loudly: An empirical study of methods for detecting dataset shift},\n\tauthor       = {Stephan Rabanser and Stephan G{\\\"u}nnemann and Zachary Lipton},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1396--1408}\n}\n@article{rabiner1989tutorial,\n\ttitle        = {A tutorial on hidden Markov models and selected applications in speech recognition},\n\tauthor       = {Rabiner, Lawrence R},\n\tyear         = 1989,\n\tjournal      = {Proceedings of the IEEE},\n\tpublisher    = {IEEE},\n\tvolume       = 77,\n\tnumber       = 2,\n\tpages        = {257--286}\n}\n@inproceedings{rabinovich2017abstract,\n\ttitle        = {Abstract Syntax Networks for Code Generation and Semantic Parsing},\n\tauthor       = {Maxim Rabinovich and Mitchell Stern and Dan Klein},\n\tyear         = 
2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{rad2018scalable,\n\ttitle        = {A scalable estimate of the extra-sample prediction error via approximate leave-one-out},\n\tauthor       = {Kamiar Rahnama Rad and Arian Maleki},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.10243}\n}\n@article{radford2016dcgan,\n\ttitle        = {Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks},\n\tauthor       = {Alec Radford and Luke Metz and Soumith Chintala},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{radford2018improving,\n\ttitle        = {Improving language understanding by generative pre-training},\n\tauthor       = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},\n\tyear         = 2018,\n\tinstitution  = {OpenAI}\n}\n@article{radford2019language,\n\ttitle        = {Language models are unsupervised multitask learners},\n\tauthor       = {Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever},\n\tyear         = 2019,\n\tjournal      = {OpenAI Blog},\n\tvolume       = 1,\n\tnumber       = 8\n}\n@inproceedings{radford2021clip,\n\ttitle        = {Learning Transferable Visual Models From Natural Language Supervision},\n\tauthor       = {Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tvolume       = 139,\n\tpages        = {8748--8763}\n}\n@article{radinsky2015data,\n\ttitle        = {Data monopolists like {G}oogle are threatening the economy},\n\tauthor       = {Kira Radinsky},\n\tyear         = 2015,\n\tjournal      = {Harvard Business Review},\n\tvolume       = 2\n}\n@inproceedings{radlinski2008learning,\n\ttitle        
= {Learning diverse rankings with multi-armed bandits},\n\tauthor       = {Radlinski, Filip and Kleinberg, Robert and Joachims, Thorsten},\n\tyear         = 2008,\n\tbooktitle    = {Proceedings of the 25th international conference on Machine learning},\n\tpages        = {784--791},\n\torganization = {ACM}\n}\n@article{raffel2019exploring,\n\ttitle        = {Exploring the limits of transfer learning with a unified text-to-text transformer},\n\tauthor       = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.10683}\n}\n@inproceedings{rafiei1997similarity,\n\ttitle        = {Similarity-based queries for time series data},\n\tauthor       = {Rafiei, Davood and Mendelzon, Alberto},\n\tyear         = 1997,\n\tbooktitle    = {\n\t\tProceedings of the 1997 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Tucson, Arizona, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '97},\n\tpages        = {13--25},\n\tdoi          = {http://doi.acm.org/10.1145/253260.253264},\n\tisbn         = {0-89791-911-4},\n\tacmid        = 253264,\n\tnumpages     = 13\n}\n@inproceedings{raghavan2005interactive,\n\ttitle        = {InterActive Feature Selection},\n\tauthor       = {Hema Raghavan and Omid Madani and Rosie Jones},\n\tyear         = 2005,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tvolume       = 5,\n\tpages        = {841--846}\n}\n@inproceedings{raghavan2018externalities,\n\ttitle        = {The externalities of exploration and how data diversity helps exploitation},\n\tauthor       = {Manish Raghavan and Aleksandrs Slivkins and Jennifer Vaughan Wortman and Zhiwei Steven Wu},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        
= {1724--1738}\n}\n@inproceedings{raghavendra2010graph,\n\ttitle        = {Graph expansion and the unique games conjecture},\n\tauthor       = {Raghavendra, Prasad and Steurer, David},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the forty-second ACM symposium on Theory of computing},\n\tpages        = {755--764}\n}\n@article{raghu2016expressive,\n\ttitle        = {On the expressive power of deep neural networks},\n\tauthor       = {Raghu, Maithra and Poole, Ben and Kleinberg, Jon and Ganguli, Surya and Sohl-Dickstein, Jascha},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.05336}\n}\n@inproceedings{raghu2019direct,\n\ttitle        = {Direct uncertainty prediction for medical second opinions},\n\tauthor       = {Maithra Raghu and Katy Blumer and Rory Sayres and Ziad Obermeyer and Bobby Kleinberg and Sendhil Mullainathan and Jon Kleinberg},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {5281--5290}\n}\n@inproceedings{raghu2019transfusion,\n\ttitle        = {Transfusion: Understanding transfer learning for medical imaging},\n\tauthor       = {Raghu, Maithra and Zhang, Chiyuan and Kleinberg, Jon and Bengio, Samy},\n\tyear         = 2019,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3347--3357}\n}\n@article{raghu2020survey,\n\ttitle        = {A survey of deep learning for scientific discovery},\n\tauthor       = {Maithra Raghu and Eric Schmidt},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.11755}\n}\n@inproceedings{raghunathan2016linear,\n\ttitle        = {Estimation from Indirect Supervision with Linear Moments},\n\tauthor       = {Aditi Raghunathan and Roy Frostig and John Duchi and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{raghunathan2018certified,\n\ttitle        = {Certified defenses against adversarial 
examples},\n\tauthor       = {Aditi Raghunathan and Jacob Steinhardt and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{raghunathan2018sdp,\n\ttitle        = {Semidefinite relaxations for certifying robustness to adversarial examples},\n\tauthor       = {Aditi Raghunathan and Jacob Steinhardt and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{raghunathan2019hurt,\n\ttitle        = {Adversarial Training Can Hurt Generalization},\n\tauthor       = {Aditi Raghunathan and Sang Michael Xie and Fanny Yang and John C. Duchi and Percy Liang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.06032}\n}\n@inproceedings{raghunathan2020understanding,\n\ttitle        = {Understanding and Mitigating the Tradeoff Between Robustness and Accuracy},\n\tauthor       = {Aditi Raghunathan and Sang Michael Xie and Fanny Yang and John C. 
Duchi and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{raginsky2017non,\n\ttitle        = {Non-convex learning via stochastic gradient Langevin dynamics: a nonasymptotic analysis},\n\tauthor       = {Raginsky, Maxim and Rakhlin, Alexander and Telgarsky, Matus},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.03849},\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {1674--1703}\n}\n@inproceedings{Rahimi05,\n\ttitle        = {Learning Appearance Manifolds from Video},\n\tauthor       = {Ali Rahimi and Ben Recht and Trevor Darrell},\n\tyear         = 2005,\n\tbooktitle    = {Proc.~IEEE CVPR},\n\tdate-added   = {2016-04-04 17:32:01 +0000},\n\tdate-modified = {2016-04-04 17:32:01 +0000}\n}\n@inproceedings{rahimi2007random,\n\ttitle        = {Random Features for Large-Scale Kernel Machines},\n\tauthor       = {Ali Rahimi and Ben Recht},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{raileanu2018modeling,\n\ttitle        = {Modeling Others using Oneself in Multi-Agent Reinforcement Learning},\n\tauthor       = {Roberta Raileanu and Emily L. Denton and Arthur Szlam and Rob Fergus},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{raileanu2020ride,\n\ttitle        = {RIDE: Rewarding Impact-Driven Exploration for Procedurally-Generated Environments},\n\tauthor       = {Roberta Raileanu and Tim Rockt{\\\"{a}}schel},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{raina04hybrid,\n\ttitle        = {Classification with hybrid generative/discriminative models},\n\tauthor       = {R. Raina and Y. Shen and A. Ng and A. 
McCallum},\n\tyear         = 2004,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{raina2006constructing,\n\ttitle        = {Constructing informative priors using transfer learning},\n\tauthor       = {Raina, Rajat and Ng, Andrew Y and Koller, Daphne},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 23rd international conference on Machine learning},\n\tpages        = {713--720},\n\torganization = {ACM}\n}\n@inproceedings{raina2007self,\n\ttitle        = {Self-taught learning: transfer learning from unlabeled data},\n\tauthor       = {Rajat Raina and Alexis Battle and Honglak Lee and Benjamin Packer and Andrew Y. Ng},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {759--766}\n}\n@inproceedings{rajeswaran2020game,\n\ttitle        = {A game theoretic framework for model based reinforcement learning},\n\tauthor       = {Rajeswaran, Aravind and Mordatch, Igor and Kumar, Vikash},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {7953--7963},\n\torganization = {PMLR}\n}\n@inproceedings{rajpurkar2016squad,\n\ttitle        = {{SQuAD}: 100,000+ Questions for Machine Comprehension of Text},\n\tauthor       = {Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{rajpurkar2018squadrun,\n\ttitle        = {Know What You Don't Know: Unanswerable Questions for {SQuAD}},\n\tauthor       = {Pranav Rajpurkar and Robin Jia and Percy Liang},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{rajpurkar2020chexpedition,\n\ttitle        = {CheXpedition: Investigating Generalization Challenges for Translation of Chest X{-}ray Algorithms to the Clinical Setting},\n\tauthor       = {Pranav Rajpurkar and Anirudh 
Joshi and Anuj Pareek and Phil Chen and Amirhossein Kiani and Jeremy Irvin and Andrew Ng and Matthew Lungren},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.11379}\n}\n@article{rakelly2019efficient,\n\ttitle        = {Efficient off-policy meta-reinforcement learning via probabilistic context variables},\n\tauthor       = {Kate Rakelly and Aurick Zhou and Deirdre Quillen and Chelsea Finn and Sergey Levine},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.08254}\n}\n@article{rakhlin2009lecture,\n\ttitle        = {Lecture notes on online learning},\n\tauthor       = {Rakhlin, Alexander},\n\tyear         = 2009,\n\tjournal      = {Draft},\n\tnote         = {Available at \\url{http://www-stat.wharton.upenn.edu/~rakhlin/courses/stat991/papers/lecture_notes.pdf}}\n}\n@inproceedings{rakhlin2011making,\n\ttitle        = {Making gradient descent optimal for strongly convex stochastic optimization},\n\tauthor       = {Rakhlin, Alexander and Shamir, Ohad and Sridharan, Karthik},\n\tyear         = 2012,\n\tbooktitle    = {ICML}\n}\n@inproceedings{rakhlin2011online,\n\ttitle        = {Online learning: stochastic, constrained, and smoothed adversaries},\n\tauthor       = {Rakhlin, Alexander and Sridharan, Karthik and Tewari, Ambuj},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the 24th International Conference on Neural Information Processing Systems},\n\tpages        = {1764--1772}\n}\n@inproceedings{rakhlin2013online,\n\ttitle        = {Online Learning with Predictable Sequences},\n\tauthor       = {Alexander Rakhlin and Karthik Sridharan},\n\tyear         = 2013,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {993--1019}\n}\n@inproceedings{rakhlin2013optimization,\n\ttitle        = {Optimization, learning, and games with predictable sequences},\n\tauthor       = {Rakhlin, Sasha and Sridharan, Karthik},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing 
Systems},\n\tpages        = {3066--3074}\n}\n@article{rakhlin2015online,\n\ttitle        = {Online Learning via Sequential Complexities},\n\tauthor       = {Rakhlin, Alexander and Sridharan, Karthik and Tewari, Ambuj},\n\tyear         = 2015,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 16,\n\tnumber       = 6,\n\tpages        = {155--186}\n}\n@article{rakhlin2015sequential,\n\ttitle        = {Sequential complexities and uniform martingale laws of large numbers},\n\tauthor       = {Rakhlin, Alexander and Sridharan, Karthik and Tewari, Ambuj},\n\tyear         = 2015,\n\tjournal      = {Probability Theory and Related Fields},\n\tpublisher    = {Springer},\n\tvolume       = 161,\n\tnumber       = {1-2},\n\tpages        = {111--153}\n}\n@article{ram1986government,\n\ttitle        = {Government size and economic growth: A new framework and some evidence from cross-section and time-series data},\n\tauthor       = {Rati Ram},\n\tyear         = 1986,\n\tjournal      = {The American Economic Review},\n\tvolume       = 76,\n\tnumber       = 1,\n\tpages        = {191--203}\n}\n@article{ram2009incremental,\n\ttitle        = {Incremental stochastic subgradient algorithms for convex optimization},\n\tauthor       = {Ram, S. Sundhar and Nedi\\'c, Angelia and Veeravalli, Venugopal V.},\n\tyear         = 2009,\n\tjournal      = {SIAM J. 
Optim.},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = {691--717},\n\tdoi          = {10.1137/080726380},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/080726380},\n\tfjournal     = {SIAM Journal on Optimization},\n\tmrclass      = {90C25 (68W15)},\n\tmrnumber     = 2515792,\n\tmrreviewer   = {Teemu Pennanen}\n}\n@inproceedings{ram2011density,\n\ttitle        = {Density estimation trees},\n\tauthor       = {Parikshit Ram and Alexander G Gray},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {627--635}\n}\n@article{ramachandran2017searching,\n\ttitle        = {Searching for activation functions},\n\tauthor       = {Ramachandran, Prajit and Zoph, Barret and Le, Quoc V},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.05941}\n}\n@article{ramachandran2018pretraining,\n\ttitle        = {Unsupervised pretraining for sequence to sequence learning},\n\tauthor       = {P. Ramachandran and P. J. Liu and Q. V. 
Le},\n\tyear         = 2018,\n\tjournal      = {arXiv}\n}\n@inproceedings{ramanathan2013event,\n\ttitle        = {Video Event Understanding using Natural Language Descriptions},\n\tauthor       = {Vignesh Ramanathan and Percy Liang and Li Fei-Fei},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{ramanathan2014linking,\n\ttitle        = {Linking people with \"their\" names using coreference resolution},\n\tauthor       = {Vignesh Ramanathan and Armand Joulin and Percy Liang and Li Fei-Fei},\n\tyear         = 2014,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@article{ramdas2012optimal,\n\ttitle        = {Optimal rates for first-order stochastic convex optimization under Tsybakov noise condition},\n\tauthor       = {Ramdas, Aaditya and Singh, Aarti},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1207.3012}\n}\n@inproceedings{ramos2008c,\n\ttitle        = {C-Oracle: Predictive Thermal Management for Data Centers},\n\tauthor       = {Luiz Ramos and Ricardo Bianchini},\n\tyear         = 2008,\n\tmonth        = feb,\n\tbooktitle    = {\n\t\tHPCA 2008. 
IEEE 14th International Symposium on High Performance\n\n\t\tComputer Architecture\n\t},\n\tpages        = {111--122},\n\tdoi          = {10.1109/HPCA.2008.4658632},\n\tissn         = {1530-0897},\n\tkeywords     = {\n\t\tC-Oracle;Internet services;data centers;dynamic voltage/frequency\n\n\t\tscaling;load redistribution;multitier services;power-dense server\n\n\t\tclusters;predictive thermal management policy;software infrastructure;computer\n\n\t\tcentres;thermal management (packaging);\n\t}\n}\n@inproceedings{randlov1998learning,\n\ttitle        = {Learning to Drive a Bicycle Using Reinforcement Learning and Shaping},\n\tauthor       = {Jette Randl{\\o{}}v and Preben Alstr{\\o{}}m},\n\tyear         = 1998,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@book{RandomizedAlgorithms,\n\ttitle        = {Randomized algorithms},\n\tauthor       = {Motwani, Rajeev and Raghavan, Prabhakar},\n\tyear         = 1995,\n\tpublisher    = {Cambridge University Press},\n\taddress      = {New York, NY, USA}\n}\n@article{RandomMatrices:Tropp,\n\ttitle        = {User-Friendly Tail Bounds for Sums of Random Matrices},\n\tauthor       = {Joel A. 
Tropp},\n\tyear         = 2012,\n\tjournal      = {Foundations of Computational Mathematics},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {389--434}\n}\n@inproceedings{ranganath2014black,\n\ttitle        = {Black box variational inference},\n\tauthor       = {Rajesh Ranganath and Sean Gerrish and David Blei},\n\tyear         = 2014,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {814--822}\n}\n@inproceedings{rangel2014features,\n\ttitle        = {Features and Pitfalls that Users Should Seek in Natural Language Interfaces to Databases},\n\tauthor       = {Rodolfo A Pazos Rangel and Marco A Aguirre and Juan J González and Juan Martín Carpio},\n\tyear         = 2014,\n\tbooktitle    = {Recent Advances on Hybrid Approaches for Designing Intelligent Systems},\n\tpages        = {617--630}\n}\n@inproceedings{ranger2007evaluating,\n\ttitle        = {Evaluating MapReduce for Multi-core and Multiprocessor Systems},\n\tauthor       = {\n\t\tRanger, Colby and Raghuraman, Ramanan and Penmetsa, Arun and Bradski,\n\n\t\tGary and Kozyrakis, Christos\n\t},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 2007 IEEE 13th International Symposium on High\n\n\t\tPerformance Computer Architecture\n\t},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tpages        = {13--24},\n\tdoi          = {10.1109/HPCA.2007.346181},\n\tisbn         = {1-4244-0804-0},\n\tacmid        = 1318097,\n\tnumpages     = 12\n}\n@article{ranzato2015sequence,\n\ttitle        = {Sequence level training with recurrent neural networks},\n\tauthor       = {Marc'Aurelio Ranzato and Sumit Chopra and Michael Auli and Wojciech Zaremba},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.06732}\n}\n@book{rao2007linear,\n\ttitle        = {Linear Models and Generalizations: Least Squares and Alternatives},\n\tauthor       = {C. 
Radhakrishna Rao and Helge Toutenburg and Shalabh and Christian Heumann},\n\tyear         = 2007,\n\tpublisher    = {Springer Publishing Company}\n}\n@article{rao2018learning,\n\ttitle        = {Learning to Ask Good Questions: Ranking Clarification Questions using Neural Expected Value of Perfect Information},\n\tauthor       = {Sudha Rao and Hal Daum\\'{e} III},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.04655}\n}\n@article{rashkin2018know,\n\ttitle        = {{I} Know the Feeling: Learning to Converse with Empathy},\n\tauthor       = {Hannah Rashkin and Eric Michael Smith and Margaret Li and Y-Lan Boureau},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.00207}\n}\n@article{raskutti2012minimax,\n\ttitle        = {Minimax-optimal rates for sparse additive models over kernel classes via convex programming},\n\tauthor       = {Raskutti, Garvesh and Wainwright, Martin J and Yu, Bin},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 13,\n\tnumber       = {Feb},\n\tpages        = {389--427}\n}\n@article{raskutti2015information,\n\ttitle        = {The information geometry of mirror descent},\n\tauthor       = {Raskutti, Garvesh and Mukherjee, Sayan},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {IEEE},\n\tvolume       = 61,\n\tnumber       = 3,\n\tpages        = {1451--1457}\n}\n@article{rasmussen1996delve,\n\ttitle        = {Delve data for evaluating learning in valid experiments},\n\tauthor       = {Rasmussen, Carl Edward and Neal, Radford M and Hinton, Geoffrey and van Camp, Drew and Revow, Michael and Ghahramani, Zoubin and Kustra, Rafal and Tibshirani, Rob},\n\tyear         = 1996,\n\tjournal      = {URL http://www.cs.toronto.
edu/~delve}\n}\n@article{ratcliff1990connectionist,\n\ttitle        = {Connectionist models of recognition memory: constraints imposed by learning and forgetting functions},\n\tauthor       = {Roger Ratcliff},\n\tyear         = 1990,\n\tjournal      = {Psychological review},\n\tvolume       = 97\n}\n@inproceedings{ratinov2011local,\n\ttitle        = {Local and Global Algorithms for Disambiguation to {W}ikipedia},\n\tauthor       = {Lee Ratinov and Dan Roth and D. Downey and M. Anderson},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{ratliff2009chomp,\n\ttitle        = {{CHOMP}: Gradient Optimization Techniques for Efficient Motion Planning},\n\tauthor       = {N. Ratliff and M. Zucker and D. Bagnell and S. Srinivasa},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@article{ratnaparkhi02surface,\n\ttitle        = {Trainable approaches to surface natural language generation and their application to conversational dialog systems},\n\tauthor       = {Adwait Ratnaparkhi},\n\tyear         = 2002,\n\tjournal      = {Computer, Speech \\& Language},\n\tvolume       = 16,\n\tpages        = {435--455}\n}\n@phdthesis{ratnaparkhi98thesis,\n\ttitle        = {Maximum entropy models for natural language ambiguity resolution},\n\tauthor       = {Adwait Ratnaparkhi},\n\tyear         = 1998,\n\tschool       = {University of Pennsylvania}\n}\n@inproceedings{ratner2016data,\n\ttitle        = {Data programming: Creating large training sets, quickly},\n\tauthor       = {Alexander J Ratner and Christopher M De Sa and Sen Wu and Daniel Selsam and Christopher R\\'{e}},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3567--3575}\n}\n@inproceedings{ratner2017snorkel,\n\ttitle        = {Snorkel: Rapid training data creation with weak supervision},\n\tauthor       = {Alexander Ratner and Stephen 
H Bach and Henry Ehrenberg and Jason Fries and Sen Wu and Christopher R\\'{e}},\n\tyear         = 2017,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tnumber       = 3,\n\tpages        = {269--282}\n}\n@article{rauch1965maximum,\n\ttitle        = {Maximum likelihood estimates of linear dynamic systems},\n\tauthor       = {Rauch, H. E. and Tung, F. and Striebel, C. T.},\n\tyear         = 1965,\n\tmonth        = aug,\n\tjournal      = {AIAA Journal},\n\tvolume       = 3,\n\tnumber       = 8,\n\tpages        = {1445--1450},\n\tdoi          = {10.2514/3.3166},\n\tissn         = {0001-1452},\n\tciteulike-article-id = 9533564,\n\tciteulike-linkout-0 = {http://dx.doi.org/10.2514/3.3166},\n\tkeywords     = {smoother},\n\tposted-at    = {2011-07-11 21:56:40},\n\tpriority     = 2\n}\n@inproceedings{ravi2011deciphering,\n\ttitle        = {Deciphering foreign language},\n\tauthor       = {Sujith Ravi and Kevin Knight},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {12--21}\n}\n@inproceedings{ravi2017metalearning,\n\ttitle        = {Optimization as a Model for Few-shot Learning},\n\tauthor       = {Sachin Ravi and Hugo Larochelle},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{ravichandran2002learning,\n\ttitle        = {Learning surface text patterns for a question answering system},\n\tauthor       = {Deepak Ravichandran and Eduard Hovy},\n\tyear         = 2002,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {41--47}\n}\n@inproceedings{ravikumar2006quadratic,\n\ttitle        = {Quadratic programming relaxations for metric labeling and {M}arkov random field {MAP} estimation},\n\tauthor       = {P. Ravikumar and J. 
Lafferty},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {737--744}\n}\n@inproceedings{ravuri2019seeing,\n\ttitle        = {Seeing is Not Necessarily Believing: Limitations of BigGANs for Data Augmentation},\n\tauthor       = {Suman Ravuri and Oriol Vinyals},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@misc{rawlison1976letterpos,\n\ttitle        = {The significance of letter position in word recognition},\n\tauthor       = {Graham Ernest Rawlinson},\n\tyear         = 1976,\n\thowpublished = {Ph.D. thesis, University of Nottingham}\n}\n@book{rawls2001,\n\ttitle        = {Justice as fairness: a restatement},\n\tauthor       = {John Rawls},\n\tyear         = 2001,\n\tpublisher    = {Harvard University Press}\n}\n@book{rawls2009,\n\ttitle        = {A theory of justice: Revised edition},\n\tauthor       = {John Rawls},\n\tyear         = 2009,\n\tpublisher    = {Harvard University Press}\n}\n@inproceedings{raymond2007generative,\n\ttitle        = {Generative and discriminative algorithms for spoken language understanding},\n\tauthor       = {Christian Raymond and Giuseppe Riccardi},\n\tyear         = 2007,\n\tbooktitle    = {InterSpeech}\n}\n@inproceedings{raz2016fast,\n\ttitle        = {Fast learning requires good memory: A time-space lower bound for parity learning},\n\tauthor       = {Raz, Ran},\n\tyear         = 2016,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2016 IEEE 57th Annual Symposium on},\n\tpages        = {266--275},\n\torganization = {IEEE}\n}\n@inproceedings{raz2017time,\n\ttitle        = {A Time-Space Lower Bound for a Large Class of Learning Problems},\n\tauthor       = {Raz, Ran},\n\tyear         = 2017,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2017 IEEE 58th Annual Symposium on},\n\tvolume       = 24,\n\torganization = {IEEE}\n}\n@article{razin2020implicit,\n\ttitle        = 
{Implicit Regularization in Deep Learning May Not Be Explainable by Norms},\n\tauthor       = {Razin, Noam and Cohen, Nadav},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.06398}\n}\n@inproceedings{RBL,\n\ttitle        = {Sparse Feature Learning for Deep Belief Networks},\n\tauthor       = {Marc'Aurelio Ranzato and Y{-}Lan Boureau and Yann LeCun},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems 20, Proceedings of the Twenty-First Annual Conference on Neural Information Processing Systems, Vancouver, British Columbia, Canada, December 3-6, 2007},\n\tpages        = {1185--1192},\n\turl          = {http://papers.nips.cc/paper/3363-sparse-feature-learning-for-deep-belief-networks},\n\tcrossref     = {DBLP:conf/nips/2007},\n\ttimestamp    = {Thu, 11 Dec 2014 17:34:08 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/RanzatoBL07},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{rebrova2015coverings,\n\ttitle        = {Coverings of random ellipsoids, and invertibility of matrices with iid heavy-tailed entries},\n\tauthor       = {Elizaveta Rebrova and Konstantin Tikhomirov},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{rebrova2016norms,\n\ttitle        = {Norms of random matrices: local and global problems},\n\tauthor       = {Elizaveta Rebrova and Roman Vershynin},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{recasens2010paraphrase,\n\ttitle        = {On paraphrase and coreference},\n\tauthor       = {Marta Recasens and Marta Vila},\n\tyear         = 2010,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 36,\n\tnumber       = 4,\n\tpages        = {639--647}\n}\n@inproceedings{recasens2013bias,\n\ttitle        = {Linguistic Models for Analyzing and Detecting Biased Language},\n\tauthor       = {Marta Recasens and Cristian Danescu-Niculescu-Mizil and Dan Jurafsky},\n\tyear         = 
2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@misc{Recht09,\n\ttitle        = {A simpler approach to matrix completion},\n\tauthor       = {B.~Recht},\n\tyear         = 2009,\n\tnote         = {arXiv:0910.0651v2}\n}\n@article{Recht11,\n\ttitle        = {A Simpler Approach to Matrix Completion},\n\tauthor       = {Recht, Benjamin},\n\tyear         = 2011,\n\tmonth        = dec,\n\tjournal      = {J. Mach. Learn. Res.},\n\tpublisher    = {JMLR.org},\n\tvolume       = 12,\n\tpages        = {3413--3430},\n\tissn         = {1532-4435},\n\turl          = {http://dl.acm.org/citation.cfm?id=1953048.2185803},\n\tissue_date   = {2/1/2011},\n\tnumpages     = 18,\n\tacmid        = 2185803\n}\n@inproceedings{recht2011hogwild,\n\ttitle        = {Hogwild: A lock-free approach to parallelizing stochastic gradient descent},\n\tauthor       = {Recht, Benjamin and Re, Christopher and Wright, Stephen and Niu, Feng},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {693--701}\n}\n@article{recht2013parallel,\n\ttitle        = {Parallel stochastic gradient algorithms for large-scale matrix completion},\n\tauthor       = {B. Recht and C. 
Ré},\n\tyear         = 2013,\n\tjournal      = {Mathematical Programming Computation},\n\tvolume       = 5,\n\tpages        = {1--26}\n}\n@article{recht2018cifar,\n\ttitle        = {Do {CIFAR}-10 Classifiers Generalize to {CIFAR}-10?},\n\tauthor       = {Benjamin Recht and Rebecca Roelofs and Ludwig Schmidt and Vaishaal Shankar},\n\tyear         = 2018,\n\tjournal      = {arXiv}\n}\n@inproceedings{recht2019doimagenet,\n\ttitle        = {Do ImageNet Classifiers Generalize to ImageNet?},\n\tauthor       = {Benjamin Recht and Rebecca Roelofs and Ludwig Schmidt and Vaishaal Shankar},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{recht2019imagenet,\n\ttitle        = {Do imagenet classifiers generalize to imagenet?},\n\tauthor       = {Recht, Benjamin and Roelofs, Rebecca and Schmidt, Ludwig and Shankar, Vaishaal},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.10811}\n}\n@inproceedings{reckman2010virtualgame,\n\ttitle        = {Learning meanings of words and constructions, grounded in a virtual game},\n\tauthor       = {Hilke Reckman and Jeff Orkin and Deb Roy},\n\tyear         = 2010,\n\tbooktitle    = {Conference on Natural Language Processing (KONVENS)}\n}\n@article{Reddi2016-nonconvexSAGA,\n\ttitle        = {Fast Incremental Method for Nonconvex Optimization},\n\tauthor       = {Sashank J. Reddi and Suvrit Sra and Barnabas Poczos and Alex Smola},\n\tyear         = 2016,\n\tmonth        = mar,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1603.06159}\n}\n@article{Reddi2016-nonconvexSVRG,\n\ttitle        = {Stochastic Variance Reduction for Nonconvex Optimization},\n\tauthor       = {Sashank J. 
Reddi and Ahmed Hefny and Suvrit Sra and Barnabas Poczos and Alex Smola},\n\tyear         = 2016,\n\tmonth        = mar,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1603.06160}\n}\n@inproceedings{reddi2016stochastic,\n\ttitle        = {Stochastic Variance Reduction for Nonconvex Optimization},\n\tauthor       = {Reddi, Sashank J and Hefny, Ahmed and Sra, Suvrit and P{\\'o}cz{\\'o}s, Barnab{\\'a}s and Smola, Alex},\n\tyear         = 2016,\n\tjournal      = {ICML}\n}\n@article{reddy2014large,\n\ttitle        = {Large-scale Semantic Parsing without Question-Answer Pairs},\n\tauthor       = {Siva Reddy and Mirella Lapata and Mark Steedman},\n\tyear         = 2014,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 2,\n\tnumber       = 10,\n\tpages        = {377--392}\n}\n@inproceedings{reddy2016transforming,\n\ttitle        = {Transforming dependency structures to logical forms for semantic parsing},\n\tauthor       = {Siva Reddy and Oscar T{\\\"a}ckstr{\\\"o}m and Michael Collins and Tom Kwiatkowski and Dipanjan Das and Mark Steedman and Mirella Lapata},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {127--140}\n}\n@article{reddy2018coqa,\n\ttitle        = {{CoQA}: A conversational question answering challenge},\n\tauthor       = {Siva Reddy and Danqi Chen and Christopher D Manning},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.07042}\n}\n@inproceedings{reddy2018shared,\n\ttitle        = {Shared autonomy via deep reinforcement learning},\n\tauthor       = {Siddharth Reddy and Anca D Dragan and Sergey Levine},\n\tyear         = 2018,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{redmon2016yolo,\n\ttitle        = {You Only Look Once: Unified, Real-Time Object Detection},\n\tauthor       = {Joseph Redmon and Santosh Divvala and Ross B. 
Girshick and Ali Farhadi},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {779--788}\n}\n@article{redmond2002data,\n\ttitle        = {A data-driven software tool for enabling cooperative information sharing among police departments},\n\tauthor       = {Michael Redmond and Alok Baveja},\n\tyear         = 2002,\n\tjournal      = {European Journal of Operational Research},\n\tvolume       = 141,\n\tnumber       = 3,\n\tpages        = {660--678}\n}\n@article{reed2001pareto,\n\ttitle        = {The Pareto, Zipf and other power laws},\n\tauthor       = {Reed, William J},\n\tyear         = 2001,\n\tjournal      = {Economics letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 74,\n\tnumber       = 1,\n\tpages        = {15--19}\n}\n@inproceedings{reed2016neural,\n\ttitle        = {Neural Programmer-Interpreters},\n\tauthor       = {Scott Reed and Nando de Freitas},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{reed2021self,\n\ttitle        = {Self-Supervised Pretraining Improves Self-Supervised Pretraining},\n\tauthor       = {Colorado J. Reed and Xiangyu Yue and Ani Nrusimha and Sayna Ebrahimi and Vivek Vijaykumar and Richard Mao and Bo Li and Shanghang Zhang and Devin Guillory and Sean Metzger and Kurt Keutzer and Trevor Darrell},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@article{reeves2009managing,\n\ttitle        = {\n\t\tManaging massive time series streams with multi-scale compressed\n\n\t\ttrickles\n\t},\n\tauthor       = {Reeves, Galen and Liu, Jie and Nath, Suman and Zhao, Feng},\n\tyear         = 2009,\n\tmonth        = aug,\n\tjournal      = {Proc. 
VLDB Endow.},\n\tpublisher    = {VLDB Endowment},\n\tvolume       = 2,\n\tpages        = {97--108},\n\tissn         = {2150-8097},\n\tacmid        = 1687639,\n\tissue        = 1,\n\tissue_date   = {August 2009},\n\tnumpages     = 12\n}\n@article{Regev,\n\ttitle        = {On lattices, learning with errors, random linear codes, and cryptography},\n\tauthor       = {Regev, Oded},\n\tyear         = 2009,\n\tmonth        = sep,\n\tjournal      = {J. ACM},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 56,\n\tnumber       = 6,\n\tpages        = {34:1--34:40},\n\tdoi          = {10.1145/1568318.1568324},\n\tissn         = {0004-5411},\n\turl          = {http://doi.acm.org/10.1145/1568318.1568324},\n\tissue_date   = {September 2009},\n\tarticleno    = 34,\n\tnumpages     = 40,\n\tacmid        = 1568324,\n\tkeywords     = {Lattice, average-case hardness, cryptography, public key encryption, quantum computation}\n}\n@inproceedings{regev2017learning,\n\ttitle        = {On Learning Mixtures of Well-Separated Gaussians},\n\tauthor       = {Oded Regev and Aravindan Vijayaraghavan},\n\tyear         = 2017,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{regier2001grounding,\n\ttitle        = {Grounding spatial language in perception: An empirical and computational investigation},\n\tauthor       = {Terry Regier and Laura A Carlson},\n\tyear         = 2001,\n\tjournal      = {Journal of experimental psychology: General},\n\tvolume       = 130,\n\tnumber       = 2\n}\n@article{rehsommer,\n\ttitle        = {A network that uses few active neurones to code visual input predicts the diverse shapes of cortical receptive fields},\n\tauthor       = {Martin Rehn and Friedrich T. 
Sommer},\n\tyear         = 2007,\n\tjournal      = {Journal of Computational Neuroscience},\n\tvolume       = 22,\n\tnumber       = 2,\n\tpages        = {135--146},\n\tdoi          = {10.1007/s10827-006-0003-9},\n\turl          = {http://dx.doi.org/10.1007/s10827-006-0003-9},\n\ttimestamp    = {Thu, 16 Oct 2014 21:34:03 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/jcns/RehnS07},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{reid2011information,\n\ttitle        = {Information, divergence and risk for binary experiments},\n\tauthor       = {Mark D Reid and Robert C Williamson},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 12,\n\tpages        = {731--817}\n}\n@article{reid2016pseudotime,\n\ttitle        = {Pseudotime estimation: deconfounding single cell time series},\n\tauthor       = {John E Reid and Lorenz Wernisch},\n\tyear         = 2016,\n\tjournal      = {Bioinformatics},\n\tvolume       = 32,\n\tnumber       = 19,\n\tpages        = {2973--2980}\n}\n@inproceedings{reimers2019sentence,\n\ttitle        = {Sentence-{BERT}: Sentence Embeddings using Siamese {BERT}-Networks},\n\tauthor       = {Nils Reimers and Iryna Gurevych},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{reimers2020multisbert,\n\ttitle        = {Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation},\n\tauthor       = {Nils Reimers and Iryna Gurevych},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{reiner2018mortality,\n\ttitle        = {Variation in Childhood Diarrheal Morbidity and Mortality in Africa, 2000–2015},\n\tauthor       = {Robert C. Reiner and Nicholas Graetz and Daniel C. Casey and Christopher Troeger and Gregory M. Garcia and Jonathan F. Mosser and Aniruddha Deshpande and Scott J. 
Swartz and  Sarah E. Ray and Brigette F. Blacker and Puja C. Rao and Aaron Osgood-Zimmerman and Roy Burstein and David M. Pigott and Ian M. Davis and Ian D. Letourneau and Lucas Earl and Jennifer M. Ross and Ibrahim A. Khalil and Tamer H. Farag and Oliver J. Brady and Moritz U.G. Kraemer and David L. Smith and Samir Bhatt and Daniel J. Weiss and Peter W. Gething and Nicholas J. Kassebaum and Ali H. Mokdad and Christopher J.L. Murray and Simon I. Hay},\n\tyear         = 2018,\n\tjournal      = {New England Journal of Medicine},\n\tvolume       = 379\n}\n@inproceedings{reinhardt2007multi,\n\ttitle        = {\n\t\tA Multi-Level Parallel Implementation of a Program for Finding Frequent\n\n\t\tPatterns in a Large Sparse Graph\n\t},\n\tauthor       = {Steve Reinhardt and George Karypis},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tIPDPS 2007. IEEE International Parallel and Distributed Processing\n\n\t\tSymposium\n\t},\n\tpages        = {1--8},\n\tee           = {http://dx.doi.org/10.1109/IPDPS.2007.370404}\n}\n@article{reiter05sumtime,\n\ttitle        = {Choosing words in computer-generated weather forecasts},\n\tauthor       = {Ehud Reiter and Somayajulu Sripada and Jim Hunter and Jin Yu and Ian Davy},\n\tyear         = 2005,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 167,\n\tpages        = {137--169}\n}\n@article{reker2020practical,\n\ttitle        = {Practical considerations for active machine learning in drug discovery},\n\tauthor       = {Daniel Reker},\n\tyear         = 2020,\n\tjournal      = {Drug Discovery Today: Technologies}\n}\n@article{relethford1978use,\n\ttitle        = {The use of principal components in the analysis of cross-sectional growth data},\n\tauthor       = {John H Relethford and Francis C Lees and Pamela J Byard},\n\tyear         = 1978,\n\tjournal      = {Human Biology},\n\tpages        = {461--475}\n}\n@inproceedings{ren06figure,\n\ttitle        = {Figure/Ground Assignment in Natural Images},\n\tauthor       = 
{Xiaofeng Ren and Charless C. Fowlkes and Jitendra Malik},\n\tyear         = 2006,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@inproceedings{ren2002state,\n\ttitle        = {State aggregation in {Markov} decision processes},\n\tauthor       = {Ren, Zhiyuan and Krogh, Bruce H},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the 41st IEEE Conference on Decision and Control},\n\tvolume       = 4,\n\tpages        = {3819--3824},\n\torganization = {IEEE}\n}\n@inproceedings{ren2012rgb,\n\ttitle        = {Rgb-(d) scene labeling: Features and algorithms},\n\tauthor       = {Xiaofeng Ren and Liefeng Bo and Dieter Fox},\n\tyear         = 2012,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2759--2766}\n}\n@inproceedings{ren2015exploring,\n\ttitle        = {Exploring models and data for image question answering},\n\tauthor       = {Mengye Ren and Ryan Kiros and Richard Zemel},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2953--2961}\n}\n@article{ren2015frcnn,\n\ttitle        = {Faster {R}-{CNN}: Towards Real-Time Object Detection with Region Proposal Networks},\n\tauthor       = {Shaoqing Ren and Kaiming He and Ross B. 
Girshick and Jian Sun},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 39,\n\tpages        = {1137--1149}\n}\n@inproceedings{ren2018adversarial,\n\ttitle        = {Adversarial domain adaptation for classification of prostate histopathology whole-slide images},\n\tauthor       = {Jian Ren and Ilker Hacihaliloglu and Eric A Singer and David J Foran and Xin Qi},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Medical Image Computing and Computer-Assisted Intervention},\n\tpages        = {201--209}\n}\n@inproceedings{ren2018learning,\n\ttitle        = {Learning to Reweight Examples for Robust Deep Learning},\n\tauthor       = {Ren, Mengye and Zeng, Wenyuan and Yang, Bin and Urtasun, Raquel},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {4334--4343}\n}\n@inproceedings{ren2018reweighting,\n\ttitle        = {Learning to Reweight Examples for Robust Deep Learning},\n\tauthor       = {Mengye Ren and Wenyuan Zeng and B. Yang and R. 
Urtasun},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{ren2021nearly,\n\ttitle        = {Nearly Horizon-Free Offline Reinforcement Learning},\n\tauthor       = {Ren, Tongzheng and Li, Jialian and Dai, Bo and Du, Simon S and Sanghavi, Sujay},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.14077}\n}\n@inproceedings{rendle2010factorizing,\n\ttitle        = {Factorizing personalized {M}arkov chains for next-basket recommendation},\n\tauthor       = {Rendle, Steffen and Freudenthaler, Christoph and Schmidt-Thieme, Lars},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on World Wide Web}\n}\n@article{renegar2014efficient,\n\ttitle        = {Efficient first-order methods for linear programming and semidefinite programming},\n\tauthor       = {James Renegar},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1409.5832}\n}\n@inproceedings{rennie2005fast,\n\ttitle        = {Fast maximum margin matrix factorization for collaborative prediction},\n\tauthor       = {Rennie, Jasson DM and Srebro, Nathan},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {713--719},\n\torganization = {ACM}\n}\n@inproceedings{resnick2007influence,\n\ttitle        = {The influence limiter: provably manipulation-resistant recommender systems},\n\tauthor       = {Paul Resnick and Rahul Sami},\n\tyear         = 2007,\n\tbooktitle    = {ACM Conference on Recommender Systems},\n\tpages        = {25--32}\n}\n@misc{reuters2011factbox,\n\ttitle        = {Factbox: A look at the \\$65 billion video games industry},\n\tauthor       = {Reuters},\n\tyear         = 2011,\n\tmonth        = jun,\n\turl          = {http://uk.reuters.com/article/2011/06/06/us-videogames-factbox-idUKTRE75552I20110606},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{reyes2018self,\n\ttitle        = 
{Self-consistent trajectory autoencoder: hierarchical reinforcement learning with trajectory embeddings},\n\tauthor       = {John Co-Reyes and YuXuan Liu and Abhishek Gupta and Benjamin Eysenbach and Pieter Abbeel and Sergey Levine},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1009--1018}\n}\n@inproceedings{RGapprox,\n\ttitle        = {New Tools for Graph Coloring},\n\tauthor       = {Arora, Sanjeev and Ge, Rong},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of Approximation, Randomization, and Combinatorial Optimization. Algorithms and Techniques - 14th International Workshop, APPROX 2011, and 15th International Workshop, RANDOM 2011,},\n\tpages        = {1--12}\n}\n@article{RGcacm,\n\ttitle        = {Computational Complexity and Information Asymmetry in Financial Products},\n\tauthor       = {Arora, Sanjeev and Barak, Boaz and Brunnermeier, Markus and Ge, Rong},\n\tyear         = 2010,\n\tjournal      = {Communications of the ACM},\n\tbooktitle    = {The First Symposium on Innovations in Computer Science, ICS 2010},\n\tpublisher    = {Tsinghua University Press},\n\taddress      = {Beijing},\n\tvolume       = 54,\n\tnumber       = 5,\n\tpages        = {101--107},\n\tdoi          = {10.1145/1941487.1941511},\n\tisbn         = {978-7-89474-827-0}\n}\n@inproceedings{RGdeep,\n\ttitle        = {Provable Bounds for Learning Some Deep Representations},\n\tauthor       = {Sanjeev Arora and Aditya Bhaskara and Rong Ge and Tengyu Ma},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 31th International Conference on Machine Learning, {ICML} 2014, Beijing, China, 21-26 June 2014},\n\tpages        = {584--592},\n\turl          = {http://jmlr.org/proceedings/papers/v32/arora14.html},\n\tcrossref     = {DBLP:conf/icml/2014},\n\ttimestamp    = {Sun, 26 Oct 2014 02:38:30 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/icml/AroraBGM14},\n\tbibsource    = {dblp computer 
science bibliography, http://dblp.org}\n}\n@inproceedings{RGec,\n\ttitle        = {Finding overlapping communities in social networks: toward a rigorous approach},\n\tauthor       = {Sanjeev Arora and Rong Ge and Sushant Sachdeva and Grant Schoenebeck},\n\tyear         = 2012,\n\tbooktitle    = {ACM Conference on Electronic Commerce, EC '12, Valencia, Spain, June 4-8, 2012},\n\tpages        = {37--54}\n}\n@inproceedings{RGicalp,\n\ttitle        = {New Algorithms for Learning in Presence of Errors},\n\tauthor       = {Sanjeev Arora and Rong Ge},\n\tyear         = 2011,\n\tbooktitle    = {Automata, Languages and Programming - 38th International Colloquium, ICALP 2011, Zurich, Switzerland, July 4-8, 2011, Proceedings, Part I},\n\tpages        = {403--415},\n\tee           = {http://dx.doi.org/10.1007/978-3-642-22006-7_34}\n}\n@inproceedings{RGisaac,\n\ttitle        = {New Results on Simple Stochastic Games},\n\tauthor       = {Dai, Decheng and Ge, Rong},\n\tyear         = 2009,\n\tbooktitle    = {ISAAC '09: Proceedings of the 20th International Symposium on Algorithms and Computation},\n\tlocation     = {Honolulu, Hawaii},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {Berlin, Heidelberg},\n\tpages        = {1014--1023},\n\tdoi          = {http://dx.doi.org/10.1007/978-3-642-10631-6_102},\n\tisbn         = {978-3-642-10630-9}\n}\n@inproceedings{RGnips,\n\ttitle        = {Provable ICA with Unknown Gaussian Noise, and Implications for Gaussian Mixtures and Autoencoders},\n\tauthor       = {Sanjeev Arora and Rong Ge and Ankur Moitra and Sushant Sachdeva},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)}\n}\n@article{RGovercomplete1,\n\ttitle        = {Guaranteed Non-Orthogonal Tensor Decomposition via Alternating Rank-1 Updates},\n\tauthor       = {Animashree Anandkumar and Rong Ge and Majid Janzamin},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1402.5180},\n\turl          = 
{http://arxiv.org/abs/1402.5180},\n\ttimestamp    = {Sun, 26 Oct 2014 02:41:41 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/AnandkumarGJ14},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{RGovercomplete2,\n\ttitle        = {Provable Learning of Overcomplete Latent Variable Models: Semi-supervised and Unsupervised Settings},\n\tauthor       = {Animashree Anandkumar and Rong Ge and Majid Janzamin},\n\tyear         = 2014,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1408.0553},\n\turl          = {http://arxiv.org/abs/1408.0553},\n\ttimestamp    = {Sun, 26 Oct 2014 02:41:44 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/corr/AnandkumarGJ14a},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{rgSparse,\n\ttitle        = {Towards a better approximation for sparsest cut?},\n\tauthor       = {Arora, Sanjeev and Ge, Rong and Sinop, Ali Kemal},\n\tyear         = 2013,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2013 IEEE 54th Annual Symposium on},\n\tpages        = {270--279},\n\torganization = {IEEE}\n}\n@inproceedings{RGstoc,\n\ttitle        = {Computing a Nonnegative Matrix Factorization \--- Provably.},\n\tauthor       = {Sanjeev Arora and Rong Ge and Ravindran Kannan and Ankur Moitra},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 44th Symposium on Theory of Computing Conference, STOC 2012, New York, NY, USA, May 19 - 22},\n\tpages        = {145--162}\n}\n@inproceedings{rgStochastic,\n\ttitle        = {Competing with the Empirical Risk Minimizer in a Single Pass},\n\tauthor       = {Roy Frostig and Rong Ge and Sham M. 
Kakade and Aaron Sidford},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of The 28th Conference on Learning Theory},\n\tpages        = {728--763}\n}\n@book{rhoten2011knowledge,\n\ttitle        = {Knowledge matters: The public mission of the research university},\n\tauthor       = {Diana Rhoten and Craig Calhoun},\n\tyear         = 2011,\n\tpublisher    = {Columbia University Press}\n}\n@inproceedings{ribeiro2016lime,\n\ttitle        = {\"{Why Should {I} Trust You?}\": Explaining the Predictions of Any Classifier},\n\tauthor       = {Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@inproceedings{ribeiro2018anchors,\n\ttitle        = {Anchors: High-Precision Model-Agnostic Explanations},\n\tauthor       = {Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{ribeiro2018sears,\n\ttitle        = {Semantically Equivalent Adversarial Rules for Debugging {NLP} Models},\n\tauthor       = {Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{ribeiro2019red,\n\ttitle        = {Are Red Roses Red? 
Evaluating Consistency of Question-Answering Models},\n\tauthor       = {Marco Tulio Ribeiro and Carlos Guestrin and Sameer Singh},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{ribeiro2020beyond,\n\ttitle        = {Beyond Accuracy: Behavioral Testing of {NLP} Models with {C}heck{L}ist},\n\tauthor       = {Marco Tulio Ribeiro and Tongshuang Wu and Carlos Guestrin and Sameer Singh},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {4902--4912}\n}\n@article{rice1953classes,\n\ttitle        = {Classes of recursively enumerable sets and their decision problems},\n\tauthor       = {Henry Gordon Rice},\n\tyear         = 1953,\n\tjournal      = {Transactions of the American Mathematical Society},\n\tvolume       = 74,\n\tnumber       = 2,\n\tpages        = {358--366}\n}\n@misc{richards2018lyapunov,\n\ttitle        = {The Lyapunov Neural Network: Adaptive Stability Certification for Safe Learning of Dynamical Systems},\n\tauthor       = {Spencer M. 
Richards and Felix Berkenkamp and Andreas Krause},\n\tyear         = 2018,\n\teprint       = {1808.00924},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.SY}\n}\n@article{richardson06mln,\n\ttitle        = {{M}arkov Logic Networks},\n\tauthor       = {Matthew Richardson and Pedro Domingos},\n\tyear         = 2006,\n\tjournal      = {Machine Learning},\n\tvolume       = 62,\n\tpages        = {107--136}\n}\n@inproceedings{richardson2013mctest,\n\ttitle        = {MCTest: A Challenge Dataset for the Open-Domain Machine Comprehension of Text},\n\tauthor       = {Matthew Richardson and Christopher JC Burges and Erin Renshaw},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {193--203}\n}\n@inproceedings{richardson2018polyglot,\n\ttitle        = {Polyglot Semantic Parsing in {API}s},\n\tauthor       = {Kyle Richardson and Jonathan Berant and Jonas Kuhn},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{RichtarikTakac2012parallel,\n\ttitle        = {Parallel coordinate descent methods for big data optimization},\n\tauthor       = {Richt{\\'a}rik, Peter and Tak{\\'a}{\\v{c}}, Martin},\n\tyear         = 2012,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tpages        = {1--52}\n}\n@article{RichtarikTakac2013distributed,\n\ttitle        = {Distributed coordinate descent method for learning with big data},\n\tauthor       = {Richt{\\'a}rik, Peter and Tak{\\'a}{\\v{c}}, Martin},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1310.2059}\n}\n@article{RichtarikTakac2014,\n\ttitle        = {Iteration complexity of randomized block-coordinate descent methods for minimizing a composite function},\n\tauthor       = {Richt{\\'a}rik, Peter and Tak{\\'a}{\\v{c}}, Martin},\n\tyear         = 2014,\n\tjournal      = {Mathematical Programming},\n\tpublisher    = {Springer},\n\tvolume       = 
144,\n\tnumber       = {1-2},\n\tpages        = {1--38}\n}\n@article{richter2016gta5,\n\ttitle        = {Playing for Data: Ground Truth from Computer Games},\n\tauthor       = {Richter, Stephan R. and Vineet, Vibhav and Roth, Stefan and Koltun, Vladlen},\n\tyear         = 2016,\n\tjournal      = {Lecture Notes in Computer Science},\n\tpublisher    = {Springer International Publishing},\n\tpages        = {102–118},\n\tdoi          = {10.1007/978-3-319-46475-6_7},\n\tisbn         = 9783319464756,\n\tissn         = {1611-3349},\n\turl          = {http://dx.doi.org/10.1007/978-3-319-46475-6_7}\n}\n@inproceedings{richter2016playing,\n\ttitle        = {Playing for data: Ground truth from computer games},\n\tauthor       = {Stephan R Richter and Vibhav Vineet and Stefan Roth and Vladlen Koltun},\n\tyear         = 2016,\n\tbooktitle    = {European conference on computer vision},\n\tpages        = {102--118}\n}\n@inproceedings{richter2017novelty,\n\ttitle        = {Safe Visual Navigation via Deep Learning and Novelty Detection},\n\tauthor       = {Charles Richter and Nicholas Roy},\n\tyear         = 2017,\n\tbooktitle    = {Robotics: Science and Systems}\n}\n@misc{ridge,\n\ttitle        = {An Analysis of Random Design Linear Regression},\n\tauthor       = {D. Hsu and S. M. Kakade and T. 
Zhang},\n\tyear         = 2011,\n\tnote         = {arXiv:1106.2363},\n\teprint       = {arXiv:1106.2363}\n}\n@inproceedings{riedel13universal,\n\ttitle        = {Relation Extraction with Matrix Factorization and Universal Schemas},\n\tauthor       = {Sebastian Riedel and Limin Yao and Andrew McCallum},\n\tyear         = 2013,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{riedel2010modeling,\n\ttitle        = {Modeling relations and their mentions without labeled text},\n\tauthor       = {Sebastian Riedel and Limin Yao and Andrew McCallum},\n\tyear         = 2010,\n\tbooktitle    = {Machine Learning and Knowledge Discovery in Databases (ECML PKDD)},\n\tpages        = {148--163}\n}\n@inproceedings{riedel2010relaxed,\n\ttitle        = {Relaxed marginal inference and its application to dependency parsing},\n\tauthor       = {Sebastian Riedel and David A Smith},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {760--768}\n}\n@inproceedings{riedel2011robust,\n\ttitle        = {Robust biomedical event extraction with dual decomposition and minimal domain adaptation},\n\tauthor       = {Sebastian Riedel and Andrew McCallum},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the BioNLP Shared Task 2011 Workshop}\n}\n@inproceedings{riedel2012parse,\n\ttitle        = {Parse, price and cut: delayed column and row generation for graph based parsers},\n\tauthor       = {S. Riedel and D. Smith and A. 
McCallum},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {732--743}\n}\n@article{riedel2016programming,\n\ttitle        = {Programming with a differentiable forth interpreter},\n\tauthor       = {Sebastian Riedel and Matko Bosnjak and Tim Rockt{\\\"a}schel},\n\tyear         = 2016,\n\tjournal      = {CoRR, abs/1605.06640}\n}\n@article{rieger2019interpretations,\n\ttitle        = {Interpretations are useful: penalizing explanations to align neural networks with prior knowledge},\n\tauthor       = {Laura Rieger and Chandan Singh and W James Murdoch and Bin Yu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.13584}\n}\n@article{rieser2011reinforcement,\n\ttitle        = {Reinforcement learning for adaptive dialogue systems: a data-driven methodology for dialogue management and natural language generation},\n\tauthor       = {Verena Rieser and Oliver Lemon},\n\tyear         = 2011,\n\tjournal      = {Springer Science \\& Business Media}\n}\n@article{rietz1974proof,\n\ttitle        = {A proof of the {G}rothendieck inequality},\n\tauthor       = {R. E. 
Rietz},\n\tyear         = 1974,\n\tjournal      = {Israel Journal of Mathematics},\n\tvolume       = 19,\n\tnumber       = 3,\n\tpages        = {271--276}\n}\n@inproceedings{rigaki2018bringing,\n\ttitle        = {Bringing a gan to a knife-fight: Adapting malware communication to avoid detection},\n\tauthor       = {Maria Rigaki and Sebastian Garcia},\n\tyear         = 2018,\n\tbooktitle    = {2018 IEEE Security and Privacy Workshops (SPW)},\n\tpages        = {70--75}\n}\n@article{rigollet2007generalization,\n\ttitle        = {Generalization Error Bounds in Semi-supervised Classification Under the Cluster Assumption},\n\tauthor       = {Philippe Rigollet},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 8,\n\tpages        = {1369--1392}\n}\n@inproceedings{riloff2000rule,\n\ttitle        = {A rule-based question answering system for reading comprehension tests},\n\tauthor       = {Ellen Riloff and Michael Thelen},\n\tyear         = 2000,\n\tbooktitle    = {ANLP/NAACL Workshop on reading comprehension tests as evaluation for computer-based language understanding systems - Volume 6},\n\tpages        = {13--19}\n}\n@inproceedings{rimell2009unbounded,\n\ttitle        = {Unbounded Dependency Recovery for Parser Evaluation},\n\tauthor       = {Laura Rimell and Stephen Clark and Mark Steedman},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{rinott1994normal,\n\ttitle        = {On normal approximation rates for certain sums of dependent random variables},\n\tauthor       = {Yosef Rinott},\n\tyear         = 1994,\n\tjournal      = {Journal of Computational and Applied Mathematics},\n\tvolume       = 55,\n\tnumber       = 2,\n\tpages        = {135--143}\n}\n@article{rintanen2012planning,\n\ttitle        = {Planning as satisfiability: Heuristics},\n\tauthor       = {J. 
Rintanen},\n\tyear         = 2012,\n\tjournal      = {Artificial Intelligence},\n\tvolume       = 193\n}\n@book{ripley2009stochastic,\n\ttitle        = {Stochastic simulation},\n\tauthor       = {Brian D Ripley},\n\tyear         = 2009,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@inproceedings{ritchie2005computational,\n\ttitle        = {Computational Mechanisms for Pun Generation},\n\tauthor       = {Graeme Ritchie},\n\tyear         = 2005,\n\tbooktitle    = {the 10th European Natural Language Generation Workshop}\n}\n@inproceedings{ritter2011data,\n\ttitle        = {Data-driven response generation in social media},\n\tauthor       = {Alan Ritter and Colin Cherry and William B Dolan},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {583--593}\n}\n@article{ritter2013modeling,\n\ttitle        = {Modeling missing data in distant supervision for information extraction},\n\tauthor       = {Alan Ritter and Luke Zettlemoyer and Mausam and Oren Etzioni},\n\tyear         = 2013,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1\n}\n@article{ritter2018been,\n\ttitle        = {Been there, done that: Meta-learning with episodic recall},\n\tauthor       = {Samuel Ritter and Jane X Wang and Zeb Kurth-Nelson and Siddhant M Jayakumar and Charles Blundell and Razvan Pascanu and Matthew Botvinick},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.09692}\n}\n@article{RK03,\n\ttitle        = {Monotonic convergence of fixed-point algorithms for {ICA}},\n\tauthor       = {P. A. Regalia and E. 
Kofidis},\n\tyear         = 2003,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tvolume       = 14,\n\tpages        = {943--949}\n}\n@article{RM51,\n\ttitle        = {A stochastic approximation method},\n\tauthor       = {Robbins, Herbert and Monro, Sutton},\n\tyear         = 1951,\n\tjournal      = {The annals of mathematical statistics},\n\tpublisher    = {JSTOR},\n\tpages        = {400--407}\n}\n@article{robbins1952some,\n\ttitle        = {Some Aspects of the Sequential Design of Experiments},\n\tauthor       = {Robbins, Herbert},\n\tyear         = 1952,\n\tjournal      = {Bulletin of the American Mathematical Society},\n\tvolume       = 58,\n\tpages        = {527--535}\n}\n@book{robert04mc,\n\ttitle        = {{M}onte {C}arlo Statistical Methods},\n\tauthor       = {Christian P. Robert and George Casella},\n\tyear         = 2004,\n\tpublisher    = {Springer}\n}\n@book{robert2005monte,\n\ttitle        = {{M}onte {C}arlo Statistical Methods},\n\tauthor       = {Robert, Christian P. 
and Casella, George},\n\tyear         = 2005,\n\tpublisher    = {Springer-Verlag New York, Inc.},\n\taddress      = {Secaucus, NJ, USA}\n}\n@article{roberts1996exponential,\n\ttitle        = {Exponential convergence of Langevin distributions and their discrete approximations},\n\tauthor       = {Roberts, Gareth O and Tweedie, Richard L and others},\n\tyear         = 1996,\n\tjournal      = {Bernoulli},\n\tpublisher    = {Bernoulli Society for Mathematical Statistics and Probability},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {341--363}\n}\n@article{roberts1999bounds,\n\ttitle        = {Bounds on regeneration times and convergence rates for {M}arkov chains},\n\tauthor       = {GO Roberts and RL Tweedie},\n\tyear         = 1999,\n\tjournal      = {Stochastic Processes and their applications},\n\tvolume       = 80,\n\tnumber       = 2,\n\tpages        = {211--229}\n}\n@incollection{roberts2014navigating,\n\ttitle        = {Navigating the Local Modes of Big Data: The Case of Topic Models},\n\tauthor       = {Margaret E. Roberts and Brandon M. Stewart and Dustin Tingley},\n\tyear         = 2014,\n\tbooktitle    = {Data Science for Politics, Policy and Government}\n}\n@article{robertson2009probabilistic,\n\ttitle        = {The probabilistic relevance framework: {BM25} and beyond},\n\tauthor       = {Stephen Robertson and Hugo Zaragoza},\n\tyear         = 2009,\n\tjournal      = {Foundations and Trends in Information Retrieval},\n\tvolume       = 3\n}\n@article{robey2020modelbased,\n\ttitle        = {Model-Based Robust Deep Learning},\n\tauthor       = {Alexander Robey and Hamed Hassani and George J. 
Pappas},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.10247}\n}\n@article{robey2021model,\n\ttitle        = {Model-Based Domain Generalization},\n\tauthor       = {Alexander Robey and George J Pappas and Hamed Hassani},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.11436}\n}\n@article{robins2000inference,\n\ttitle        = {Inference for imputation estimators},\n\tauthor       = {Robins, James M and Wang, Naisyin},\n\tyear         = 2000,\n\tjournal      = {Biometrika},\n\tvolume       = 87,\n\tnumber       = 1,\n\tpages        = {113--124}\n}\n@article{Roch:CompBio,\n\ttitle        = {A Short Proof that Phylogenetic Tree Reconstruction by Maximum Likelihood Is Hard},\n\tauthor       = {Roch, S.},\n\tyear         = 2006,\n\tjournal      = {IEEE/ACM Trans. Comput. Biol. Bioinformatics},\n\tvolume       = 3,\n\tnumber       = 1\n}\n@article{rochtaschel2015reasoning,\n\ttitle        = {Reasoning about Entailment with Neural Attention},\n\tauthor       = {Tim Rockt{\\\"{a}}schel and Edward Grefenstette and Karl Moritz Hermann and Tom{\\'{a}}s Kocisk{\\'{y}} and Phil Blunsom},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1509.06664}\n}\n@article{rockafellar1976ppa,\n\ttitle        = {Monotone operators and the proximal point algorithm},\n\tauthor       = {R. Tyrrell Rockafellar},\n\tyear         = 1976,\n\tjournal      = {SIAM Journal on Control and Optimization},\n\tvolume       = 14,\n\tnumber       = 5,\n\tpages        = {877--898}\n}\n@book{Rockafellar1996convex,\n\ttitle        = {Convex Analysis (Princeton Landmarks in Mathematics and Physics)},\n\tauthor       = {Rockafellar, R. Tyrrell},\n\tyear         = 1996,\n\tpublisher    = {Princeton University Press}\n}\n@article{rockafellar2000optimization,\n\ttitle        = {Optimization of Conditional Value-at-Risk},\n\tauthor       = {R. 
Tyrrell Rockafellar and Stanislav Uryasev},\n\tyear         = 2000,\n\tjournal      = {Journal of Risk},\n\tvolume       = 2,\n\tpages        = {21--41}\n}\n@book{rockafellar2015convex,\n\ttitle        = {Convex analysis},\n\tauthor       = {Rockafellar, Ralph Tyrell},\n\tyear         = 2015,\n\tpublisher    = {Princeton university press}\n}\n@book{rockefellerconvex,\n\ttitle        = {Convex Analysis},\n\tauthor       = {Rockefeller, RT},\n\tyear         = 1970,\n\tpublisher    = {Princeton University Press, Princeton, NJ}\n}\n@inproceedings{rocktaschel2014lowlogic,\n\ttitle        = {Low-dimensional embeddings of logic},\n\tauthor       = {Tim Rockt{\\\"a}schel and Matko Bosnjak and Sameer Singh and Sebastian Riedel},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{rocktaschel2015injecting,\n\ttitle        = {Injecting Logical Background Knowledge into Embeddings for Relation Extraction},\n\tauthor       = {Tim Rockt{\\\"a}schel and Sameer Singh and Sebastian Riedel},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{roderick2017deep,\n\ttitle        = {Deep Abstract {Q}-Networks},\n\tauthor       = {M. Roderick and C. Grimm and S. Tellex},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.00459}\n}\n@inproceedings{roderick2021provably,\n\ttitle        = {Provably Safe PAC-MDP Exploration Using Analogies},\n\tauthor       = {Roderick, Melrose and Nagarajan, Vaishnavh and Kolter, Zico},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2007.03574},\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {1216--1224},\n\torganization = {PMLR}\n}\n@article{rodriguez08ndp,\n\ttitle        = {The nested {D}irichlet process},\n\tauthor       = {Abel Rodriguez and David B. Dunson and Alan E. 
Gelfand},\n\tyear         = 2008,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 103,\n\tpages        = {1131--1144}\n}\n@article{rodriguez2019quizbowl,\n\ttitle        = {Quizbowl: The Case for Incremental Question Answering},\n\tauthor       = {Pedro Rodriguez and Shi Feng and Mohit Iyyer and He He and Jordan Boyd-Graber},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.04792}\n}\n@inproceedings{roemmele2011copa,\n\ttitle        = {Choice of Plausible Alternatives: An Evaluation of Commonsense Causal Reasoning},\n\tauthor       = {Melissa Roemmele and Cosmin Adrian Bejan and Andrew S. Gordon},\n\tyear         = 2011,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{roemmele2016writing,\n\ttitle        = {Writing Stories with Help from Recurrent Neural Networks},\n\tauthor       = {Melissa Roemmele},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{rogers1991aggregation,\n\ttitle        = {Aggregation and disaggregation techniques and methodology in optimization},\n\tauthor       = {Rogers, David F and Plante, Robert D and Wong, Richard T and Evans, James R},\n\tyear         = 1991,\n\tjournal      = {Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 39,\n\tnumber       = 4,\n\tpages        = {553--582}\n}\n@article{rogers2020primer,\n\ttitle        = {A primer in bertology: What we know about how bert works},\n\tauthor       = {Rogers, Anna and Kovaleva, Olga and Rumshisky, Anna},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics},\n\tpublisher    = {MIT Press},\n\tvolume       = 8,\n\tpages        = {842--866}\n}\n@article{rohde2006,\n\ttitle        = {An improved model of semantic similarity based on lexical co-occurence},\n\tauthor       = {Douglas L. T. Rohde and Laura M. 
Gonnerman and David C. Plaut},\n\tyear         = 2006,\n\tjournal      = {Communication of the Association for Computing Machinery}\n}\n@article{roland2007frequency,\n\ttitle        = {Frequency of Basic {E}nglish Grammatical Structures: A Corpus Analysis},\n\tauthor       = {Douglas Roland and Frederic Dick and Jeffrey Elman},\n\tyear         = 2007,\n\tjournal      = {Journal of Memory and Language},\n\tvolume       = 57\n}\n@article{rolf2020generalizable,\n\ttitle        = {A Generalizable and Accessible Approach to Machine Learning with Global Satellite Imagery},\n\tauthor       = {Esther Rolf and Jonathan Proctor and Tamma Carleton and Ian Bolliger and Vaishaal Shankar and Miyabi Ishihara and Benjamin Recht and Solomon Hsiang},\n\tyear         = 2020,\n\tjournal      = {NBER Working Paper},\n\tnumber       = 28045\n}\n@inproceedings{rolf2020post,\n\ttitle        = {Post-Estimation Smoothing: A Simple Baseline for Learning with Side Information},\n\tauthor       = {Esther Rolf and Michael I. 
Jordan and Benjamin Recht},\n\tyear         = 2020,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{rolnick2019tackling,\n\ttitle        = {Tackling climate change with machine learning},\n\tauthor       = {David Rolnick and Priya L Donti and Lynn H Kaack and Kelly Kochanski and Alexandre Lacoste and Kris Sankaran and Andrew Ross and Nikola Milojevic-Dupont and Natasha Jaques and Anna Waldman-Brown and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.05433}\n}\n@inproceedings{romano2006investigating,\n\ttitle        = {Investigating a Generic Paraphrase-based Approach for Relation Extraction},\n\tauthor       = {Lorenzo Romano and Milen Kouylekov and Idan Szpektor and Ido Dagan and Alberto Lavelli},\n\tyear         = 2006,\n\tbooktitle    = {European Association for Computational Linguistics (EACL)}\n}\n@inproceedings{romano2019conformalized,\n\ttitle        = {Conformalized quantile regression},\n\tauthor       = {Romano, Yaniv and Patterson, Evan and Candes, Emmanuel},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3543--3553}\n}\n@inproceedings{romanov2018mednli,\n\ttitle        = {Lessons from Natural Language Inference in the Clinical Domain},\n\tauthor       = {Alexey Romanov and Chaitanya Shivade},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{ron98pfa,\n\ttitle        = {On the Learnability and Usage of Acyclic Probabilistic Finite Automata},\n\tauthor       = {Dana Ron and Yoram Singer and Naftali Tishby},\n\tyear         = 1998,\n\tjournal      = {Journal of Computer and System Sciences},\n\tvolume       = 56,\n\tpages        = {133--152}\n}\n@article{rong2020self,\n\ttitle        = {Self-supervised graph transformer on large-scale molecular data},\n\tauthor       = {Yu Rong and Yatao Bian and Tingyang Xu and Weiyang Xie and Ying Wei and Wenbing Huang and 
Junzhou Huang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.02835}\n}\n@misc{rong2021extrapolating,\n\ttitle        = {Extrapolating to Unnatural Language Processing with {GPT-3}'s In-context Learning: The Good, the Bad, and the Mysterious},\n\tauthor       = {Frieda Rong},\n\tyear         = 2021\n}\n@article{ronneberger2015unet,\n\ttitle        = {{U-Net}: Convolutional Networks for Biomedical Image Segmentation},\n\tauthor       = {Olaf Ronneberger and Philipp Fischer and Thomas Brox},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@inproceedings{ros2010one,\n\ttitle        = {Which one? grounding the referent based on efficient human-robot interaction},\n\tauthor       = {R. Ros and S. Lemaignan and E. A. Sisbot and R. Alami and J. Steinwender and K. Hamann and F. Warneken},\n\tyear         = 2010,\n\tbooktitle    = {RO-MAN},\n\tpages        = {570--575}\n}\n@inproceedings{ros2016synthia,\n\ttitle        = {The synthia dataset: A large collection of synthetic images for semantic segmentation of urban scenes},\n\tauthor       = {German Ros and Laura Sellart and Joanna Materzynska and David Vazquez and Antonio M Lopez},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {3234--3243}\n}\n@incollection{RoS71,\n\ttitle        = {A Convergence Theorem for Non Negative Almost Supermartingales and Some Applications},\n\tauthor       = {Robbins, Herbert and Siegmund, David},\n\tyear         = 1985,\n\tbooktitle    = {Herbert Robbins Selected Papers},\n\tpages        = {111--135},\n\tdoi          = {10.1007/978-1-4612-5110-1_10},\n\tisbn         = {978-1-4612-5110-1},\n\turl          = {http://dx.doi.org/10.1007/978-1-4612-5110-1_10},\n\t//address    = {New York, NY},\n\t//publisher  = {Springer New York}\n}\n@techreport{rosales1998improved,\n\ttitle        = {\n\t\tImproved Tracking of Multiple Humans with Trajectory Prediction and\n\n\t\tOcclusion 
Modeling\n\t},\n\tauthor       = {Romer Rosales and Stan Sclaroff},\n\tyear         = 1998,\n\tmonth        = {2,},\n\tnumber       = {1998-007}\n}\n@inproceedings{rose1996efficient,\n\ttitle        = {Efficient generation of motion transitions using spacetime constraints},\n\tauthor       = {\n\t\tCharles Rose and Brian Guenter and Bobby Bodenheimer and Michael\n\n\t\tF. Cohen\n\t},\n\tyear         = 1996,\n\tbooktitle    = {\n\t\tSIGGRAPH '96: Proceedings of the 23rd annual conference on Computer\n\n\t\tgraphics and interactive techniques\n\t},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, NY, USA},\n\tpages        = {147--154},\n\tdoi          = {http://doi.acm.org/10.1145/237170.237229},\n\tisbn         = {0-89791-746-4}\n}\n@article{rosenbaum1984consequences,\n\ttitle        = {The consequences of adjustment for a concomitant variable that has been affected by the treatment},\n\tauthor       = {Paul R Rosenbaum},\n\tyear         = 1984,\n\tjournal      = {Journal of the Royal Statistical Society: Series A (General)},\n\tvolume       = 147,\n\tnumber       = 5,\n\tpages        = {656--666}\n}\n@inproceedings{rosenberg2005semi,\n\ttitle        = {Semi-supervised self-training of object detection models},\n\tauthor       = {Chuck Rosenberg and Martial Hebert and Henry Schneiderman},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the Seventh IEEE Workshops on Application of Computer Vision}\n}\n@inproceedings{rosenberg2019online,\n\ttitle        = {Online Convex Optimization in Adversarial Markov Decision Processes},\n\tauthor       = {Rosenberg, Aviv and Mansour, Yishay},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {5478--5486}\n}\n@article{rosenfeld2021online,\n\ttitle        = {An Online Learning Approach to Interpolation and Extrapolation in Domain Generalization},\n\tauthor       = {Elan Rosenfeld and Pradeep Ravikumar and Andrej Risteski},\n\tyear         = 
2021,\n\tjournal      = {arXiv preprint arXiv:2102.13128}\n}\n@inproceedings{rosenfeld2021the,\n\ttitle        = {The Risks of Invariant Risk Minimization},\n\tauthor       = {Elan Rosenfeld and Pradeep Ravikumar and Andrej Risteski},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=BbNIbVPJ-42}\n}\n@article{rosenthal1995minorization,\n\ttitle        = {Minorization conditions and convergence rates for {M}arkov chain {M}onte {C}arlo},\n\tauthor       = {Jeffrey S Rosenthal},\n\tyear         = 1995,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 90,\n\tnumber       = 430,\n\tpages        = {558--566}\n}\n@techreport{ross2009turkers,\n\ttitle        = {Who are the turkers? worker demographics in amazon mechanical turk},\n\tauthor       = {Joel Ross and Andrew Zaldivar and Lilly Irani and Bill Tomlinson},\n\tyear         = 2009,\n\tinstitution  = {Department of Informatics, University of California, Irvine}\n}\n@inproceedings{ross2010efficient,\n\ttitle        = {Efficient reductions for imitation learning},\n\tauthor       = {S. Ross and D. 
Bagnell},\n\tyear         = 2010,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {661--668}\n}\n@inproceedings{ross2011learning,\n\ttitle        = {Learning message-passing inference machines for structured prediction},\n\tauthor       = {Stephane Ross and Daniel Munoz and Martial Hebert and J Andrew Bagnell},\n\tyear         = 2011,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {2737--2744}\n}\n@inproceedings{ross2011reduction,\n\ttitle        = {A reduction of imitation learning and structured prediction to no-regret online learning},\n\tauthor       = {Stéphane Ross and Geoffrey Gordon and Andrew Bagnell},\n\tyear         = 2011,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{ross2014reinforce,\n\ttitle        = {Reinforcement and Imitation Learning via Interactive No-Regret Learning},\n\tauthor       = {St{\\'{e}}phane Ross and J. Andrew Bagnell},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1406.5979}\n}\n@inproceedings{rosset2004margin,\n\ttitle        = {Margin maximizing loss functions},\n\tauthor       = {Saharon Rosset and Ji Zhu and Trevor J Hastie},\n\tyear         = 2004,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1237--1244}\n}\n@article{rossol2012cd14brightcd16,\n\ttitle        = {The {CD}14bright{CD}16+ monocyte subset is expanded in rheumatoid arthritis and promotes expansion of the {T}h17 cell population},\n\tauthor       = {Manuela Rossol and Stephan Kraus and Matthias Pierer and Christoph Baerwald and Ulf Wagner},\n\tyear         = 2012,\n\tjournal      = {Arthritis \\& Rheumatology},\n\tvolume       = 64,\n\tnumber       = 3,\n\tpages        = {671--677}\n}\n@inproceedings{roth2005integer,\n\ttitle        = {Integer linear programming inference for conditional random fields},\n\tauthor       = {Dan Roth and Wen-tau Yih},\n\tyear         = 2005,\n\tbooktitle    = 
{International Conference on Machine Learning (ICML)}\n}\n@inproceedings{roth2006active,\n\ttitle        = {Active learning with {P}erceptron for structured output},\n\tauthor       = {Dan Roth and Kevin Small},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{roth2013combining,\n\ttitle        = {Combining Generative and Discriminative Model Scores for Distant Supervision},\n\tauthor       = {Benjamin Roth and Dietrich Klakow},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {24--29}\n}\n@article{rothblum2018probably,\n\ttitle        = {Probably Approximately Metric-Fair Learning},\n\tauthor       = {Guy N Rothblum and Gal Yona},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.03242}\n}\n@article{rothenhausler2018anchor,\n\ttitle        = {Anchor regression: heterogeneous data meets causality},\n\tauthor       = {Dominik Rothenh\\\"ausler and Peter B\\\"uhlmann and Nicolai Meinshausen and Jonas Peters},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.06229}\n}\n@article{rothfuss2018promp,\n\ttitle        = {Promp: Proximal meta-policy search},\n\tauthor       = {Jonas Rothfuss and Dennis Lee and Ignasi Clavera and Tamim Asfour and Pieter Abbeel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.06784}\n}\n@article{rothvoss2017matching,\n\ttitle        = {The matching polytope has exponential extension complexity},\n\tauthor       = {Rothvo{\\ss}, Thomas},\n\tyear         = 2017,\n\tjournal      = {Journal of the ACM}\n}\n@article{rotskoff2018neural,\n\ttitle        = {Neural networks as Interacting Particle Systems: Asymptotic convexity of the Loss Landscape and Universal Scaling of the Approximation Error},\n\tauthor       = {Rotskoff, Grant M and Vanden-Eijnden, Eric},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1805.00915}\n}\n@book{roughgarden2020beyond,\n\ttitle        = {Beyond the Worst-Case Analysis of Algorithms},\n\tauthor       = {Roughgarden, Tim},\n\tyear         = 2020,\n\tpublisher    = {Cambridge University Press}\n}\n@techreport{roundy1985identifying,\n\ttitle        = {Identifying the Set of Always-Active Constraints in a System of Linear Inequalities by a Single Linear Program},\n\tauthor       = {Robert M. Freund and Robin Roundy and Michael J Todd},\n\tyear         = 1985,\n\tinstitution  = {Massachusetts Institute of Technology, Alfred P. Sloan School of Management}\n}\n@inproceedings{roy01active,\n\ttitle        = {Toward Optimal Active Learning through Sampling Estimation of Error Reduction},\n\tauthor       = {Nicholas Roy and Andrew McCallum},\n\tyear         = 2001,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {441--448}\n}\n@inproceedings{roy2000spoken,\n\ttitle        = {Spoken dialogue management using probabilistic reasoning},\n\tauthor       = {Nicholas Roy and Joelle Pineau and Sebastian Thrun},\n\tyear         = 2000,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {93--100}\n}\n@article{roy2015reasoning,\n\ttitle        = {Reasoning about quantities in natural language},\n\tauthor       = {Subhro Roy and Tom Vieira and Dan Roth},\n\tyear         = 2015,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 1\n}\n@article{roy2019Comments,\n\ttitle        = {Comments on the {Du-Kakade-Wang-Yang} Lower Bounds},\n\tauthor       = {Benjamin Van Roy and Shi-Hai Dong},\n\tyear         = 2019,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1911.07910}\n}\n@inproceedings{RSS11,\n\ttitle        = {Making Gradient Descent Optimal for Strongly Convex Stochastic Optimization},\n\tauthor       = {Alexander Rakhlin and Ohad Shamir and Karthik Sridharan},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings 
of the 29th International Conference on Machine Learning},\n\tseries       = {ICML '12},\n\tpages        = {449--456},\n\tisbn         = {978-1-4503-1285-1},\n\t//address    = {New York, NY, USA},\n\t//location   = {Edinburgh, Scotland, GB},\n\t//month      = {July},\n\t//publisher  = {Omnipress}\n}\n@book{rubens2011active,\n\ttitle        = {Active Learning in Recommender Systems},\n\tauthor       = {Neil Rubens and Dain Kaplan and Masashi Sugiyama},\n\tyear         = 2011,\n\tpublisher    = {Springer}\n}\n@article{rubin1987calculation,\n\ttitle        = {The Calculation of Posterior Distributions by Data Augmentation: Comment: A Noniterative Sampling/Importance Resampling Alternative to the Data Augmentation Algorithm for Creating a Few Imputations When Fractions of Missing Information Are Modest: The {SIR} Algorithm},\n\tauthor       = {Rubin, Donald B.},\n\tyear         = 1987,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {American Statistical Association},\n\tvolume       = 82,\n\tnumber       = 398,\n\tpages        = {pp. 543--546},\n\tissn         = {01621459},\n\tcopyright    = {Copyright © 1987 American Statistical Association},\n\tjstor_articletype = {research-article},\n\tjstor_formatteddate = {Jun., 1987},\n\tlanguage     = {English}\n}\n@inproceedings{rubinstein1997discriminative,\n\ttitle        = {Discriminative vs Informative Learning},\n\tauthor       = {Y Dan Rubinstein and Trevor Hastie},\n\tyear         = 1997,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tvolume       = 5,\n\tpages        = {49--53}\n}\n@inproceedings{rubinstein2009antidote,\n\ttitle        = {Antidote: Understanding and defending against poisoning of anomaly detectors},\n\tauthor       = {Benjamin Rubinstein and Blaine Nelson and Ling Huang and Anthony D. 
Joseph and Shing-Hon Lau and Satish Rao and Nina Taft and JD Tygar},\n\tyear         = 2009,\n\tbooktitle    = {ACM SIGCOMM Conference on Internet measurement conference}\n}\n@book{rubinstein2013cross,\n\ttitle        = {The cross-entropy method: a unified approach to combinatorial optimization, Monte-Carlo simulation and machine learning},\n\tauthor       = {Rubinstein, Reuven Y and Kroese, Dirk P},\n\tyear         = 2013,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{rudelson1999random,\n\ttitle        = {Random vectors in the isotropic position},\n\tauthor       = {Mark Rudelson},\n\tyear         = 1999,\n\tjournal      = {Journal of Functional Analysis},\n\tvolume       = 164,\n\tpages        = {60--72}\n}\n@article{Rudelson2009smallest,\n\ttitle        = {Smallest singular value of a random rectangular matrix},\n\tauthor       = {Rudelson, Mark and Vershynin, Roman},\n\tyear         = 2009,\n\tjournal      = {Communications on Pure and Applied Mathematics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 62,\n\tnumber       = 12,\n\tpages        = {1707--1739}\n}\n@inproceedings{rudelson2010non,\n\ttitle        = {Non-asymptotic theory of random matrices: extreme singular values},\n\tauthor       = {Rudelson, Mark and Vershynin, Roman},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the International Congress of Mathematicians 2010 (ICM 2010) (In 4 Volumes) Vol. I: Plenary Lectures and Ceremonies Vols. II--IV: Invited Lectures},\n\tpages        = {1576--1602},\n\torganization = {World Scientific}\n}\n@article{RudelsonVershynin2009,\n\ttitle        = {The smallest singular value of a random rectangular matrix},\n\tauthor       = {M. Rudelson and R. 
Vershynin},\n\tyear         = 2009,\n\tjournal      = {Communications on Pure and Applied Mathematics},\n\tvolume       = 62,\n\tnumber       = 12,\n\tpages        = {1707--1739}\n}\n@article{ruder2017overview,\n\ttitle        = {An overview of multi-task learning in deep neural networks},\n\tauthor       = {Sebastian Ruder},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.05098}\n}\n@inproceedings{rudinger2015script,\n\ttitle        = {Script Induction as Language Modeling},\n\tauthor       = {Rachel Rudinger and Pushpendre Rastogi and Francis Ferraro and Benjamin Van Durme},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{rudinger2018gender,\n\ttitle        = {Gender Bias in Coreference Resolution},\n\tauthor       = {Rachel Rudinger and Jason Naradowsky and Brian Leonard and Benjamin Van Durme},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{rumelhart1988learning,\n\ttitle        = {Learning representations by back-propagating errors},\n\tauthor       = {Rumelhart, David E and Hinton, Geoffrey E and Williams, Ronald J},\n\tyear         = 1988,\n\tjournal      = {Cognitive modeling},\n\tvolume       = 5,\n\tnumber       = 3,\n\tpages        = 1\n}\n@inproceedings{rush2010dual,\n\ttitle        = {On dual decomposition and linear programming relaxations for natural language processing},\n\tauthor       = {A. Rush and D. Sontag and M. Collins and T. 
Jaakkola},\n\tyear         = 2010,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1--11}\n}\n@inproceedings{rush2011exact,\n\ttitle        = {Exact decoding of syntactic translation models through {L}agrangian relaxation},\n\tauthor       = {Alexander M Rush and Michael Collins},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {72--82}\n}\n@inproceedings{rush2012vine,\n\ttitle        = {Vine pruning for efficient multi-pass dependency parsing},\n\tauthor       = {Alexander Rush and Slav Petrov},\n\tyear         = 2012,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@book{russell1991right,\n\ttitle        = {Do the right thing: studies in limited rationality},\n\tauthor       = {Stuart Russell and Eric Wefald},\n\tyear         = 1991,\n\tpublisher    = {MIT Press}\n}\n@inproceedings{russell1998learning,\n\ttitle        = {Learning agents for uncertain environments},\n\tauthor       = {Stuart J. 
Russell},\n\tyear         = 1998,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@misc{russell2015research,\n\ttitle        = {Research priorities for robust and beneficial artificial intelligence},\n\tauthor       = {Stuart Russell and Daniel Dewey and Max Tegmark and Janos Kramar and Richard Mallah},\n\tyear         = 2015\n}\n@inproceedings{russo2013eluder,\n\ttitle        = {Eluder dimension and the sample complexity of optimistic exploration},\n\tauthor       = {Russo, Daniel and Roy, Benjamin Van},\n\tyear         = 2013,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Proceedings of the 26th International Conference on Neural Information Processing Systems-Volume 2},\n\tpages        = {2256--2264}\n}\n@article{russo2014learning,\n\ttitle        = {Learning to optimize via posterior sampling},\n\tauthor       = {Russo, Daniel and Van Roy, Benjamin},\n\tyear         = 2014,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 39,\n\tnumber       = 4,\n\tpages        = {1221--1243}\n}\n@article{russo2017tutorial,\n\ttitle        = {A tutorial on thompson sampling},\n\tauthor       = {Daniel Russo and Benjamin Van Roy and Abbas Kazerouni and Ian Osband and Zheng Wen},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.02038}\n}\n@inproceedings{russo2019worst,\n\ttitle        = {Worst-case regret bounds for exploration via randomized value functions},\n\tauthor       = {Russo, Daniel},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {14433--14443}\n}\n@inproceedings{russwurm2020meta,\n\ttitle        = {Meta-Learning for Few-Shot Land Cover Classification},\n\tauthor       = {Marc Ru{\\ss}wurm and Sherrie Wang and Marco Korner and David Lobell},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition 
Workshops},\n\tpages        = {200--201}\n}\n@article{rusu2015policy,\n\ttitle        = {Policy distillation},\n\tauthor       = {Andrei A Rusu and Sergio Gomez Colmenarejo and Caglar Gulcehre and Guillaume Desjardins and James Kirkpatrick and Razvan Pascanu and Volodymyr Mnih and Koray Kavukcuoglu and Raia Hadsell},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.06295}\n}\n@article{ruszczynski2003stochastic,\n\ttitle        = {Stochastic programming models},\n\tauthor       = {Ruszczy{\\'n}ski, Andrzej and Shapiro, Alexander},\n\tyear         = 2003,\n\tjournal      = {Handbooks in operations research and management science},\n\tpublisher    = {Elsevier},\n\tvolume       = 10,\n\tpages        = {1--64}\n}\n@article{RW84,\n\ttitle        = {Mixture densities, maximum likelihood and the {EM} algorithm},\n\tauthor       = {R. A. Redner and H. F. Walker},\n\tyear         = 1984,\n\tjournal      = {SIAM Review},\n\tvolume       = 26,\n\tnumber       = 2,\n\tpages        = {195--239}\n}\n@inproceedings{ryzhov2010approximate,\n\ttitle        = {Approximate dynamic programming with correlated Bayesian beliefs},\n\tauthor       = {Ryzhov, Ilya O and Powell, Warren B},\n\tyear         = 2010,\n\tbooktitle    = {Communication, Control, and Computing (Allerton), 2010 48th Annual Allerton Conference on},\n\tpages        = {1360--1367},\n\torganization = {IEEE}\n}\n@article{S07,\n\ttitle        = {Graph clustering},\n\tauthor       = {S. E. 
Schaeffer},\n\tyear         = 2007,\n\tjournal      = {Computer Science Review},\n\tvolume       = 1,\n\tnumber       = 1,\n\tpages        = {27--64}\n}\n@article{s17,\n\ttitle        = {Learning {R}e{LU}s via Gradient Descent},\n\tauthor       = {Mahdi Soltanolkotabi},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1705.04591},\n\turl          = {http://arxiv.org/abs/1705.04591},\n\tarchiveprefix = {arXiv},\n\teprint       = {1705.04591},\n\ttimestamp    = {Mon, 13 Aug 2018 16:48:16 +0200},\n\tbiburl       = {https://dblp.org/rec/bib/journals/corr/Soltanolkotabi17a},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n@article{s18,\n\ttitle        = {Distribution-specific hardness of learning neural networks},\n\tauthor       = {Shamir, Ohad},\n\tyear         = 2018,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 19,\n\tnumber       = 32\n}\n@article{S61,\n\ttitle        = {On the definition of a family of automata},\n\tauthor       = {M. P. Sch\\\"utzenberger},\n\tyear         = 1961,\n\tjournal      = {Inf. 
Control},\n\tvolume       = 4,\n\tpages        = {245--270}\n}\n@inproceedings{sa15,\n\ttitle        = {Provable methods for training neural networks with sparse connectivity},\n\tauthor       = {Sedghi, Hanie and Anandkumar, Anima},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1412.2693},\n\tbooktitle    = {ICLR},\n\tpublisher    = {arXiv preprint arXiv:1412.2693}\n}\n@article{sa96,\n\ttitle        = {A model of multiplicative neural responses in parietal cortex},\n\tauthor       = {Salinas, Emilio and Abbott, Laurence F.},\n\tyear         = 1996,\n\tjournal      = {Proceedings of the National Academy of Sciences},\n\tvolume       = 93,\n\tnumber       = 21,\n\tpages        = {11956--11961}\n}\n@article{saad1995line,\n\ttitle        = {On-line learning in soft committee machines},\n\tauthor       = {Saad, David and Solla, Sara A},\n\tyear         = 1995,\n\tjournal      = {Physical Review E},\n\tpublisher    = {APS},\n\tvolume       = 52,\n\tnumber       = 4,\n\tpages        = 4225\n}\n@inproceedings{sachan2015learning,\n\ttitle        = {Learning answer-entailing structures for machine comprehension},\n\tauthor       = {Mrinmaya Sachan and Avinava Dubey and Eric P Xing and Matthew Richardson},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{sachs1981language,\n\ttitle        = {Language learning with restricted input: Case studies of two hearing children of deaf parents},\n\tauthor       = {Jacqueline Sachs and Barbara Bard and Marie L Johnson},\n\tyear         = 1981,\n\tjournal      = {Applied Psycholinguistics},\n\tvolume       = {0}\n}\n@article{sacks1958asymptotic,\n\ttitle        = {Asymptotic Distribution of Stochastic Approximation Procedures},\n\tauthor       = {Jerome Sacks},\n\tyear         = 1958,\n\tjournal      = {Annals of Mathematical Statistics},\n\tvolume       = 29,\n\tnumber       = 2,\n\tpages        = {373--405}\n}\n@inproceedings{sadigh2016planning,\n\ttitle        
= {Planning for autonomous cars that leverage effects on human actions.},\n\tauthor       = {Sadigh, Dorsa and Sastry, Shankar and Seshia, Sanjit A and Dragan, Anca D},\n\tyear         = 2016,\n\tbooktitle    = {Robotics: Science and Systems},\n\tvolume       = 2,\n\tpages        = {1--9},\n\torganization = {Ann Arbor, MI, USA}\n}\n@article{saemundsson2018meta,\n\ttitle        = {Meta reinforcement learning with latent variable gaussian processes},\n\tauthor       = {Steind{\\'o}r S{\\ae}mundsson and Katja Hofmann and Marc Peter Deisenroth},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.07551}\n}\n@inproceedings{saenko2010adapting,\n\ttitle        = {Adapting visual category models to new domains},\n\tauthor       = {Kate Saenko and Brian Kulis and Mario Fritz and Trevor Darrell},\n\tyear         = 2010,\n\tbooktitle    = {European conference on computer vision},\n\tpages        = {213--226}\n}\n@article{saerens2002adjusting,\n\ttitle        = {Adjusting the outputs of a classifier to new a priori probabilities: a simple procedure},\n\tauthor       = {Marco Saerens and Patrice Latinne and Christine Decaestecker},\n\tyear         = 2002,\n\tjournal      = {Neural computation},\n\tvolume       = 14,\n\tnumber       = 1,\n\tpages        = {21--41}\n}\n@inproceedings{safonova2003optimizing,\n\ttitle        = {Optimizing Human Motion for the Control of a Humanoid Robot},\n\tauthor       = {Alla Safonova and Nancy Pollard and Jessica K Hodgins},\n\tyear         = 2003,\n\tmonth        = mar,\n\tbooktitle    = {AMAM2003},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{safonova2007construction,\n\ttitle        = {Construction and optimal search of interpolated motion graphs},\n\tauthor       = {Safonova, Alla and Hodgins, Jessica K.},\n\tyear         = 2007,\n\tbooktitle    = {ACM SIGGRAPH 2007 papers},\n\tlocation     = {San Diego, California},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = 
{SIGGRAPH '07},\n\tdoi          = {http://doi.acm.org/10.1145/1275808.1276510},\n\tacmid        = 1276510,\n\tarticleno    = 106,\n\tkeywords     = {\n\t\thuman animation, motion capture, motion graph, motion interpolation,\n\n\t\tmotion planning\n\t}\n}\n@article{safran2015quality,\n\ttitle        = {On the Quality of the Initial Basin in Overspecified Neural Networks},\n\tauthor       = {Safran, Itay and Shamir, Ohad},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.04210},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {774--782}\n}\n@inproceedings{safran2017depth,\n\ttitle        = {Depth-Width Tradeoffs in Approximating Natural Functions with Neural Networks},\n\tauthor       = {Safran, Itay and Shamir, Ohad},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {2979--2987}\n}\n@article{safran2017spurious,\n\ttitle        = {Spurious local minima are common in two-layer relu neural networks},\n\tauthor       = {Safran, Itay and Shamir, Ohad},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.08968}\n}\n@inproceedings{Sagawa*2020Distributionally,\n\ttitle        = {Distributionally Robust Neural Networks},\n\tauthor       = {Shiori Sagawa* and Pang Wei Koh* and Tatsunori B. Hashimoto and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=ryxGuJrFvS}\n}\n@inproceedings{sagawa2020group,\n\ttitle        = {Distributionally Robust Neural Networks for Group Shifts: On the Importance of Regularization for Worst-Case Generalization},\n\tauthor       = {Shiori Sagawa and Pang Wei Koh and Tatsunori B. 
Hashimoto and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{sagawa2020overparameterization,\n\ttitle        = {An investigation of why overparameterization exacerbates spurious correlations},\n\tauthor       = {Shiori Sagawa and Aditi Raghunathan and Pang Wei Koh and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{Saha2011,\n\ttitle        = {{New Approximation Algorithms for Minimum Enclosing Convex Shapes}},\n\tauthor       = {Saha, Ankan and Vishwanathan, S. V. N. and Zhang, Xinhua},\n\tyear         = 2011,\n\tmonth        = sep,\n\tbooktitle    = {Proceedings of the Twenty-Second Annual ACM-SIAM Symposium on Discrete Algorithms - SODA '11},\n\tpages        = {1146--1160},\n\tabstract     = {Given \\$n\\$ points in a \\$d\\$ dimensional Euclidean space, the Minimum Enclosing Ball (MEB) problem is to find the ball with the smallest radius which contains all \\$n\\$ points. We give a \\$O(nd\\backslash Qcal/\\backslash sqrt\\{\\backslash epsilon\\})\\$ approximation algorithm for producing an enclosing ball whose radius is at most \\$\\backslash epsilon\\$ away from the optimum (where \\$\\backslash Qcal\\$ is an upper bound on the norm of the points). This improves existing results using $\\backslash$emph\\{coresets\\}, which yield a \\$O(nd/\\backslash epsilon)\\$ greedy algorithm. Finding the Minimum Enclosing Convex Polytope (MECP) is a related problem wherein a convex polytope of a fixed shape is given and the aim is to find the smallest magnification of the polytope which encloses the given points. For this problem we present a \\$O(mnd\\backslash Qcal/\\backslash epsilon)\\$ approximation algorithm, where \\$m\\$ is the number of faces of the polytope. 
Our algorithms borrow heavily from convex duality and recently developed techniques in non-smooth optimization, and are in contrast with existing methods which rely on geometric arguments. In particular, we specialize the excessive gap framework of $\\backslash$citet\\{Nesterov05a\\} to obtain our results.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {0909.1062},\n\teprint       = {0909.1062},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Saha, Vishwanathan, Zhang - 2011 - New Approximation Algorithms for Minimum Enclosing Convex Shapes.pdf:pdf},\n\tmendeley-groups = {Algorithms/Computational Geometry}\n}\n@inproceedings{sahavechaphan06xsnippet,\n\ttitle        = {XSnippet: Mining For Sample Code},\n\tauthor       = {Naiyana Sahavechaphan and Kajal Claypool},\n\tyear         = 2006,\n\tbooktitle    = {Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA)},\n\tvolume       = 41\n}\n@article{sahn2003asset,\n\ttitle        = {Exploring Alternative Measures of Welfare in the Absence of Expenditure Data},\n\tauthor       = {David E. 
Sahn and David Stifel},\n\tyear         = 2003,\n\tjournal      = {The Review of Income and Wealth},\n\tvolume       = 49\n}\n@inproceedings{sahuguet1999wysiwyg,\n\ttitle        = {{WysiWyg} Web Wrapper Factory ({W4F})},\n\tauthor       = {Arnaud Sahuguet and Fabien Azavant},\n\tyear         = 1999,\n\tbooktitle    = {WWW Conference}\n}\n@inproceedings{saito2017asymmetric,\n\ttitle        = {Asymmetric tri-training for unsupervised domain adaptation},\n\tauthor       = {Kuniaki Saito and Yoshitaka Ushiku and Tatsuya Harada},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {2988--2997}\n}\n@inproceedings{saito2018maximum,\n\ttitle        = {Maximum classifier discrepancy for unsupervised domain adaptation},\n\tauthor       = {Kuniaki Saito and Kohei Watanabe and Yoshitaka Ushiku and Tatsuya Harada},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {3723--3732}\n}\n@article{saito2021tune,\n\ttitle        = {Tune it the Right Way: Unsupervised Validation of Domain Adaptation via Soft Neighborhood Density},\n\tauthor       = {Kuniaki Saito and Donghyun Kim and Piotr Teterwak and Stan Sclaroff and Trevor Darrell and Kate Saenko},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2108.10860}\n}\n@inproceedings{sajjadi2016regularization,\n\ttitle        = {Regularization with stochastic transformations and perturbations for deep semi-supervised learning},\n\tauthor       = {Mehdi Sajjadi and Mehran Javanmardi and Tolga Tasdizen},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1163--1171}\n}\n@article{sajjadi2018precision,\n\ttitle        = {Assessing Generative Models via Precision and Recall},\n\tauthor       = {Mehdi S. M. 
Sajjadi and Olivier Bachem and Mario Lucic and Olivier Bousquet and Sylvain Gelly},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.00035}\n}\n@inproceedings{sakaguchi2017robsut,\n\ttitle        = {Robsut Wrod Reocginiton via Semi-Character Recurrent Neural Network},\n\tauthor       = {Keisuke Sakaguchi and Kevin Duh and Matt Post and Benjamin Van Durme},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{sakai2008information,\n\ttitle        = {On information retrieval metrics designed for evaluation with incomplete relevance assessments},\n\tauthor       = {Tetsuya Sakai and Noriko Kando},\n\tyear         = 2008,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {447--470}\n}\n@article{sakakibara05grammar,\n\ttitle        = {Grammatical Inference in Bioinformatics},\n\tauthor       = {Yasubumi Sakakibara},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI)},\n\tvolume       = 27,\n\tpages        = {1051--1062}\n}\n@inproceedings{sakurai2005braid,\n\ttitle        = {BRAID: stream mining through group lag correlations},\n\tauthor       = {Sakurai, Yasushi and Papadimitriou, Spiros and Faloutsos, Christos},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the 2005 ACM SIGMOD international conference on Management\n\n\t\tof data\n\t},\n\tlocation     = {Baltimore, Maryland},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGMOD '05},\n\tpages        = {599--610},\n\tdoi          = {http://doi.acm.org/10.1145/1066157.1066226},\n\tisbn         = {1-59593-060-4},\n\tacmid        = 1066226,\n\tnumpages     = 12\n}\n@inproceedings{sakurai2005ftw,\n\ttitle        = {FTW: fast similarity search under the time warping distance},\n\tauthor       = {Sakurai, Yasushi and Yoshikawa, Masatoshi and Faloutsos, 
Christos},\n\tyear         = 2005,\n\tbooktitle    = {\n\t\tProceedings of the twenty-fourth ACM SIGMOD-SIGACT-SIGART symposium\n\n\t\ton Principles of database systems\n\t},\n\tlocation     = {Baltimore, Maryland},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {PODS '05},\n\tpages        = {326--337},\n\tdoi          = {http://doi.acm.org/10.1145/1065167.1065210},\n\tisbn         = {1-59593-062-0},\n\tacmid        = 1065210,\n\tnumpages     = 12\n}\n@inproceedings{sakurai2007stream,\n\ttitle        = {Stream Monitoring under the Time Warping Distance},\n\tauthor       = {Yasushi Sakurai and Christos Faloutsos and Masashi Yamamuro},\n\tyear         = 2007,\n\tmonth        = apr,\n\tbooktitle    = {ICDE 2007. IEEE 23rd International Conference on Data Engineering},\n\taddress      = {Istanbul, Turkey},\n\tpages        = {1046--1055},\n\tdoi          = {10.1109/ICDE.2007.368963}\n}\n@inproceedings{salakhutdinov03ecg,\n\ttitle        = {Optimization with {EM} and expectation-conjugate-gradient},\n\tauthor       = {Ruslan Salakhutdinov and Sam Roweis and Zoubin Ghahramani},\n\tyear         = 2003,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{salakhutdinov09softmax,\n\ttitle        = {Replicated Softmax: an Undirected Topic Model},\n\tauthor       = {Ruslan Salakhutdinov and Geoff Hinton},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{salakhutdinov2008ais,\n\ttitle        = {On the quantitative analysis of deep belief networks},\n\tauthor       = {R. Salakhutdinov and I. 
Murray},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {872--879}\n}\n@inproceedings{salakhutdinov2010collaborative,\n\ttitle        = {Collaborative filtering in a non-uniform world: Learning with the weighted trace norm},\n\tauthor       = {Salakhutdinov, Ruslan and Srebro, Nathan},\n\tyear         = 2010,\n\tbooktitle    = {Proc. of NIPS}\n}\n@inproceedings{salant2018contextualized,\n\ttitle        = {Contextualized Word Representations for Reading Comprehension},\n\tauthor       = {Shimi Salant and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{salimans2013fixed,\n\ttitle        = {Fixed-form variational posterior approximation through stochastic linear regression},\n\tauthor       = {Tim Salimans and David A. Knowles},\n\tyear         = 2013,\n\tjournal      = {Bayesian Analysis},\n\tvolume       = 8,\n\tnumber       = 4,\n\tpages        = {837--882}\n}\n@inproceedings{salimans2015markov,\n\ttitle        = {{M}arkov Chain {M}onte {C}arlo and Variational Inference: Bridging the Gap},\n\tauthor       = {Tim Salimans and Diederik Kingma and Max Welling},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{salimans2016gan,\n\ttitle        = {Improved Techniques for Training GANs},\n\tauthor       = {Tim Salimans and Ian Goodfellow and Wojciech Zaremba and Vicki Cheung and Alec Radford and Xi Chen},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{salimans2016weight,\n\ttitle        = {Weight normalization: A simple reparameterization to accelerate training of deep neural networks},\n\tauthor       = {Salimans, Tim and Kingma, Diederik P},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = 
{901--909}\n}\n@article{salimans2017evolution,\n\ttitle        = {Evolution strategies as a scalable alternative to reinforcement learning},\n\tauthor       = {Salimans, Tim and Ho, Jonathan and Chen, Xi and Sidor, Szymon and Sutskever, Ilya},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.03864}\n}\n@article{salman2020adversarially,\n\ttitle        = {Do Adversarially Robust ImageNet Models Transfer Better?},\n\tauthor       = {Salman, Hadi and Ilyas, Andrew and Engstrom, Logan and Kapoor, Ashish and Madry, Aleksander},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.08489}\n}\n@article{salton1965smart,\n\ttitle        = {The {SMART} automatic document retrieval systems—an illustration},\n\tauthor       = {Gerard Salton and Michael E. Lesk},\n\tyear         = 1965,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 8,\n\tnumber       = 6,\n\tpages        = {391--398}\n}\n@article{samuel1959some,\n\ttitle        = {Some studies in machine learning using the game of checkers},\n\tauthor       = {Samuel, Arthur L},\n\tyear         = 1959,\n\tjournal      = {IBM Journal of research and development},\n\tpublisher    = {IBM},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {210--229}\n}\n@inproceedings{samvelyan2019starcraft,\n\ttitle        = {The StarCraft Multi-Agent Challenge},\n\tauthor       = {Samvelyan, Mikayel and Rashid, Tabish and Schroeder de Witt, Christian and Farquhar, Gregory and Nardelli, Nantas and Rudner, Tim GJ and Hung, Chia-Man and Torr, Philip HS and Foerster, Jakob and Whiteson, Shimon},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 18th International Conference on Autonomous Agents and MultiAgent Systems},\n\tpages        = {2186--2188}\n}\n@manual{sandhaus2008new,\n\ttitle        = {The {N}ew {Y}ork {T}imes annotated corpus},\n\tauthor       = {Evan Sandhaus},\n\tyear         = 2008\n}\n@article{sanh2018hierarchical,\n\ttitle        = {A Hierarchical Multi-task 
Approach for Learning Embeddings from Semantic Tasks},\n\tauthor       = {Victor Sanh and Thomas Wolf and Sebastian Ruder},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.06031}\n}\n@article{sanjabi2018solving,\n\ttitle        = {Solving Approximate Wasserstein {GANs} to Stationarity},\n\tauthor       = {Sanjabi, Maziar and Ba, Jimmy and Razaviyayn, Meisam and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {Neural Information Processing Systems (NIPS)}\n}\n@article{sankaran2016temporal,\n\ttitle        = {Temporal Attention Model for Neural Machine Translation},\n\tauthor       = {Baskaran Sankaran and Haitao Mi and Yaser Al-Onaizan and Abe Ittycheriah},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@article{sannai2019universal,\n\ttitle        = {Universal approximations of permutation invariant/equivariant functions by deep neural networks},\n\tauthor       = {Sannai, Akiyoshi and Takai, Yuuki and Cordonnier, Matthieu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.01939}\n}\n@article{santoro2016one,\n\ttitle        = {One-shot learning with memory-augmented neural networks},\n\tauthor       = {Adam Santoro and Sergey Bartunov and Matthew Botvinick and Daan Wierstra and Timothy Lillicrap},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.06065}\n}\n@article{santoro2017simple,\n\ttitle        = {A simple neural network module for relational reasoning},\n\tauthor       = {Adam Santoro and David Raposo and David GT Barrett and Mateusz Malinowski and Razvan Pascanu and Peter Battaglia and Timothy Lillicrap},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{santurkar2018does,\n\ttitle        = {How Does Batch Normalization Help Optimization?},\n\tauthor       = {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew and Madry, Aleksander},\n\tyear         = 2018,\n\tjournal      = {Advances in neural information processing systems},\n\tnumber       = 
31\n}\n@article{santurkar2020breeds,\n\ttitle        = {BREEDS: Benchmarks for Subpopulation Shift},\n\tauthor       = {Shibani Santurkar and Dimitris Tsipras and Aleksander Madry},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{sap2019atomic,\n\ttitle        = {ATOMIC: An Atlas of Machine Commonsense for If-Then Reasoning},\n\tauthor       = {Maarten Sap and Ronan LeBras and Emily Allaway and Chandra Bhagavatula and Nicholas Lourie and Hannah Rashkin and Brendan Roof and Noah A. Smith and Yejin Choi},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{sap2019risk,\n\ttitle        = {The Risk of Racial Bias in Hate Speech Detection},\n\tauthor       = {Maarten Sap and Dallas Card and Saadia Gabriel and Yejin Choi and Noah A Smith},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{sapiezynski2017,\n\ttitle        = {Academic performance prediction in a gender-imbalanced environment},\n\tauthor       = {Piotr Sapiezynski and Valentin Kassarnig and Christo Wilson and Sune Lehmann and Alan Mislove},\n\tyear         = 2017,\n\tbooktitle    = {FATREC},\n\tvolume       = 1,\n\tpages        = {48--51}\n}\n@inproceedings{sapp2010cascaded,\n\ttitle        = {Cascaded models for articulated pose estimation},\n\tauthor       = {B Sapp and A Toshev and B Taskar},\n\tyear         = 2010,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {406--420}\n}\n@inproceedings{sarawagi2014open,\n\ttitle        = {Open-domain quantity queries on web tables: annotation, response, and consensus models},\n\tauthor       = {Sunita Sarawagi and Soumen Chakrabarti},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {711--720}\n}\n@inproceedings{SaReOlukotun2015-online1PCA,\n\ttitle        = {Global Convergence of 
Stochastic Gradient Descent for Some Non-convex Matrix Problems},\n\tauthor       = {Sa, Christopher De and Re, Christopher and Olukotun, Kunle},\n\tyear         = 2015,\n\tbooktitle    = {ICML},\n\tpages        = {2332--2341}\n}\n@article{sargan1958estimation,\n\ttitle        = {The estimation of economic relationships using instrumental variables},\n\tauthor       = {John D. Sargan},\n\tyear         = 1958,\n\tjournal      = {Econometrica},\n\tpages        = {393--415}\n}\n@article{sargan1959estimation,\n\ttitle        = {The estimation of relationships with autocorrelated residuals by the use of instrumental variables},\n\tauthor       = {John D. Sargan},\n\tyear         = 1959,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tpages        = {91--105}\n}\n@inproceedings{sarlos2006improved,\n\ttitle        = {Improved approximation algorithms for large matrices via random projections},\n\tauthor       = {Tamas Sarlos},\n\tyear         = 2006,\n\tbooktitle    = {Foundations of Computer Science (FOCS)},\n\tpages        = {143--152}\n}\n@article{sato00online,\n\ttitle        = {On-line {EM} Algorithm for the Normalized {G}aussian Network},\n\tauthor       = {Masa-aki Sato and Shin Ishii},\n\tyear         = 2000,\n\tjournal      = {Neural Computation},\n\tvolume       = 12,\n\tpages        = {407--432}\n}\n@article{saul96sigmoid,\n\ttitle        = {Mean Field Theory for Sigmoid Belief Networks},\n\tauthor       = {L. Saul and T. Jaakkola and M. I. 
Jordan},\n\tyear         = 1996,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 4,\n\tpages        = {61--76}\n}\n@article{saunshi2020mathematical,\n\ttitle        = {A Mathematical Exploration of Why Language Models Help Solve Downstream Tasks},\n\tauthor       = {Saunshi, Nikunj and Malladi, Sadhika and Arora, Sanjeev},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.03648}\n}\n@article{savage1971elicitation,\n\ttitle        = {Elicitation of personal probabilities and expectations},\n\tauthor       = {Savage, Leonard J},\n\tyear         = 1971,\n\tjournal      = {Journal of the American Statistical Association},\n\tpublisher    = {Taylor \\& Francis},\n\tvolume       = 66,\n\tnumber       = 336,\n\tpages        = {783--801}\n}\n@inproceedings{savage1998models,\n\ttitle        = {Models of computation - exploring the power of computing},\n\tauthor       = {J. Savage},\n\tyear         = 1998\n}\n@article{savarese2019function,\n\ttitle        = {How do infinite width bounded norm networks look in function space?},\n\tauthor       = {Pedro Savarese and Itay Evron and Daniel Soudry and Nathan Srebro},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@article{savinov2018semi,\n\ttitle        = {Semi-parametric topological memory for navigation},\n\tauthor       = {Savinov, Nikolay and Dosovitskiy, Alexey and Koltun, Vladlen},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.00653}\n}\n@inproceedings{saxe2007causal,\n\ttitle        = {Knowing Who Dunnit: Infants Identify the Causal Agent in an Unseen Causal Interaction},\n\tauthor       = {Rebecca Saxe and Tania Tzelnic and Susan Carey},\n\tyear         = 2007,\n\tbooktitle    = {Developmental Psychology}\n}\n@inproceedings{saxena07scene,\n\ttitle        = {Learning 3-{D} Scene Structure from a Single Still Image},\n\tauthor       = {Ashutosh Saxena and Min Sun and Andrew Y. 
Ng},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@article{saxena2014robobrain,\n\ttitle        = {RoboBrain: Large-Scale Knowledge Engine for Robots},\n\tauthor       = {Ashutosh Saxena and Ashesh Jain and Ozan Sener and Aditya Jami and Dipendra Kumar Misra and Hema S Koppula},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.0691}\n}\n@inproceedings{SBG10,\n\ttitle        = {Reduced-Rank Hidden {M}arkov Models},\n\tauthor       = {S. M. Siddiqi and B. Boots and G. J. Gordon},\n\tyear         = 2010,\n\tbooktitle    = {AISTATS}\n}\n@article{sbsbcv17,\n\ttitle        = {Recent Advances in Recurrent Neural Networks},\n\tauthor       = {Salehinejad, Hojjat and Baarbe, Julianne and Sankar, Sharan and Barfett, Joseph and Colak, Errol and Valaee, Shahrokh},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1801.01078}\n}\n@article{sc16,\n\ttitle        = {No bad local minima: Data independent training error guarantees for multilayer neural networks},\n\tauthor       = {Soudry, Daniel and Carmon, Yair},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.08361}\n}\n@article{scalise2018natural,\n\ttitle        = {Natural language instructions for human–robot collaborative manipulation},\n\tauthor       = {Rosario Scalise and Shen Li and H. Admoni and Stephanie Rosenthal and S. 
Srinivasa},\n\tyear         = 2018,\n\tjournal      = {International Journal of Robotics Research (IJRR)},\n\tvolume       = 37,\n\tpages        = {558--565}\n}\n@inproceedings{scaria2013biological,\n\ttitle        = {Learning biological processes with global constraints},\n\tauthor       = {Aju Thalappillil Scaria and Jonathan Berant and Mengqiu Wang and Christopher D Manning and Justin Lewis and Brittany Harding and Peter Clark},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{scarselli2008computational,\n\ttitle        = {Computational capabilities of graph neural networks},\n\tauthor       = {Scarselli, Franco and Gori, Marco and Tsoi, Ah Chung and Hagenbuchner, Markus and Monfardini, Gabriele},\n\tyear         = 2008,\n\tjournal      = {IEEE Transactions on Neural Networks},\n\tpublisher    = {IEEE},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {81--102}\n}\n@article{scassellati2012robots,\n\ttitle        = {Robots for Use in Autism Research},\n\tauthor       = {Brian Scassellati and Henny Admoni and Maja Mataric},\n\tyear         = 2012,\n\tjournal      = {Annual review of biomedical engineering},\n\tvolume       = 14,\n\tpages        = {275--294}\n}\n@article{schaeffer1941,\n\ttitle        = {Inequalities of A. Markoff and S. Bernstein for polynomials and related functions},\n\tauthor       = {Schaeffer, A. C.},\n\tyear         = 1941,\n\tmonth        = {08},\n\tjournal      = {Bull. Amer. Math. 
Soc.},\n\tpublisher    = {American Mathematical Society},\n\tvolume       = 47,\n\tnumber       = 8,\n\tpages        = {565--579},\n\turl          = {http://projecteuclid.org/euclid.bams/1183503783},\n\tfjournal     = {Bulletin of the American Mathematical Society}\n}\n@article{schafer2007recurrent,\n\ttitle        = {Recurrent neural networks are universal approximators},\n\tauthor       = {Sch{\\\"a}fer, Anton Maximilian and Zimmermann, Hans-Georg},\n\tyear         = 2007,\n\tjournal      = {International journal of neural systems},\n\tpublisher    = {World Scientific},\n\tvolume       = 17,\n\tnumber       = {04},\n\tpages        = {253--263}\n}\n@phdthesis{schain2015robustness,\n\ttitle        = {Machine Learning Algorithms and Robustness},\n\tauthor       = {Mariano Schain},\n\tyear         = 2015,\n\tschool       = {Tel Aviv University}\n}\n@inproceedings{schapire2002prior,\n\ttitle        = {Incorporating Prior Knowledge into Boosting},\n\tauthor       = {Robert E. Schapire and Marie Rochery and Mazin G. Rahim and Narendra Gupta},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{schatzmann2006survey,\n\ttitle        = {A survey of statistical user simulation techniques for reinforcement-learning of dialogue management strategies},\n\tauthor       = {Jost Schatzmann and Karl Weilhammer and Matt Stuttle and Steve Young},\n\tyear         = 2006,\n\tjournal      = {The knowledge engineering review},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {97--126}\n}\n@inproceedings{schaul14unittests,\n\ttitle        = {Unit Tests for Stochastic Optimization},\n\tauthor       = {Tom Schaul and Ioannis Antonoglou and David Silver},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{schaul2015prioritized,\n\ttitle        = {Prioritized experience replay},\n\tauthor       = {T. Schaul and J. Quan and I. Antonoglou and D. 
Silver},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{schaul2015uvf,\n\ttitle        = {Universal Value Function Approximators},\n\tauthor       = {Tom Schaul and Dan Horgan and K. Gregor and D. Silver},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{scheffer2001active,\n\ttitle        = {Active hidden {M}arkov models for information extraction},\n\tauthor       = {Tobias Scheffer and Christian Decomain and Stefan Wrobel},\n\tyear         = 2001,\n\tbooktitle    = {International Symposium on Intelligent Data Analysis},\n\tpages        = {309--318}\n}\n@phdthesis{schein05active,\n\ttitle        = {Active Learning for Logistic Regression},\n\tauthor       = {Andrew I. Schein},\n\tyear         = 2005,\n\tschool       = {University of Pennsylvania}\n}\n@article{schein2007active,\n\ttitle        = {Active learning for logistic regression: An evaluation},\n\tauthor       = {A. Schein and Lyle H. 
Ungar},\n\tyear         = 2007,\n\tjournal      = {Machine Learning},\n\tvolume       = 68,\n\tpages        = {235--265}\n}\n@inproceedings{scherrer2013improved,\n\ttitle        = {Improved and generalized upper bounds on the complexity of policy iteration},\n\tauthor       = {Scherrer, Bruno},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {386--394},\n\tdate-added   = {2017-05-19 05:08:36 +0000},\n\tdate-modified = {2017-05-19 05:08:36 +0000}\n}\n@inproceedings{scherrer2014approximate,\n\ttitle        = {Approximate policy iteration schemes: a comparison},\n\tauthor       = {Scherrer, Bruno},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1314--1322}\n}\n@inproceedings{scherrer2014local,\n\ttitle        = {Local policy search in a convex space and conservative policy iteration as boosted policy search},\n\tauthor       = {Scherrer, Bruno and Geist, Matthieu},\n\tyear         = 2014,\n\tbooktitle    = {Joint European Conference on Machine Learning and Knowledge Discovery in Databases},\n\tpages        = {35--50},\n\torganization = {Springer}\n}\n@book{Schilders2008model,\n\ttitle        = {Model order reduction: theory, research aspects and applications},\n\tauthor       = {Schilders, Wilhelmus H.A. and Van der Vorst, Henk A. and Rommes, Joost},\n\tyear         = 2008,\n\tpublisher    = {Springer},\n\tvolume       = 13\n}\n@inproceedings{schkufza2013stochastic,\n\ttitle        = {Stochastic Superoptimization},\n\tauthor       = {Eric Schkufza and Rahul Sharma and Alex Aiken},\n\tyear         = 2013,\n\tbooktitle    = {Architectural Support for Programming Languages and Operating Systems (ASPLOS)}\n}\n@phdthesis{schmidhuber1987evolutionary,\n\ttitle        = {Evolutionary principles in self-referential learning, or on learning how to learn: the meta-meta-... 
hook},\n\tauthor       = {J{\\\"u}rgen Schmidhuber},\n\tyear         = 1987,\n\tschool       = {Technische Universit{\\\"a}t M{\\\"u}nchen}\n}\n@techreport{schmidhuber1991adaptive,\n\ttitle        = {Adaptive Confidence And Adaptive Curiosity},\n\tauthor       = {J{\\\"{u}}rgen Schmidhuber},\n\tyear         = 1991,\n\tinstitution  = {Institut fur Informatik, Technische Universitat Munchen, Arcisstr. 21, 800 Munchen 2}\n}\n@article{schmidhuber1991learning,\n\ttitle        = {Learning to generate artificial fovea trajectories for target detection},\n\tauthor       = {Juergen Schmidhuber and Rudolf Huber},\n\tyear         = 1991,\n\tjournal      = {International Journal of Neural Systems},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {125--134}\n}\n@article{schmidhuber1992learning,\n\ttitle        = {Learning to control fast-weight memories: An alternative to dynamic recurrent networks},\n\tauthor       = {J{\\\"u}rgen Schmidhuber},\n\tyear         = 1992,\n\tjournal      = {Neural Computation},\n\tvolume       = 4,\n\tnumber       = 1,\n\tpages        = {131--139}\n}\n@inproceedings{schmidhuber1993planning,\n\ttitle        = {Planning simple trajectories using neural subgoal generators},\n\tauthor       = {J{\\\"u}rgen Schmidhuber},\n\tyear         = 1993,\n\tbooktitle    = {From Animals to Animats 2: Proceedings of the Second International Conference on Simulation of Adaptive Behavior},\n\tvolume       = 2\n}\n@article{schmidhuber2007evolino,\n\ttitle        = {Training recurrent networks by {E}volino},\n\tauthor       = {J{\\\"u}rgen Schmidhuber and Daan Wierstra and Matteo Gagliolo and Faustino Gomez},\n\tyear         = 2007,\n\tjournal      = {Neural Computation},\n\tvolume       = 19,\n\tnumber       = 3,\n\tpages        = {757--779}\n}\n@article{Schmidt2013-SAG,\n\ttitle        = {{Minimizing finite sums with the stochastic average gradient}},\n\tauthor       = {Schmidt, Mark and {Le Roux}, Nicolas and Bach, Francis},\n\tyear         = 
2013,\n\tjournal      = {arXiv preprint arXiv:1309.2388},\n\tpages        = {1--45},\n\turl          = {http://arxiv.org/abs/1309.2388},\n\tnote         = {Preliminary version appeared in NIPS 2012},\n\tabstract     = {We propose the stochastic average gradient (SAG) method for optimizing the sum of a finite number of smooth convex functions. Like stochastic gradient (SG) methods, the SAG method's iteration cost is independent of the number of terms in the sum. However, by incorporating a memory of previous gradient values the SAG method achieves a faster convergence rate than black-box SG methods. The convergence rate is improved from O(1/k\\^{}\\{1/2\\}) to O(1/k) in general, and when the sum is strongly-convex the convergence rate is improved from the sub-linear O(1/k) to a linear convergence rate of the form O(p\\^{}k) for p < 1. Further, in many cases the convergence rate of the new method is also faster than black-box deterministic gradient methods, in terms of the number of gradient evaluations. 
Numerical experiments indicate that the new algorithm often dramatically outperforms existing SG and deterministic gradient methods, and that the performance may be further improved through the use of non-uniform sampling strategies.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1309.2388},\n\teprint       = {1309.2388},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/6bb9f6695c64ca57938706579bcdff9c8712f8e9.pdf:pdf},\n\tmendeley-groups = {Optimization/Variance Reduction}\n}\n@inproceedings{schmidt2018adversarially,\n\ttitle        = {Adversarially robust generalization requires more data},\n\tauthor       = {Ludwig Schmidt and Shibani Santurkar and Dimitris Tsipras and Kunal Talwar and Aleksander Madry},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {5014--5026}\n}\n@article{schnaufer1997adaptive,\n\ttitle        = {Adaptive fault tolerance for reliable {LMS} adaptive filtering},\n\tauthor       = {Bernard A Schnaufer and W Jenkins},\n\tyear         = 1997,\n\tjournal      = {IEEE Transactions on Circuits and Systems},\n\tvolume       = 44,\n\tnumber       = 12,\n\tpages        = {1001--1014}\n}\n@article{schoenberg1942positive,\n\ttitle        = {Positive definite functions on spheres},\n\tauthor       = {Isaac J Schoenberg},\n\tyear         = 1942,\n\tjournal      = {Duke Mathematical Journal},\n\tvolume       = 9,\n\tpages        = {96--108}\n}\n@article{schoenfinkel24combinatory,\n\ttitle        = {Über die Bausteine der mathematischen Logik},\n\tauthor       = {Moses Schönfinkel},\n\tyear         = 1924,\n\tjournal      = {Mathematische Annalen},\n\tvolume       = 92,\n\tpages        = {305--316}\n}\n@article{schoenholz2016deep,\n\ttitle        = {Deep information propagation},\n\tauthor       = {Schoenholz, Samuel S and Gilmer, Justin and Ganguli, Surya and Sohl-Dickstein, Jascha},\n\tyear         = 2016,\n\tjournal      = 
{arXiv preprint arXiv:1611.01232},\n\tbooktitle    = {ICLR},\n\turl          = {https://openreview.net/pdf?id=H1W1UN9gg}\n}\n@article{schoenick2016moving,\n\ttitle        = {Moving Beyond the {T}uring Test with the {A}llen {AI} Science Challenge},\n\tauthor       = {Carissa Schoenick and Peter Clark and Oyvind Tafjord and Peter Turney and Oren Etzioni},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1604.04315}\n}\n@inproceedings{schoenmackers10horn,\n\ttitle        = {Learning First-Order Horn Clauses from Web Text},\n\tauthor       = {Stefan Schoenmackers and Oren Etzioni and Daniel S. Weld and Jesse Davis},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{scholkopf1997improving,\n\ttitle        = {Improving the accuracy and speed of support vector machines},\n\tauthor       = {Simard P Sch{\\\"o}lkopf and Patrice Simard and Vladimir Vapnik and AJ Smola},\n\tyear         = 1997,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {375--381}\n}\n@inproceedings{scholkopf1999support,\n\ttitle        = {Support vector method for novelty detection},\n\tauthor       = {Bernhard Sch{\\\"o}lkopf and Robert Williamson and Alex Smola and John Shawe-Taylor and John Platt},\n\tyear         = 1999,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{schrammsteurer17,\n\ttitle        = {Fast and robust tensor decomposition with applications to dictionary learning},\n\tauthor       = {Schramm, Tselil and Steurer, David},\n\tyear         = 2017,\n\tmonth        = {07--10 Jul},\n\tbooktitle    = {Proceedings of the 2017 Conference on Learning Theory},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 65,\n\tpages        = {1760--1793},\n\turl          = {https://proceedings.mlr.press/v65/schramm17a.html},\n\teditor       = {Kale, Satyen and Shamir, 
Ohad},\n\tpdf          = {http://proceedings.mlr.press/v65/schramm17a/schramm17a.pdf},\n\tabstract     = {We develop fast spectral algorithms for tensor decomposition that match the robustness guarantees of the best known polynomial-time algorithms for this problem based on the sum-of-squares (SOS) semidefinite programming hierarchy. Our algorithms can decompose a 4-tensor with $n$-dimensional orthonormal components in the presence of error with constant spectral norm (when viewed as an $n^2$-by-$n^2$ matrix).  The running time is $n^5$ which is close to linear in the input size $n^4$. We also obtain algorithms with similar running time to learn sparsely-used orthogonal dictionaries even when feature representations have constant relative sparsity and non-independent coordinates. The only previous polynomial-time algorithms to solve these problem are based on solving large semidefinite programs.  In contrast, our algorithms are easy to implement directly and are based on spectral projections and tensor-mode rearrangements. Or work is inspired by recent of Hopkins, Schramm, Shi, and Steurer (STOC’16) that shows how fast spectral algorithms can achieve the guarantees of SOS for average-case problems.  In this work, we introduce general techniques to capture the guarantees of SOS for worst-case problems.}\n}\n@inproceedings{schulam2019can,\n\ttitle        = {Can You Trust This Prediction? 
{Auditing} Pointwise Reliability After Learning},\n\tauthor       = {Peter Schulam and Suchi Saria},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {1022--1031}\n}\n@inproceedings{schuler03interpretation,\n\ttitle        = {Using model-theoretic semantic interpretation to guide statistical parsing and word recognition in a spoken language interface},\n\tauthor       = {William Schuler},\n\tyear         = 2003,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {529--536}\n}\n@inproceedings{schulman15trust,\n\ttitle        = {Trust Region Policy Optimization},\n\tauthor       = {John Schulman and Sergey Levine and Pieter Abbeel and Michael I. Jordan and Philipp Moritz},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Thirty-Second International Conference on Machine Learning (ICML-15)},\n\tpages        = {1889--1897}\n}\n@inproceedings{schulman2015trust,\n\ttitle        = {Trust region policy optimization},\n\tauthor       = {Schulman, John and Levine, Sergey and Abbeel, Pieter and Jordan, Michael and Moritz, Philipp},\n\tyear         = 2015,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {1889--1897}\n}\n@article{schulman2017ppo,\n\ttitle        = {Proximal Policy Optimization Algorithms},\n\tauthor       = {John Schulman and Filip Wolski and Prafulla Dhariwal and Alec Radford and Oleg Klimov},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.06347}\n}\n@article{schulman2017proximal,\n\ttitle        = {Proximal policy optimization algorithms},\n\tauthor       = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.06347}\n}\n@inproceedings{schulz2008unobserved,\n\ttitle        = {Going beyond the evidence: Abstract laws and preschoolers’ responses to anomalous data},\n\tauthor       = 
{Laura E. Schulz and Noah D. Goodman and Joshua B. Tenenbaum and Adrianna C. Jenkins},\n\tyear         = 2008,\n\tbooktitle    = {Cognition}\n}\n@article{schuurmans2020intent,\n\ttitle        = {Intent Classification for Dialogue Utterances},\n\tauthor       = {J. Schuurmans and F. Frasincar and E. Cambria},\n\tyear         = 2020,\n\tjournal      = {IEEE Intelligent Systems},\n\tvolume       = 35,\n\tpages        = {82--88}\n}\n@article{schwartz2006ranking,\n\ttitle        = {Efficient learning of label ranking by soft projections onto polyhedra},\n\tauthor       = {S. Shalev-Schwartz and Y. Singer},\n\tyear         = 2006,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 7\n}\n@inproceedings{schwartz2017roc,\n\ttitle        = {The Effect of Different Writing Tasks on Linguistic Style: A Case Study of the {ROC} Story Cloze Task},\n\tauthor       = {Roy Schwartz and Maarten Sap and Yannis Konstas and Li Zilles and Yejin Choi and Noah A. Smith},\n\tyear         = 2017,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@article{schweitzer85generalized,\n\ttitle        = {Generalized Polynomial Approximations in {Markovian} Decision Processes},\n\tauthor       = {Paul J. 
Schweitzer and Abraham Seidmann},\n\tyear         = 1985,\n\tjournal      = {Journal of Mathematical Analysis and Applications},\n\tvolume       = 110,\n\tnumber       = 2,\n\tpages        = {568--582}\n}\n@inproceedings{schwitter2010controlled,\n\ttitle        = {Controlled natural languages for knowledge representation},\n\tauthor       = {Rolf Schwitter},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {1113--1121}\n}\n@article{SCORE,\n\ttitle        = {Fast community detection by {SCORE}},\n\tauthor       = {Jin, Jiashun},\n\tyear         = 2015,\n\tjournal      = {Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 43,\n\tnumber       = 1,\n\tpages        = {57--89}\n}\n@article{scudder1965probability,\n\ttitle        = {Probability of error of some adaptive pattern-recognition machines},\n\tauthor       = {H Scudder},\n\tyear         = 1965,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 11,\n\tnumber       = 3,\n\tpages        = {363--371}\n}\n@inproceedings{sculley2007online,\n\ttitle        = {Online Active Learning Methods for Fast Label-Efficient Spam Filtering},\n\tauthor       = {D Sculley},\n\tyear         = 2007,\n\tbooktitle    = {Conference on Email and Anti-spam (CEAS)}\n}\n@inproceedings{sculley2015hidden,\n\ttitle        = {Hidden Technical Debt in Machine Learning Systems},\n\tauthor       = {D. 
Sculley and Gary Holt and Daniel Golovin and Eugene Davydov and Todd Phillips and Dietmar Ebner and Vinay Chaudhary and Michael Young and Jean-Fran{\\c{c}}ois Crespo and Dan Dennison},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2494--2502}\n}\n@article{SDCA,\n\ttitle        = {Stochastic dual coordinate ascent methods for regularized loss},\n\tauthor       = {Shalev-Shwartz, Shai and Zhang, Tong},\n\tyear         = 2013,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 14,\n\tnumber       = 1,\n\tpages        = {567--599}\n}\n@article{sedghi2014provable,\n\ttitle        = {Provable methods for training neural networks with sparse connectivity},\n\tauthor       = {Hanie Sedghi and Anima Anandkumar},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.2693}\n}\n@article{sedghi2016training,\n\ttitle        = {Training Input-Output Recurrent Neural Networks through Spectral Methods},\n\tauthor       = {Hanie Sedghi and Anima Anandkumar},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1603.00954}\n}\n@inproceedings{see2017point,\n\ttitle        = {Get To The Point: Summarization with Pointer-Generator Networks},\n\tauthor       = {Abigail See and Peter J. Liu and Christopher D. 
Manning},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{seeger08cs,\n\ttitle        = {Compressed Sensing and {B}ayesian Experimental Design},\n\tauthor       = {Matthias Seeger and Hannes Nickisch},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {912--919}\n}\n@article{segal2020multispan,\n\ttitle        = {A Simple and Effective Model for Answering Multi-span Questions},\n\tauthor       = {Elad Segal and  Avia Efrat and Mor Shoham and Amir Globerson and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1909.13375}\n}\n@inproceedings{seginer07fast,\n\ttitle        = {Fast Unsupervised Incremental Parsing},\n\tauthor       = {Yoav Seginer},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@book{seife2010proofiness,\n\ttitle        = {Proofiness: How you're being fooled by the numbers},\n\tauthor       = {Charles Seife},\n\tyear         = 2010,\n\tpublisher    = {Penguin}\n}\n@phdthesis{seigel2013confidence,\n\ttitle        = {Confidence Estimation for Automatic Speech Recognition Hypotheses},\n\tauthor       = {Matthew Seigel},\n\tyear         = 2013,\n\tschool       = {University of Cambridge}\n}\n@inproceedings{selsam2017bugfree,\n\ttitle        = {Developing Bug-Free Machine Learning Systems With Formal Mathematics},\n\tauthor       = {Daniel Selsam and Percy Liang and David Dill},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{selsam2019sat,\n\ttitle        = {Learning a {SAT} Solver from Single-Bit Supervision},\n\tauthor       = {Daniel Selsam and Matthew Lamm and Benedikt Bünz and Percy Liang and Leonardo de Moura and David L. 
Dill},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{selvaraju2017grad,\n\ttitle        = {Grad-cam: Visual explanations from deep networks via gradient-based localization},\n\tauthor       = {Selvaraju, Ramprasaath R and Cogswell, Michael and Das, Abhishek and Vedantam, Ramakrishna and Parikh, Devi and Batra, Dhruv},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the IEEE international conference on computer vision},\n\tpages        = {618--626}\n}\n@article{semanticmachines2020dataflow,\n\ttitle        = {Task-Oriented Dialogue as Dataflow Synthesis},\n\tauthor       = {Semantic Machines and Jacob Andreas and John Bufe and David Burkett and Charles Chen and Josh Clausman and Jean Crawford and Kate Crim and Jordan DeLoach and Leah Dorner and Jason Eisner and Hao Fang and Alan Guo and David Hall and Kristin Hayes and Kellie Hill and Diana Ho and Wendy Iwaszuk and Smriti Jha and Dan Klein and Jayant Krishnamurthy and Theo Lanman and Percy Liang and Christopher H. 
Lin and Ilya Lintsbakh and Andy McGovern and Aleksandr Nisnevich and Adam Pauls and Dmitrij Petters and Brent Read and Dan Roth and Subhro Roy and Jesse Rusak and Beth Short and Div Slomin and Ben Snyder and Stephon Striplin and Yu Su and Zachary Tellman and Sam Thomson and Andrei Vorobev and Izabela Witoszko and Jason Wolfe and Abby Wray and Yuchen Zhang and Alexander Zotov},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 8\n}\n@inproceedings{semeval2017pun,\n\ttitle        = {Idiom Savant at {S}em{E}val-2017 Task 7: Detection and Interpretation of {E}nglish Puns},\n\tauthor       = {Samuel Doogan and Aniruddha Ghosh and Hanyang Chen and Tony Veale},\n\tyear         = 2017,\n\tbooktitle    = {The 11th International Workshop on Semantic Evaluation}\n}\n@article{sen2016race,\n\ttitle        = {Race as a bundle of sticks: Designs that estimate effects of seemingly immutable characteristics},\n\tauthor       = {Maya Sen and Omar Wasow},\n\tyear         = 2016,\n\tjournal      = {Annual Review of Political Science},\n\tvolume       = 19,\n\tnumber       = 1,\n\tpages        = {499--522}\n}\n@inproceedings{sener2018active,\n\ttitle        = {Active learning for convolutional neural networks: A core-set approach},\n\tauthor       = {Ozan Sener and Silvio Savarese},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{senior2020protein,\n\ttitle        = {Improved protein structure prediction using potentials from deep learning},\n\tauthor       = {Andrew W. Senior and Richard Evans and John Jumper and James Kirkpatrick and Laurent Sifre and Tim Green and Chongli Qin and Augustin Žídek and Alexander W. R. Nelson and Alex Bridgland and Hugo Penedones and Stig Petersen and Karen Simonyan and Steve Crossan and Pushmeet Kohli and David T. 
Jones and David Silver and Koray Kavukcuoglu and Demis Hassabis},\n\tyear         = 2020,\n\tjournal      = {Nature},\n\tvolume       = 577\n}\n@inproceedings{sennrich2016bpe,\n\ttitle        = {Neural Machine Translation of Rare Words with Subword Units},\n\tauthor       = {Rico Sennrich and Barry Haddow and Alexandra Birch},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{sennrich2016monolingual,\n\ttitle        = {Improving neural machine translation models with monolingual data},\n\tauthor       = {R. Sennrich and B. Haddow and A. Birch},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@misc{sensirion2010datasheet,\n\ttitle        = {Datasheet SHT1x (SHT10, SHT11, SHT15) - Humidity and Temperature Sensor},\n\tauthor       = {Sensirion},\n\tyear         = 2010,\n\thowpublished = {Available at \\url{http://www.sensirion.com/en/pdf/product_information/Datasheet-humidity-sensor-SHT1x.pdf}}\n}\n@inproceedings{seo2014program,\n\ttitle        = {Programmers' Build Errors: A Case Study at Google},\n\tauthor       = {Hyunmin Seo and Caitlin Sadowski and Sebastian Elbaum and Edward Aftandilian and Robert Bowdidge},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Software Engineering (ICSE)}\n}\n@inproceedings{seo2017bidaf,\n\ttitle        = {Bidirectional Attention Flow for Machine Comprehension},\n\tauthor       = {Minjoon Seo and Aniruddha Kembhavi and Ali Farhadi and Hannaneh Hajishirzi},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{seo2018neural,\n\ttitle        = {Neural Speed Reading via Skim-{RNN}},\n\tauthor       = {Minjoon Seo and Sewon Min and Ali Farhadi and Hannaneh Hajishirzi},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{serban2015building,\n\ttitle        = {Building End-To-End Dialogue Systems Using 
Generative Hierarchical Neural Network Models},\n\tauthor       = {Iulian V Serban and Alessandro Sordoni and Yoshua Bengio and Aaron Courville and Joelle Pineau},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1507.04808}\n}\n@article{serban2015survey,\n\ttitle        = {A Survey of Available Corpora for Building Data-Driven Dialogue Systems},\n\tauthor       = {Iulian Vlad Serban and Ryan Lowe and Laurent Charlin and Joelle Pineau},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1512.05742}\n}\n@article{serban2017deep,\n\ttitle        = {A deep reinforcement learning chatbot},\n\tauthor       = {Iulian V Serban and Chinnadhurai Sankar and Mathieu Germain and Saizheng Zhang and Zhouhan Lin and Sandeep Subramanian and Taesup Kim and Michael Pieper and Sarath Chandar and Nan Rosemary Ke and others},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.02349}\n}\n@inproceedings{serban2017hierarchical,\n\ttitle        = {A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues},\n\tauthor       = {Iulian Serban and Alessandro Sordoni and Ryan Lowe and Laurent Charlin and Joelle Pineau and Aaron C. Courville and Yoshua Bengio},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{serban2017multiresolution,\n\ttitle        = {Multiresolution Recurrent Neural Networks: An Application to Dialogue Response Generation},\n\tauthor       = {Iulian Serban and Tim Klinger and Gerald Tesauro and Kartik Talamadupula and Bowen Zhou and Yoshua Bengio and Aaron C. 
Courville},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{sethi2017reliable,\n\ttitle        = {On the reliable detection of concept drift from streaming unlabeled data},\n\tauthor       = {Tegjyot Singh Sethi and Mehmed Kantardzic},\n\tyear         = 2017,\n\tjournal      = {Expert Systems with Applications},\n\tvolume       = 82,\n\tpages        = {77--99}\n}\n@article{sethuraman94stick,\n\ttitle        = {A Constructive Definition of {D}irichlet Priors},\n\tauthor       = {J. Sethuraman},\n\tyear         = 1994,\n\tjournal      = {Statistica Sinica},\n\tvolume       = 4,\n\tpages        = {639--650}\n}\n@inproceedings{settles2008analysis,\n\ttitle        = {An analysis of active learning strategies for sequence labeling tasks},\n\tauthor       = {Burr Settles and Mark Craven},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1070--1079}\n}\n@techreport{settles2009active,\n\ttitle        = {Active learning literature survey},\n\tauthor       = {Burr Settles},\n\tyear         = 2009,\n\tinstitution  = {University of Wisconsin, Madison}\n}\n@inproceedings{seung1992query,\n\ttitle        = {Query by committee},\n\tauthor       = {H Sebastian Seung and Manfred Opper and Haim Sompolinsky},\n\tyear         = 1992,\n\tbooktitle    = {Proceedings of the fifth annual workshop on computational learning theory}\n}\n@inproceedings{severyn2015learning,\n\ttitle        = {Learning to rank short text pairs with convolutional deep neural networks},\n\tauthor       = {Aliaksei Severyn and Alessandro Moschitti},\n\tyear         = 2015,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {373--382}\n}\n@article{seyyed2020chexclusion,\n\ttitle        = {CheXclusion: Fairness gaps in deep chest {X-ray} classifiers},\n\tauthor       = {Laleh Seyyed-Kalantari and Guanxiong Liu and Matthew 
McDermott and Marzyeh Ghassemi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.00827}\n}\n@inproceedings{sgs15,\n\ttitle        = {Training very deep networks},\n\tauthor       = {Srivastava, Rupesh K and Greff, Klaus and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 2015,\n\tbooktitle    = {Advances in neural information processing systems (NIPS)},\n\tpages        = {2377--2385}\n}\n@article{sha70,\n\ttitle        = {Conditioning of quasi-Newton methods for function minimization},\n\tauthor       = {Shanno, David F},\n\tyear         = 1970,\n\tjournal      = {Mathematics of computation},\n\tvolume       = 24,\n\tnumber       = 111,\n\tpages        = {647--656}\n}\n@article{shabat2020sample,\n\ttitle        = {Sample complexity of uniform convergence for multicalibration},\n\tauthor       = {Shabat, Eliran and Cohen, Lee and Mansour, Yishay},\n\tyear         = 2020,\n\tjournal      = {NeurIPS}\n}\n@inproceedings{shachter1989simulation,\n\ttitle        = {Simulation Approaches to General Probabilistic Inference on Belief Networks.},\n\tauthor       = {Shachter, Ross D. and Peot, Mark A.},\n\tyear         = 1989,\n\tbooktitle    = {UAI},\n\tpublisher    = {North-Holland},\n\tpages        = {221--234},\n\tisbn         = {0-444-88738-5},\n\turl          = {http://dblp.uni-trier.de/db/conf/uai/uai1989.html#ShachterP89},\n\teditor       = {Henrion, Max and Shachter, Ross D. and Kanal, Laveen N. 
and Lemmer, John F.},\n\tadded-at     = {2011-10-24T15:49:08.000+0200},\n\tbiburl       = {http://www.bibsonomy.org/bibtex/2b89fc81eadd940a390c16768fabcb335/djain},\n\tdescription  = {dblp},\n\tee           = {http://rome.exp.sis.pitt.edu/UAI/Abstract.asp?articleID=787&proceedingID=5},\n\tinterhash    = {f917a3b27de3d53cc4b432f0d9cba0e3},\n\tintrahash    = {b89fc81eadd940a390c16768fabcb335},\n\tkeywords     = {Bayesian directedModels inference sampling},\n\ttimestamp    = {2011-10-24T15:49:08.000+0200}\n}\n@article{shafahi2018poison,\n\ttitle        = {Poison {F}rogs! {Targeted} Clean-Label Poisoning Attacks on Neural Networks},\n\tauthor       = {Ali Shafahi and W Ronny Huang and Mahyar Najibi and Octavian Suciu and Christoph Studer and Tudor Dumitras and Tom Goldstein},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.00792}\n}\n@article{shafer2008tutorial,\n\ttitle        = {A tutorial on conformal prediction},\n\tauthor       = {Shafer, Glenn and Vovk, Vladimir},\n\tyear         = 2008,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 9,\n\tnumber       = {Mar},\n\tpages        = {371--421}\n}\n@inproceedings{shafieezadeh2015distributionally,\n\ttitle        = {Distributionally Robust Logistic Regression},\n\tauthor       = {Soroosh Shafieezadeh-Abadeh and Peyman Mohajerin Esfahani and Daniel Kuhn},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{Shah12,\n\ttitle        = {Linear System Identification via Atomic Norm Regularization.},\n\tauthor       = {Parikshit Shah and Badri Narayan Bhaskar and Gongguo Tang and Benjamin Recht},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 51st Conference on Decision and Control},\n\tdate-added   = {2016-04-02 18:40:54 +0000},\n\tdate-modified = {2016-04-02 18:40:54 +0000}\n}\n@inproceedings{shah2015approval,\n\ttitle        = {Approval Voting and Incentives in Crowdsourcing},\n\tauthor    
   = {Nihar Shah and Dengyong Zhou and Yuval Peres},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{shah2015double,\n\ttitle        = {Double or nothing: Multiplicative incentive mechanisms for crowdsourcing},\n\tauthor       = {Nihar B. Shah and Denny Zhou},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{shah2018minimum,\n\ttitle        = {Minimum norm solutions do not always generalize well for over-parameterized problems},\n\tauthor       = {Vatsal Shah and Anastasios Kyrillidis and Sujay Sanghavi},\n\tyear         = 2018,\n\tjournal      = {stat},\n\tvolume       = 1050\n}\n@article{shaham2015understanding,\n\ttitle        = {Understanding adversarial training: Increasing local stability of neural nets through robust optimization},\n\tauthor       = {Uri Shaham and Yutaro Yamada and Sahand Negahban},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.05432}\n}\n@inproceedings{shai2008unlabeled,\n\ttitle        = {Does Unlabeled Data Provably Help? 
Worst-case Analysis of the Sample Complexity of Semi-Supervised Learning},\n\tauthor       = {Shai Ben-David and Tyler Lu and David Pal},\n\tyear         = 2008,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@techreport{Shalev-Shwartz2007a,\n\ttitle        = {{Logarithmic regret algorithms for strongly convex repeated games}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Singer, Yoram},\n\tyear         = 2007,\n\tbooktitle    = {The Hebrew University, Technical \\ldots},\n\tpages        = {1--16},\n\tannote       = {Contains the detailed proof for the PEGASOS paper},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Shalev-Shwartz, Singer - 2007 - Logarithmic regret algorithms for strongly convex repeated games.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Regularized Optimization},\n\tinstitution  = {The Hebrew University}\n}\n@phdthesis{Shalev-Shwartz2007b,\n\ttitle        = {{Online learning: Theory, algorithms, and applications}},\n\tauthor       = {{Shalev-Shwartz}, Shai},\n\tyear         = 2007,\n\tnumber       = {July},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Shalev-Shwartz - 2007 - Online learning Theory, algorithms, and applications.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization,Optimization/General Theory},\n\tschool       = {Hebrew University}\n}\n@article{Shalev-Shwartz2011a,\n\ttitle        = {{Stochastic methods for l1-regularized loss minimization}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Tewari, Ambuj},\n\tyear         = 2011,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 12,\n\tpages        = {1865--1892},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Unknown - Unknown - No Title(3).pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@article{Shalev-Shwartz2013-SDCA,\n\ttitle 
       = {{Stochastic dual coordinate ascent methods for regularized loss minimization}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Zhang, Tong},\n\tyear         = 2013,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 14,\n\tnumber       = {Feb},\n\tpages        = {567--599},\n\turl          = {http://arxiv.org/abs/1209.1873},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Shalev-Shwartz, Zhang - 2013 - Stochastic dual coordinate ascent methods for regularized loss minimization.pdf:pdf},\n\tkeywords     = {computational complexity,ized loss minimization,logistic regression,optimization,regular-,ridge regression,stochastic dual coordinate ascent,support vector machines},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@inproceedings{Shalev-Shwartz2013a,\n\ttitle        = {{Accelerated Mini-Batch Stochastic Dual Coordinate Ascent}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Zhang, Tong},\n\tyear         = 2013,\n\tmonth        = may,\n\tbooktitle    = {NIPS},\n\tpages        = {1--17},\n\tabstract     = {Stochastic dual coordinate ascent (SDCA) is an effective technique for solving regularized loss minimization problems in machine learning. This paper considers an extension of SDCA under the mini-batch setting that is often used in practice. Our main contribution is to introduce an accelerated mini-batch version of SDCA and prove a fast convergence rate for this method. 
We discuss an implementation of our method over a parallel computing system, and compare the results to both the vanilla stochastic dual coordinate ascent and to the accelerated deterministic gradient descent method of $\\backslash$cite\\{nesterov2007gradient\\}.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1305.2581},\n\teprint       = {1305.2581},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Shalev-Shwartz, Zhang - 2013 - Accelerated Mini-Batch Stochastic Dual Coordinate Ascent.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Regularized Optimization}\n}\n@inproceedings{Shalev-Shwartz2013b,\n\ttitle        = {{Accelerated Proximal Stochastic Dual Coordinate Ascent for Regularized Loss Minimization}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Zhang, Tong},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 31st International Conference on Machine Learning},\n\tseries       = {ICML 2014},\n\tpages        = {64--72}\n}\n@inproceedings{Shalev-Shwartz2015-SDCAwithoutDual,\n\ttitle        = {{SDCA without Duality, Regularization, and Individual Convexity}},\n\tauthor       = {{Shalev-Shwartz}, Shai},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@article{Shalev-ShwartzZhang2014-ProxSDCA,\n\ttitle        = {{Proximal Stochastic Dual Coordinate Ascent}},\n\tauthor       = {{Shalev-Shwartz}, Shai and Zhang, Tong},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1211.2717},\n\tpages        = {1--18},\n\turl          = {http://arxiv.org/pdf/1211.2717v1.pdf},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1211.2717},\n\teprint       = {1211.2717},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Shalev-shwartz, Zhang - 2012 - Proximal Stochastic Dual Coordinate Ascent.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@phdthesis{shalev07online,\n\ttitle        = {Online Learning: Theory, Algorithms, and Applications},\n\tauthor       
= {Shai Shalev-Shwartz},\n\tyear         = 2007,\n\tschool       = {The Hebrew University of Jerusalem}\n}\n@article{shalev13stochastic,\n\ttitle        = {Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization},\n\tauthor       = {Shai Shalev-Shwartz and Tong Zhang},\n\tyear         = 2013,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 14,\n\tpages        = {567--599}\n}\n@article{shalev15dualfree,\n\ttitle        = {{SDCA} without Duality},\n\tauthor       = {Shai Shalev-Shwartz},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{shalev2011online,\n\ttitle        = {Online learning and online convex optimization},\n\tauthor       = {Shai Shalev-Shwartz},\n\tyear         = 2011,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = 4,\n\tnumber       = 2,\n\tpages        = {107--194}\n}\n@article{shalev2012online,\n\ttitle        = {Online learning and online convex optimization},\n\tauthor       = {Shalev-Shwartz, Shai and others},\n\tyear         = 2012,\n\tjournal      = {Foundations and Trends{\\textregistered} in Machine Learning},\n\tpublisher    = {Now Publishers, Inc.},\n\tvolume       = 4,\n\tnumber       = 2,\n\tpages        = {107--194},\n\tdoi          = {10.1561/2200000018},\n\tissn         = {1935-8237},\n\tmendeley-groups = {Optimization/Stochastic Online Regularized Optimization}\n}\n@article{shalev2014accelerated,\n\ttitle        = {Accelerated proximal stochastic dual coordinate ascent for regularized loss minimization},\n\tauthor       = {Shai Shalev-Shwartz and Tong Zhang},\n\tyear         = 2014,\n\tjournal      = {Mathematical Programming},\n\tpages        = {1--41}\n}\n@article{shalev2016safe,\n\ttitle        = {Safe, multi-agent, reinforcement learning for autonomous driving},\n\tauthor       = {Shalev-Shwartz, Shai and Shammah, Shaked and Shashua, Amnon},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint 
arXiv:1610.03295}\n}\n@inproceedings{shalev2017failures,\n\ttitle        = {Failures of Gradient-Based Deep Learning},\n\tauthor       = {Shalev-Shwartz, Shai and Shamir, Ohad and Shammah, Shaked},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {3067--3075}\n}\n@article{shalev2017weight,\n\ttitle        = {Weight Sharing is Crucial to Succesful Optimization},\n\tauthor       = {Shalev-Shwartz, Shai and Shamir, Ohad and Shammah, Shaked},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.00687}\n}\n@article{ShalevShwartzSS2011-zeroone,\n\ttitle        = {Learning kernel-based halfspaces with the 0-1 loss},\n\tauthor       = {Shalev-Shwartz, Shai and Shamir, Ohad and Sridharan, Karthik},\n\tyear         = 2011,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 40,\n\tnumber       = 6,\n\tpages        = {1623--1646}\n}\n@inproceedings{shalit2014coordinate,\n\ttitle        = {Coordinate-descent for learning orthogonal matrices through Givens rotations},\n\tauthor       = {Uri Shalit and Gal Chechik},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{shallue2018measuring,\n\ttitle        = {Measuring the effects of data parallelism on neural network training},\n\tauthor       = {Shallue, Christopher J and Lee, Jaehoon and Antognini, Joseph and Sohl-Dickstein, Jascha and Frostig, Roy and Dahl, George E},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.03600}\n}\n@article{shamir2013fundamental,\n\ttitle        = {Fundamental Limits of Online and Distributed Algorithms for Statistical Learning and Estimation},\n\tauthor       = {Ohad Shamir},\n\tyear         = 2013,\n\tjournal      = {arXiv}\n}\n@inproceedings{Shamir2015-1SVD,\n\ttitle        = {{A Stochastic PCA and SVD Algorithm with an Exponential Convergence Rate}},\n\tauthor       = {Shamir, Ohad},\n\tyear         = 
2015,\n\tbooktitle    = {ICML},\n\tpages        = {144--153}\n}\n@inproceedings{Shamir2015-kSVD,\n\ttitle        = {Fast Stochastic Algorithms for SVD and PCA: Convergence Properties and Convexity},\n\tauthor       = {Ohad Shamir},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@inproceedings{Shamir2016-onlinePCA,\n\ttitle        = {Convergence of stochastic gradient descent for PCA},\n\tauthor       = {Shamir, Ohad},\n\tyear         = 2016,\n\tbooktitle    = {ICML}\n}\n@article{shamir2018exponential,\n\ttitle        = {Exponential Convergence Time of Gradient Descent for One-Dimensional Deep Linear Neural Networks},\n\tauthor       = {Shamir, Ohad},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.08587}\n}\n@article{shamirPCA,\n\ttitle        = {Fast stochastic algorithms for svd and pca: Convergence properties and convexity},\n\tauthor       = {Shamir, Ohad},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1507.08788}\n}\n@inproceedings{ShamirZhang2013,\n\ttitle        = {{Stochastic Gradient Descent for Non-smooth Optimization: Convergence Results and Optimal Averaging Schemes}},\n\tauthor       = {Shamir, Ohad and Zhang, Tong},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 30th International Conference on Machine Learning - ICML '13},\n\tlocation     = {Atlanta, GA, USA},\n\tseries       = {ICML'13},\n\tvolume       = 28,\n\tpages        = {I-71--I-79},\n\turl          = {http://dl.acm.org/citation.cfm?id=3042817.3042827},\n\tabstract     = {Stochastic Gradient Descent (SGD) is one of the simplest and most popular stochastic optimization methods. While it has already been theoretically studied for decades, the classical analysis usually required non-trivial smoothness assumptions, which do not apply to many modern applications of SGD with non-smooth objective functions such as support vector machines. 
In this paper, we investigate the performance of SGD without such smoothness assumptions, as well as a running average scheme to convert the SGD iterates to a solution with optimal optimization accuracy. In this framework, we prove that after T rounds, the suboptimality of the last SGD iterate scales as O(log(T)/$\\backslash$sqrt\\{T\\}) for non-smooth convex objective functions, and O(log(T)/T) in the non-smooth strongly convex case. To the best of our knowledge, these are the first bounds of this kind, and almost match the minimax-optimal rates obtainable by appropriate averaging schemes. We also propose a new and simple averaging scheme, which not only attains optimal rates, but can also be easily computed on-the-fly (in contrast, the suffix averaging scheme proposed in Rakhlin et al. (2011) is not as simple to implement). Finally, we provide some experimental illustrations.},\n\tannote       = {This paper answers the open question of Shamir in COLT'12 about how to get a non-smooth algorithm whose last round is great, rather than avearging of the history. 
This paper also works for strongly-convex non-smooth functions.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1212.1824},\n\teprint       = {1212.1824},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Shamir, Zhang - 2013 - Stochastic Gradient Descent for Non-smooth Optimization Convergence Results and Optimal Averaging Schemes.pdf:pdf},\n\tmendeley-groups = {Optimization/Gradient Descent Theory},\n\t//publisher  = {JMLR.org},\n\tacmid        = 3042827\n}\n@article{shan04continuations,\n\ttitle        = {Delimited continuations in natural language},\n\tauthor       = {Chung-chieh Shan},\n\tyear         = 2004,\n\tjournal      = {arXiv}\n}\n@inproceedings{shang2015neural,\n\ttitle        = {Neural responding machine for short-text conversation},\n\tauthor       = {Lifeng Shang and Zhengdong Lu and Hang Li},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{shani2019adaptive,\n\ttitle        = {Adaptive trust region policy optimization: Global convergence and faster rates for regularized {MDP}s},\n\tauthor       = {Shani, Lior and Efroni, Yonathan and Mannor, Shie},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.02769}\n}\n@article{shankar2019image,\n\ttitle        = {Do Image Classifiers Generalize Across Time?},\n\tauthor       = {Vaishaal Shankar and Achal Dave and Rebecca Roelofs and Deva Ramanan and Benjamin Recht and Ludwig Schmidt},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.02168}\n}\n@inproceedings{shao2017generating,\n\ttitle        = {Generating High-Quality and Informative Conversation Responses with Sequence-to-Sequence Models},\n\tauthor       = {Louis Shao and Stephan Gouws and Denny Britz and Anna Goldie and Brian Strope and Ray Kurzweil},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2210--2219}\n}\n@inproceedings{shao2020concept2robot,\n\ttitle 
       = {Concept2Robot: Learning Manipulation Concepts from Instructions and Human Demonstrations},\n\tauthor       = {Lin Shao and Toki Migimatsu and Q. Zhang and Karen Yang and Jeannette Bohg},\n\tyear         = 2020,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{shao2020grac,\n\ttitle        = {GRAC: Self-Guided and Self-Regularized Actor-Critic},\n\tauthor       = {Shao, Lin and You, Yifan and Yan, Mengyuan and Sun, Qingyun and Bohg, Jeannette},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.08973}\n}\n@book{shapiro2014lectures,\n\ttitle        = {Lectures on stochastic programming: modeling and theory},\n\tauthor       = {Alexander Shapiro and Darinka Dentcheva and Andrzej Ruszczy{\\'n}ski},\n\tyear         = 2014,\n\tpublisher    = {SIAM}\n}\n@article{shapley1953stochastic,\n\ttitle        = {Stochastic games},\n\tauthor       = {Shapley, Lloyd S},\n\tyear         = 1953,\n\tjournal      = {Proceedings of the national academy of sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 39,\n\tnumber       = 10,\n\tpages        = {1095--1100}\n}\n@inproceedings{sharan2017overcomplete,\n\ttitle        = {Learning Overcomplete {HMM}s},\n\tauthor       = {Vatsal Sharan and Sham Kakade and Percy Liang and Gregory Valiant},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{sharan2018prediction,\n\ttitle        = {Prediction with a Short Memory},\n\tauthor       = {Vatsal Sharan and Sham Kakade and Percy Liang and Gregory Valiant},\n\tyear         = 2018,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@article{sharchilev2018finding,\n\ttitle        = {Finding influential training samples for gradient boosted decision trees},\n\tauthor       = {Boris Sharchilev and Yury Ustinovsky and Pavel Serdyukov and Maarten de Rijke},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1802.06640}\n}\n@inproceedings{sharif2016accessorize,\n\ttitle        = {Accessorize to a crime: Real and stealthy attacks on state-of-the-art face recognition},\n\tauthor       = {Mahmood Sharif and Sruti Bhagavatula and Lujo Bauer and Michael K. Reiter},\n\tyear         = 2016,\n\tbooktitle    = {ACM SIGSAC Conference on Computer and Communications Security},\n\tpages        = {1528--1540}\n}\n@article{shariff2020efficient,\n\ttitle        = {Efficient planning in large MDPs with weak linear function approximation},\n\tauthor       = {Shariff, Roshan and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.06184}\n}\n@inproceedings{sharma13algebraic,\n\ttitle        = {A Data Driven Approach for Algebraic Loop Invariants},\n\tauthor       = {Rahul Sharma and Saurabh Gupta and Bharath Hariharan and Alex Aiken and Percy Liang and Aditya V. Nori},\n\tyear         = 2013,\n\tbooktitle    = {European Symposium on Programming (ESOP)}\n}\n@inproceedings{sharma2014invariant,\n\ttitle        = {From invariant checking to invariant inference using randomized search},\n\tauthor       = {Rahul Sharma and Alex Aiken},\n\tyear         = 2014,\n\tbooktitle    = {Computer Aided Verification (CAV)},\n\tpages        = {88--105}\n}\n@inproceedings{sharma2015causal,\n\ttitle        = {Estimating the Causal Impact of Recommendation Systems from Observational Data},\n\tauthor       = {Amit Sharma and Jake Hofman and Duncan Watts},\n\tyear         = 2015,\n\tbooktitle    = {ACM Conference on Economics and Computation}\n}\n@inproceedings{sharma2018mime,\n\ttitle        = {Multiple Interactions Made Easy (MIME): Large Scale Demonstrations Data for Imitation},\n\tauthor       = {Pratyusha Sharma and L. Mohan and Lerrel Pinto and A. 
Gupta},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@article{sharoff2006corpora,\n\ttitle        = {Open-source Corpora: Using the net to fish for linguistic data},\n\tauthor       = {S. Sharoff},\n\tyear         = 2006,\n\tjournal      = {International Journal of Corpus Linguistics},\n\tvolume       = 11,\n\tpages        = {435--462}\n}\n@article{shaw2020teacher,\n\ttitle        = {Teacher-student chain for efficient semi-supervised histology image classification},\n\tauthor       = {Shayne Shaw and Maciej Pajak and Aneta Lisowska and Sotirios A Tsaftaris and Alison Q O'Neil},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.08797}\n}\n@book{shawe2004kernel,\n\ttitle        = {Kernel methods for pattern analysis},\n\tauthor       = {Shawe-Taylor, John and Cristianini, Nello},\n\tyear         = 2004,\n\tpublisher    = {Cambridge university press}\n}\n@book{shawetaylor04kernel,\n\ttitle        = {Kernel Methods for Pattern Analysis},\n\tauthor       = {John Shawe-Taylor and Nello Cristianini},\n\tyear         = 2004,\n\tpublisher    = {Cambridge University Press}\n}\n@inproceedings{shen2004discriminative,\n\ttitle        = {Discriminative reranking for machine translation},\n\tauthor       = {Libin Shen and Anoop Sarkar and Franz Josef Och},\n\tyear         = 2004,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {177--184}\n}\n@inproceedings{shen2006exploring,\n\ttitle        = {Exploring correlation of dependency relation paths for answer extraction},\n\tauthor       = {Dan Shen and Dietrich Klakow},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {889--896}\n}\n@article{shen2014risk,\n\ttitle        = {Risk-sensitive reinforcement learning},\n\tauthor       = {Shen, Yun and Tobia, Michael J and Sommer, Tobias and Obermayer, 
Klaus},\n\tyear         = 2014,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 26,\n\tnumber       = 7,\n\tpages        = {1298--1328}\n}\n@inproceedings{shen2014webpage,\n\ttitle        = {Webpage saliency},\n\tauthor       = {Chengyao Shen and Qi Zhao},\n\tyear         = 2014,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@article{shen2015minimum,\n\ttitle        = {Minimum risk training for neural machine translation},\n\tauthor       = {Shiqi Shen and Yong Cheng and Zhongjun He and Wei He and Hua Wu and Maosong Sun and Yang Liu},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1512.02433}\n}\n@inproceedings{shen2016relay,\n\ttitle        = {Relay backpropagation for effective learning of deep convolutional neural networks},\n\tauthor       = {Li Shen and Zhouchen Lin and Qingming Huang},\n\tyear         = 2016,\n\tbooktitle    = {European Conference on Computer Vision},\n\tpages        = {467--482}\n}\n@inproceedings{shen2017deep,\n\ttitle        = {Deep active learning for named entity recognition},\n\tauthor       = {Yanyao Shen and Hyokun Yun and Zachary C Lipton and Yakov Kronrod and Animashree Anandkumar},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the Second Workshop on Representation Learning for NLP (Repl4NLP)}\n}\n@inproceedings{shen2017inter,\n\ttitle        = {Inter-Weighted Alignment Network for Sentence Pair Modeling},\n\tauthor       = {Gehui Shen and Yunlun Yang and Zhi-Hong Deng},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{shen2017reasonet,\n\ttitle        = {{ReasoNet}: Learning to Stop Reading in Machine Comprehension},\n\tauthor       = {Yelong Shen and Po-Sen Huang and Jianfeng Gao and Weizhu Chen},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@inproceedings{shen2017style,\n\ttitle        = {Style 
Transfer from Non-Parallel Text by Cross-Alignment},\n\tauthor       = {Tianxiao Shen and Tao Lei and Regina Barzilay and Tommi Jaakkola},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{shen2018nash,\n\ttitle        = {NASH: Toward End-to-End Neural Architecture for Generative Semantic Hashing},\n\tauthor       = {Dinghan Shen and Qinliang Su and Paidamoyo Chapfuwa and Wenlin Wang and Guoyin Wang and Ricardo Henao and Lawrence Carin},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {2041--2050}\n}\n@inproceedings{shen2018WassersteinDG,\n\ttitle        = {Wasserstein Distance Guided Representation Learning for Domain Adaptation},\n\tauthor       = {Jian Shen and Yanru Qu and Weinan Zhang and Yong Yu},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{Sherman09,\n\ttitle        = {Breaking the Multicommodity Flow Barrier for $O(\\sqrt{\\log n})$-Approximations to Sparsest Cut},\n\tauthor       = {Sherman, Jonah},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 50th Annual IEEE Symposium on Foundations of Computer Science},\n\tseries       = {FOCS '09},\n\tpages        = {363--372},\n\tnumpages     = 10\n}\n@article{sherman1950adjustment,\n\ttitle        = {Adjustment of an inverse matrix corresponding to a change in one element of a given matrix},\n\tauthor       = {Jack Sherman and Winifred J Morrison},\n\tyear         = 1950,\n\tjournal      = {The Annals of Mathematical Statistics},\n\tvolume       = 21,\n\tnumber       = 1,\n\tpages        = {124--127}\n}\n@inproceedings{Sherman2013,\n\ttitle        = {{Nearly Maximum Flows in Nearly Linear Time}},\n\tauthor       = {Sherman, Jonah},\n\tyear         = 2013,\n\tmonth        = oct,\n\tbooktitle    = {2013 IEEE 54th Annual Symposium on Foundations of Computer Science},\n\tpublisher    = 
{IEEE},\n\tpages        = {263--269},\n\tdoi          = {10.1109/FOCS.2013.36},\n\tisbn         = {978-0-7695-5135-7},\n\tmendeley-groups = {Algorithms/Maxflow}\n}\n@article{shermis2014state,\n\ttitle        = {State-of-the-art automated essay scoring: Competition, results, and future directions from a United States demonstration},\n\tauthor       = {Mark D Shermis},\n\tyear         = 2014,\n\tjournal      = {Assessing Writing},\n\tvolume       = 20,\n\tpages        = {53--76}\n}\n@article{shi2000normalized,\n\ttitle        = {Normalized cuts and image segmentation},\n\tauthor       = {Shi, Jianbo and Malik, Jitendra},\n\tyear         = 2000,\n\tjournal      = {IEEE Transactions on pattern analysis and machine intelligence},\n\tpublisher    = {Ieee},\n\tvolume       = 22,\n\tnumber       = 8,\n\tpages        = {888--905}\n}\n@inproceedings{shi2015sample,\n\ttitle        = {Learning Where To Sample in Structured Prediction},\n\tauthor       = {Tianlin Shi and Jacob Steinhardt and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {875--884}\n}\n@inproceedings{shi2017wob,\n\ttitle        = {World of Bits: An Open-Domain Platform for Web-Based Agents},\n\tauthor       = {Tianlin Shi and Andrej Karpathy and Linxi Fan and Jonathan Hernandez and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{shi2018understanding,\n\ttitle        = {Understanding the acceleration phenomenon via high-resolution differential equations},\n\tauthor       = {Shi, Bin and Du, Simon S and Jordan, Michael I and Su, Weijie J},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.08907}\n}\n@inproceedings{shi2019acceleration,\n\ttitle        = {Acceleration via Symplectic Discretization of High-Resolution Differential Equations},\n\tauthor       = {Shi, Bin and Du, Simon S and Su, Weijie and Jordan, Michael I},\n\tyear         = 
2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 32,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2019/file/a9986cb066812f440bc2bb6e3c13696c-Paper.pdf},\n\teditor       = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett}\n}\n@inproceedings{shi2019frangel,\n\ttitle        = {{F}r{A}ngel: Component-Based Synthesis with Control Structures},\n\tauthor       = {Kensen Shi and Jacob Steinhardt and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@inproceedings{shi2020robustness,\n\ttitle        = {Robustness Verification for Transformers},\n\tauthor       = {Zhouxing Shi and Huan Zhang and Kai-Wei Chang and Minlie Huang and Cho-Jui Hsieh},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@incollection{shibata89model,\n\ttitle        = {Statistical Aspects of Model Selection},\n\tauthor       = {R. 
Shibata},\n\tyear         = 1989,\n\tbooktitle    = {From Data to Model},\n\tpages        = {215--240}\n}\n@article{shieber2016principles,\n\ttitle        = {Principles for Designing an {AI} Competition, or Why the {T}uring Test Fails as an Inducement Prize},\n\tauthor       = {Stuart Shieber},\n\tyear         = 2016,\n\tjournal      = {{AI} Magazine},\n\tvolume       = 37,\n\tnumber       = 1\n}\n@inproceedings{shieh2008isax,\n\ttitle        = {iSAX: indexing and mining terabyte sized time series},\n\tauthor       = {Shieh, Jin and Keogh, Eamonn},\n\tyear         = 2008,\n\tbooktitle    = {\n\t\tProceeding of the 14th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tlocation     = {Las Vegas, Nevada, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '08},\n\tpages        = {623--631},\n\tdoi          = {http://doi.acm.org/10.1145/1401890.1401966},\n\tisbn         = {978-1-60558-193-4},\n\tacmid        = 1401966,\n\tkeywords     = {data mining, indexing, representations, time series},\n\tnumpages     = 9\n}\n@article{shih2019xl,\n\ttitle        = {{XL}-{E}ditor: Post-editing Sentences with XLNet},\n\tauthor       = {Yong-Siang Shih and Wei-Cheng Chang and Yiming Yang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.10479}\n}\n@inproceedings{shih2021critical,\n\ttitle        = {On the Critical Role of Conventions in Adaptive Human-{AI} Collaboration},\n\tauthor       = {Andy Shih and Arjun Sawhney and Jovana Kondic and Stefano Ermon and Dorsa Sadigh},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{shima2011diversity,\n\ttitle        = {Diversity-aware Evaluation for Paraphrase Patterns},\n\tauthor       = {Hideki Shima and Teruko Mitamura},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{shimodaira2000improving,\n\ttitle  
      = {Improving predictive inference under covariate shift by weighting the log-likelihood function},\n\tauthor       = {Shimodaira, Hidetoshi},\n\tyear         = 2000,\n\tjournal      = {Journal of statistical planning and inference},\n\tpublisher    = {Elsevier},\n\tvolume       = 90,\n\tnumber       = 2,\n\tpages        = {227--244}\n}\n@article{shin2001computer,\n\ttitle        = {Computer puppetry: An importance-based approach},\n\tauthor       = {Hyun Joon Shin and Jehee Lee and Sung Yong Shin and Michael Gleicher},\n\tyear         = 2001,\n\tjournal      = {ACM Trans. Graph.},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 20,\n\tnumber       = 2,\n\tpages        = {67--94},\n\tdoi          = {http://doi.acm.org/10.1145/502122.502123},\n\tissn         = {0730-0301}\n}\n@inproceedings{shin2015incremental,\n\ttitle        = {Incremental knowledge base construction using {DeepDive}},\n\tauthor       = {Jaeho Shin and Sen Wu and Feiran Wang and Christopher De Sa and Ce Zhang and Christopher R\\'{e}},\n\tyear         = 2015,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tnumber       = 11,\n\tpages        = {1310--1321}\n}\n@article{shin2020autoprompt,\n\ttitle        = {AutoPrompt: Eliciting knowledge from language models with automatically generated prompts},\n\tauthor       = {Shin, Taylor and Razeghi, Yasaman and Logan IV, Robert L and Wallace, Eric and Singh, Sameer},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.15980}\n}\n@inproceedings{shirakawan2015ngramidf,\n\ttitle        = {{N}-gram IDF: A Global Term Weighting Scheme Based on Information Distance},\n\tauthor       = {Masumi Shirakawa and Takahiro Hara and Shojiro Nishio},\n\tyear         = 2015,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {960--970}\n}\n@inproceedings{shivakumar2002modeling,\n\ttitle        = {Modeling the effect of technology trends on the soft error rate of combinational logic},\n\tauthor       = 
{Premkishore Shivakumar and Michael Kistler and Stephen W Keckler and Doug Burger and Lorenzo Alvisi},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Dependable Systems and Networks (DSN)},\n\tpages        = {389--398}\n}\n@inproceedings{shivaswamy2010empirical,\n\ttitle        = {Empirical {B}ernstein boosting},\n\tauthor       = {Pannagadatta Shivaswamy and Tony Jebara},\n\tyear         = 2010,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {733--740}\n}\n@article{shively2009bayesian,\n\ttitle        = {A Bayesian approach to non-parametric monotone function estimation},\n\tauthor       = {Thomas S Shively and Thomas W Sager and Stephen G Walker},\n\tyear         = 2009,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tvolume       = 71,\n\tnumber       = 1,\n\tpages        = {159--175}\n}\n@misc{shnayder2016strong,\n\ttitle        = {Strong Truthfulness in Multi-Task Peer Prediction},\n\tauthor       = {Victor Shnayder and Rafael Frongillo and Arpit Agarwal and David C. Parkes},\n\tyear         = 2016\n}\n@book{shoham2008multiagent,\n\ttitle        = {Multiagent systems: Algorithmic, game-theoretic, and logical foundations},\n\tauthor       = {Yoav Shoham and Kevin Leyton-Brown},\n\tyear         = 2008,\n\tpublisher    = {Cambridge University Press}\n}\n@article{SHOPM,\n\ttitle        = {On the Best rank-1 and Rank-$({R}_1, {R}_2, ..., {R}_N)$ Approximation and Applications of Higher-Order Tensors},\n\tauthor       = {L. De Lathauwer and B. De Moor and J. Vandewalle},\n\tyear         = 2000,\n\tjournal      = {SIAM J. Matrix Anal. Appl.},\n\tvolume       = 21,\n\tnumber       = 4,\n\tpages        = {1324--1342}\n}\n@inproceedings{shotton06textonboost,\n\ttitle        = {TextonBoost: Joint Appearance, Shape and Context Modeling for Multi-Class Object Recognition and Segmentation},\n\tauthor       = {J. Shotton and J. Winn and C. Rother and A. 
Criminisi},\n\tyear         = 2006,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)}\n}\n@inproceedings{shridhar2020alfred,\n\ttitle        = {ALFRED: A Benchmark for Interpreting Grounded Instructions for Everyday Tasks},\n\tauthor       = {Mohit Shridhar and Jesse Thomason and Daniel Gordon and Yonatan Bisk and Winson Han and Roozbeh Mottaghi and Luke Zettlemoyer and Dieter Fox},\n\tyear         = 2020,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{shrikumar2017learning,\n\ttitle        = {Learning Important Features Through Propagating Activation Differences},\n\tauthor       = {Avanti Shrikumar and Peyton Greenside and Anshul Kundaje},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{shu2018dirtt,\n\ttitle        = {A {DIRT}-{T} Approach to Unsupervised Domain Adaptation},\n\tauthor       = {Rui Shu and Hung H. Bui and Hirokazu Narui and Stefano Ermon},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{shu2019meta,\n\ttitle        = {{Meta-Weight-Net}: Learning an explicit mapping for sample weighting},\n\tauthor       = {Jun Shu and Qi Xie and Lixuan Yi and Qian Zhao and Sanping Zhou and Zongben Xu and Deyu Meng},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1919--1930}\n}\n@article{shulaker2013carbon,\n\ttitle        = {Carbon nanotube computer},\n\tauthor       = {Max M Shulaker and Gage Hills and Nishant Patil and Hai Wei and Hong-Yu Chen and H-S Philip Wong and Subhasish Mitra},\n\tyear         = 2013,\n\tjournal      = {Nature},\n\tvolume       = 501,\n\tnumber       = 7468,\n\tpages        = {526--530}\n}\n@article{shumway1982approach,\n\ttitle        = {\n\t\tAn approach to time series smoothing and forecasting using the EM\n\n\t\talgorithm\n\t},\n\tauthor       = {Shumway, R. H. 
and Stoffer, D. S.},\n\tyear         = 1982,\n\tjournal      = {Journal of Time Series Analysis},\n\tvolume       = 3,\n\tpages        = {253--264},\n\tciteulike-article-id = 2322861,\n\tkeywords     = {algorithm, bibtex-import, em, filtermaximum, kalman, likelihood},\n\tlocal-url    = {file://localhost/Users/paulfrogerais/travail/lecture/articles/algo\\%20EM/em\\_Shumway.pdf},\n\tposted-at    = {2008-02-02 12:11:13},\n\tpriority     = 2\n}\n@article{shuster2018engaging,\n\ttitle        = {Engaging Image Captioning Via Personality},\n\tauthor       = {Kurt Shuster and Samuel Humeau and Hexiang Hu and Antoine Bordes and Jason Weston},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.10665}\n}\n@article{shuster2018imagechat,\n\ttitle        = {Engaging Image Chat: Modeling Personality in Grounded Dialogue},\n\tauthor       = {Kurt Shuster and Samuel Humeau and Antoine Bordes and Jason Weston},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.00945}\n}\n@article{shuster2020deploying,\n\ttitle        = {Deploying Lifelong Open-Domain Dialogue Learning},\n\tauthor       = {Kurt Shuster and Jack Urbanek and Emily Dinan and Arthur Szlam and J. 
Weston},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2008.08076}\n}\n@inproceedings{siddhant2018deep,\n\ttitle        = {Deep {B}ayesian active learning for natural language processing: Results of a large-scale empirical study},\n\tauthor       = {Aditya Siddhant and Zachary C Lipton},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{sidford2018near,\n\ttitle        = {Near-optimal time and sample complexities for solving Markov decision processes with a generative model},\n\tauthor       = {Sidford, Aaron and Wang, Mengdi and Wu, Xian and Yang, Lin F and Ye, Yinyu},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Neural Information Processing Systems},\n\tpages        = {5192--5202}\n}\n@inproceedings{sidford2018variance,\n\ttitle        = {Variance reduced value iteration and faster algorithms for solving markov decision processes},\n\tauthor       = {Sidford, Aaron and Wang, Mengdi and Wu, Xian and Ye, Yinyu},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the Twenty-Ninth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tpages        = {770--787},\n\torganization = {SIAM}\n}\n@article{sidi2003zero,\n\ttitle        = {A zero-cost preconditioning for a class of indefinite linear systems},\n\tauthor       = {Sidi, AVRAM},\n\tyear         = 2003,\n\tjournal      = {WSEAS Trans. 
Math},\n\tvolume       = 2,\n\tpages        = {142--150}\n}\n@book{siefkes2005incremental,\n\ttitle        = {Incremental information extraction using tree-based context representations},\n\tauthor       = {Christian Siefkes},\n\tyear         = 2005,\n\tpublisher    = {Computational Linguistics and Intelligent Text Processing Springer},\n\tpages        = {510--521}\n}\n@article{siegelmann1995computational,\n\ttitle        = {On the computational power of neural nets},\n\tauthor       = {Siegelmann, Hava T and Sontag, Eduardo D},\n\tyear         = 1995,\n\tjournal      = {Journal of computer and system sciences},\n\tpublisher    = {Elsevier},\n\tvolume       = 50,\n\tnumber       = 1,\n\tpages        = {132--150}\n}\n@article{siepel04phylohmm,\n\ttitle        = {Combining phylogenetic and hidden {M}arkov models in biosequence analysis},\n\tauthor       = {Adam Siepel and David Haussler},\n\tyear         = 2004,\n\tjournal      = {Journal of Computational Biology},\n\tvolume       = 11,\n\tpages        = {413--428}\n}\n@techreport{Silva2011,\n\ttitle        = {{Sparse Sums of Positive Semidefinite Matrices}},\n\tauthor       = {{\\noopsort{Carli Silva}}de {Carli Silva}, Marcel K. and Harvey, Nicholas J. A. and Sato, Cristiane M.},\n\tyear         = 2011,\n\tmonth        = jul,\n\tabstract     = {Recently there has been much interest in \"sparsifying\" sums of rank one matrices: modifying the coefficients such that only a few are nonzero, while approximately preserving the matrix that results from the sum. Results of this sort have found applications in many different areas, including sparsifying graphs. In this paper we consider the more general problem of sparsifying sums of positive semidefinite matrices that have arbitrary rank. We give several algorithms for solving this problem. The first algorithm is based on the method of Batson, Spielman and Srivastava (2009). 
The second algorithm is based on the matrix multiplicative weights update method of Arora and Kale (2007). We also highlight an interesting connection between these two algorithms. Our algorithms have numerous applications. We show how they can be used to construct graph sparsifiers with auxiliary constraints, sparsifiers of hypergraphs, and sparse solutions to semidefinite programs.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1107.0088},\n\teprint       = {1107.0088},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Silva, Harvey, Sato - 2011 - Sparse Sums of Positive Semidefinite Matrices(2).pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@inproceedings{silver2013lifelong,\n\ttitle        = {Lifelong Machine Learning Systems: Beyond Learning Algorithms},\n\tauthor       = {Daniel L Silver and Qiang Yang and Lianghao Li},\n\tyear         = 2013,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 13\n}\n@inproceedings{silver2014deterministic,\n\ttitle        = {Deterministic policy gradient algorithms},\n\tauthor       = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},\n\tyear         = 2014,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {387--395},\n\torganization = {PMLR}\n}\n@article{silver2016mastering,\n\ttitle        = {Mastering the game of Go with deep neural networks and tree search},\n\tauthor       = {D. Silver and A. Huang and C. J. Maddison and A. Guez and L. Sifre and G. Van Den Driessche and J. Schrittwieser and I. Antonoglou and V. Panneershelvam and M. 
Lanctot and others},\n\tyear         = 2016,\n\tjournal      = {Nature},\n\tvolume       = 529,\n\tnumber       = 7587,\n\tpages        = {484--489}\n}\n@article{silver2017mastering,\n\ttitle        = {Mastering the game of {Go} with deep neural networks and tree search},\n\tauthor       = {David Silver and Aja Huang and Christopher J. Maddison and Arthur Guez and Laurent Sifre and George van den Driessche and Julian Schrittwieser and Ioannis Antonoglou and Veda Panneershelvam and Marc Lanctot and Sander Dieleman and Dominik Grewe and John Nham and Nal Kalchbrenner and Ilya Sutskever and Timothy Lillicrap and Madeleine Leach and Koray Kavukcuoglu and Thore Graepel and Demis Hassabis},\n\tyear         = 2016,\n\tjournal      = {Nature},\n\tvolume       = 529,\n\tnumber       = 7587,\n\tpages        = {484--489}\n}\n@inproceedings{silver2017predictron,\n\ttitle        = {The predictron: End-to-end learning and planning},\n\tauthor       = {Silver, David and Hasselt, Hado and Hessel, Matteo and Schaul, Tom and Guez, Arthur and Harley, Tim and Dulac-Arnold, Gabriel and Reichert, David and Rabinowitz, Neil and Barreto, Andre and Degris, Thomas},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {3191--3199},\n\torganization = {PMLR}\n}\n@inproceedings{silvio-security13,\n\ttitle        = {The evolution of sybil defense via social networks},\n\tauthor       = {L. Alvisi and A. Clement and A. Epasto and S. Lattanzi and A. 
Panconesi},\n\tyear         = 2013,\n\tbooktitle    = {IEEE Symposium on Security and Privacy}\n}\n@book{simard1998transformation,\n\ttitle        = {Transformation Invariance in Pattern Recognition---Tangent Distance and Tangent Propagation},\n\tauthor       = {Patrice Y Simard and Yann A LeCun and John S Denker and Bernard Victorri},\n\tyear         = 1998,\n\tpublisher    = {Neural networks: Tricks of the trade Springer},\n\tpages        = {239--274}\n}\n@article{simard2003best,\n\ttitle        = {Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis},\n\tauthor       = {Patrice Y. Simard and Dave Steinkraus and John C. Platt},\n\tyear         = 2003,\n\tjournal      = {International Conference on Document Analysis and Recognition},\n\tvolume       = 2,\n\tpages        = {958--964}\n}\n@article{SIMAX-080148-Tensor-Eigenvalues,\n\ttitle        = {Shifted Power Method for Computing Tensor Eigenpairs},\n\tauthor       = {T. G. Kolda and J. R. Mayo},\n\tyear         = 2011,\n\tmonth        = oct,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 32,\n\tnumber       = 4,\n\tpages        = {1095--1124}\n}\n@article{simchi2020bypassing,\n\ttitle        = {Bypassing the Monster: A Faster and Simpler Optimal Algorithm for Contextual Bandits under Realizability},\n\tauthor       = {Simchi-Levi, David and Xu, Yunzong},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.12699}\n}\n@article{simchowitz2017gap,\n\ttitle        = {On the Gap Between Strict-Saddles and True Convexity: An Omega (log d) Lower Bound for Eigenvector Approximation},\n\tauthor       = {Simchowitz, Max and Alaoui, Ahmed El and Recht, Benjamin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.04548}\n}\n@inproceedings{simchowitz2019non,\n\ttitle        = {Non-asymptotic gap-dependent regret bounds for tabular {MDPs}},\n\tauthor       = {Simchowitz, Max and Jamieson, Kevin G},\n\tyear         = 
2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1153--1162}\n}\n@article{simeoni2019rethinking,\n\ttitle        = {Rethinking deep active learning: Using unlabeled data at model training},\n\tauthor       = {Oriane Sim{\\'e}oni and Mateusz Budnik and Yannis Avrithis and Guillaume Gravier},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.08177}\n}\n@article{simmons2019q,\n\ttitle        = {Q-learning for continuous actions with cross-entropy guided policies},\n\tauthor       = {Simmons-Edler, Riley and Eisner, Ben and Mitchell, Eric and Seung, Sebastian and Lee, Daniel},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1903.10605}\n}\n@article{simoiu2017problem,\n\ttitle        = {The problem of infra-marginality in outcome tests for discrimination},\n\tauthor       = {Camelia Simoiu and Sam Corbett-Davies and Sharad Goel and others},\n\tyear         = 2017,\n\tjournal      = {The Annals of Applied Statistics},\n\tvolume       = 11,\n\tnumber       = 3,\n\tpages        = {1193--1216}\n}\n@article{simonyan2013deep,\n\ttitle        = {Deep inside convolutional networks: Visualising image classification models and saliency maps},\n\tauthor       = {Karen Simonyan and Andrea Vedaldi and Andrew Zisserman},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1312.6034}\n}\n@article{simonyan2014very,\n\ttitle        = {Very deep convolutional networks for large-scale image recognition},\n\tauthor       = {Simonyan, Karen and Zisserman, Andrew},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1409.1556},\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{SimulatedAnnealing1953,\n\ttitle        = {Equation of state calculations by fast computing machines},\n\tauthor       = {Metropolis, Nicholas and Rosenbluth, Arianna W. and Rosenbluth, Marshall N. and Teller, Augusta H. 
and Teller, Edward},\n\tyear         = 1953,\n\tjournal      = {The journal of chemical physics},\n\tpublisher    = {AIP Publishing},\n\tvolume       = 21,\n\tnumber       = 6,\n\tpages        = {1087--1092}\n}\n@article{SinclairJerrum89,\n\ttitle        = {Approximate Counting, Uniform Generation and Rapidly Mixing Markov Chains},\n\tauthor       = {Alistair Sinclair and Mark Jerrum},\n\tyear         = 1989,\n\tjournal      = {Information and Computation},\n\tvolume       = 82,\n\tnumber       = 1,\n\tpages        = {93--133},\n\tee           = {http://dx.doi.org/10.1016/0890-5401(89)90067-9},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@article{singer2011angular,\n\ttitle        = {Angular synchronization by eigenvectors and semidefinite programming},\n\tauthor       = {Singer, Amit},\n\tyear         = 2011,\n\tjournal      = {Applied and computational harmonic analysis},\n\tpublisher    = {Elsevier},\n\tvolume       = 30,\n\tnumber       = 1,\n\tpages        = {20--36}\n}\n@inproceedings{singh1992reinforcement,\n\ttitle        = {Reinforcement learning with a hierarchy of abstract models},\n\tauthor       = {Singh, Satinder P},\n\tyear         = 1992,\n\tbooktitle    = {Proceedings of the National Conference on Artificial Intelligence},\n\tnumber       = 10,\n\tpages        = 202,\n\torganization = {Citeseer}\n}\n@article{singh1992transfer,\n\ttitle        = {Transfer of learning by composing solutions of elemental sequential tasks},\n\tauthor       = {Singh, Satinder Pal},\n\tyear         = 1992,\n\tjournal      = {Machine Learning},\n\tpublisher    = {Springer},\n\tvolume       = 8,\n\tnumber       = {3-4},\n\tpages        = {323--339}\n}\n@article{singh1994upper,\n\ttitle        = {An upper bound on the loss from approximate optimal-value functions},\n\tauthor       = {Singh, Satinder P and Yee, Richard C},\n\tyear         = 1994,\n\tjournal      = {Machine Learning},\n\tpublisher    = {Springer},\n\tvolume       = 16,\n\tnumber       = 3,\n\tpages 
       = {227--233}\n}\n@inproceedings{singh1995reinforcement,\n\ttitle        = {Reinforcement learning with soft state aggregation},\n\tauthor       = {Singh, Satinder P and Jaakkola, Tommi and Jordan, Michael I},\n\tyear         = 1995,\n\tjournal      = {Advances in neural information processing systems},\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {361--368}\n}\n@article{singh1996reinforcement,\n\ttitle        = {Reinforcement learning with replacing eligibility traces},\n\tauthor       = {Singh, Satinder P and Sutton, Richard S},\n\tyear         = 1996,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 22,\n\tnumber       = {1-3},\n\tpages        = {123--158}\n}\n@inproceedings{singh2004predictive,\n\ttitle        = {Predictive state representations: a new theory for modeling dynamical systems},\n\tauthor       = {Singh, Satinder and James, Michael R and Rudary, Matthew R},\n\tyear         = 2004,\n\tbooktitle    = {Conference on Uncertainty in Artificial Intelligence}\n}\n@inproceedings{singh2008unlabeled,\n\ttitle        = {Unlabeled data: Now it helps, now it doesn't},\n\tauthor       = {Aarti Singh and Robert Nowak and Jerry Zhu},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{singh2011large,\n\ttitle        = {Large-scale cross-document coreference using distributed inference and hierarchical models},\n\tauthor       = {Sameer Singh and Amarnag Subramanya and Fernando Pereira and Andrew McCallum},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {793--803}\n}\n@inproceedings{singh2015tensor,\n\ttitle        = {Towards Combined Matrix and Tensor Factorization for Universal Schema Relation Extraction},\n\tauthor       = {Sameer Singh and Tim Rockt{\\\"{a}}schel and Sebastian Riedel},\n\tyear         = 2015,\n\tbooktitle    = {NAACL Workshop 
on Vector Space Modeling for NLP}\n}\n@inproceedings{singh2016efficient,\n\ttitle        = {Efficient Nonparametric Smoothness Estimation},\n\tauthor       = {Singh, Shashank and Du, Simon S and Poczos, Barnabas},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 29,\n\tpages        = {},\n\turl          = {https://proceedings.neurips.cc/paper/2016/file/acc3e0404646c57502b480dc052c4fe1-Paper.pdf},\n\teditor       = {D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett}\n}\n@inproceedings{singh2018nonparametric,\n\ttitle        = {Nonparametric Density Estimation under Adversarial Losses},\n\tauthor       = {Shashank Singh and Ananya Uppal and Boyue Li and Chun-Liang Li and Manzil Zaheer and Barnabas Poczos},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {246--257}\n}\n@inproceedings{singh2019endtoend,\n\ttitle        = {End-to-End Robotic Reinforcement Learning without Reward Engineering},\n\tauthor       = {Avi Singh and Larry Yang and Kristian Hartikainen and Chelsea Finn and Sergey Levine},\n\tyear         = 2019,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{singh2019fair,\n\ttitle        = {Fair Predictors under Distribution Shift},\n\tauthor       = {Harvineet Singh and Rina Singh and Vishwali Mhasawade and Rumi Chunara},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{singla2014near,\n\ttitle        = {Near-Optimally Teaching the Crowd to Classify},\n\tauthor       = {Adish Singla and Ilija Bogunovic and G{\\'a}bor Bart{\\'o}k and Amin Karbasi and Andreas Krause},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1402.2092}\n}\n@inproceedings{sinha2018certifiable,\n\ttitle        = {Certifiable Distributional Robustness with Principled Adversarial 
Training},\n\tauthor       = {Aman Sinha and Hongseok Namkoong and John Duchi},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{sinha2021masked,\n\ttitle        = {Masked language modeling and the distributional hypothesis: Order word matters pre-training for little},\n\tauthor       = {Sinha, Koustuv and Jia, Robin and Hupkes, Dieuwke and Pineau, Joelle and Williams, Adina and Kiela, Douwe},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.06644}\n}\n@article{sion1958general,\n\ttitle        = {On general minimax theorems.},\n\tauthor       = {Sion, Maurice},\n\tyear         = 1958,\n\tjournal      = {Pacific Journal of Mathematics},\n\tpublisher    = {Pacific Journal of Mathematics},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {171--176}\n}\n@article{sion1958minimax,\n\ttitle        = {On general minimax theorems},\n\tauthor       = {Maurice Sion},\n\tyear         = 1958,\n\tjournal      = {Pacific journal of mathematics},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {171--176}\n}\n@book{sipser13introduction,\n\ttitle        = {Introduction to the Theory of Computation},\n\tauthor       = {Sipser, Michael},\n\tyear         = 2013,\n\tpublisher    = {Course Technology},\n\taddress      = {Boston, MA},\n\tisbn         = {113318779X},\n\tadded-at     = {2014-03-03T20:31:26.000+0100},\n\tbiburl       = {https://www.bibsonomy.org/bibtex/2a275d239d3a005a2a0825e49ce8dced5/ytyoun},\n\tedition      = {Third},\n\tinterhash    = {ba5fd05e9f15a677c2c9e619c57de9a7},\n\tintrahash    = {a275d239d3a005a2a0825e49ce8dced5},\n\tkeywords     = {automata complexity computation hamiltonian np-hardness sipser textbook},\n\trefid        = 814441519,\n\ttimestamp    = {2016-12-04T08:23:19.000+0100}\n}\n@article{sirignano2018mean,\n\ttitle        = {Mean Field Analysis of Neural Networks},\n\tauthor       = {Sirignano, Justin and Spiliopoulos, Konstantinos},\n\tyear      
   = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.01053}\n}\n@article{siskind96cross,\n\ttitle        = {A computational study of cross-situational techniques for learning word-to-meaning mappings},\n\tauthor       = {J. M. Siskind},\n\tyear         = 1996,\n\tjournal      = {Cognition},\n\tvolume       = 61,\n\tpages        = {1--38}\n}\n@article{sittler1964optimal,\n\ttitle        = {An Optimal Data Association Problem in Surveillance Theory},\n\tauthor       = {Sittler, Robert W.},\n\tyear         = 1964,\n\tmonth        = apr,\n\tjournal      = {Military Electronics, IEEE Transactions on},\n\tvolume       = 8,\n\tnumber       = 2,\n\tpages        = {125--139},\n\tdoi          = {10.1109/TME.1964.4323129},\n\tissn         = {0536-1559}\n}\n@inproceedings{sivaraman2014experimental,\n\ttitle        = {An Experimental Study of the Learnability of Congestion Control},\n\tauthor       = {Anirudh Sivaraman and Keith Winstein and Pratiksha Thaker and Hari Balakrishnan},\n\tyear         = 2014,\n\tbooktitle    = {SIGCOMM}\n}\n@inproceedings{skorokhodov2018ssl,\n\ttitle        = {Semi-supervised neural machine translation with language models},\n\tauthor       = {I. Skorokhodov and A. Rykachevskiy and D. Emelyanenko and S. Slotin and A. Ponkratov},\n\tyear         = 2018,\n\tbooktitle    = {AMTA 2018 Workshop on Technologies for MT of Low Resource Languages (LoResMT)}\n}\n@article{SleatorTarjan1983,\n\ttitle        = {A data structure for dynamic trees},\n\tauthor       = {Sleator, Daniel D. and Tarjan, Robert Endre},\n\tyear         = 1983,\n\tjournal      = {Journal of computer and system sciences},\n\tvolume       = 26,\n\tnumber       = 3\n}\n@article{slivkins2019introduction,\n\ttitle        = {Introduction to multi-armed bandits},\n\tauthor       = {Slivkins, Aleksandrs},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.07272}\n}\n@article{SM00,\n\ttitle        = {Normalized cuts and image segmentation},\n\tauthor       = {J. Shi and J. 
Malik},\n\tyear         = 2000,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tpublisher    = {IEEE},\n\tvolume       = 22,\n\tnumber       = 8,\n\tpages        = {888--905}\n}\n@inproceedings{smaragdakis11context,\n\ttitle        = {Pick Your Contexts Well: Understanding Object-Sensitivity},\n\tauthor       = {Yannis Smaragdakis and Martin Bravenboer and Ondrej Lhotak},\n\tyear         = 2011,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@inproceedings{smh11,\n\ttitle        = {Generating Text with Recurrent Neural Networks},\n\tauthor       = {Sutskever, Ilya and Martens, James and Hinton, Geoffrey},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1017--1024}\n}\n@inproceedings{smith05contrastive,\n\ttitle        = {Contrastive Estimation: Training Log-Linear Models on Unlabeled Data},\n\tauthor       = {Noah Smith and Jason Eisner},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {354--362}\n}\n@inproceedings{smith06anneal,\n\ttitle        = {Annealing Structural Bias in Multilingual Weighted Grammar Induction},\n\tauthor       = {Noah Smith and Jason Eisner},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)}\n}\n@inproceedings{smith2006minimum,\n\ttitle        = {Minimum risk annealing for training log-linear models},\n\tauthor       = {David A Smith and Jason Eisner},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {787--794}\n}\n@inproceedings{smith2011cloze,\n\ttitle        = {Cloze but no cigar: The complex relationship between cloze, corpus, and subjective probabilities in language processing},\n\tauthor       = {Nathaniel Smith and 
Roger Levy},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the Annual Meeting of the Cognitive Science Society}\n}\n@article{smith2012adversarial,\n\ttitle        = {Adversarial Evaluation for Models of Natural Language},\n\tauthor       = {Noah A. Smith},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1207.0245}\n}\n@inproceedings{smith2013pragmatics,\n\ttitle        = {Learning and using language via recursive pragmatic reasoning about other agents},\n\tauthor       = {Nathaniel J. Smith and Noah D. Goodman and Michael C. Frank},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {3039--3047}\n}\n@article{smith2017bayesian,\n\ttitle        = {A bayesian perspective on generalization and stochastic gradient descent},\n\tauthor       = {Smith, Samuel L and Le, Quoc V},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.06451}\n}\n@inproceedings{smith2017cyclical,\n\ttitle        = {Cyclical learning rates for training neural networks},\n\tauthor       = {Smith, Leslie N},\n\tyear         = 2017,\n\tbooktitle    = {2017 IEEE Winter Conference on Applications of Computer Vision (WACV)},\n\tpages        = {464--472},\n\torganization = {IEEE}\n}\n@article{smith2017don,\n\ttitle        = {Don't decay the learning rate, increase the batch size},\n\tauthor       = {Smith, Samuel L and Kindermans, Pieter-Jan and Ying, Chris and Le, Quoc V},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.00489}\n}\n@inproceedings{smith2018understanding,\n\ttitle        = {Understanding Measures of Uncertainty for Adversarial Example Detection},\n\tauthor       = {Lewis Smith and Yarin Gal},\n\tyear         = 2018,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@inproceedings{smith2019super,\n\ttitle        = {Super-convergence: Very fast training of neural networks using large learning rates},\n\tauthor       = {Smith, Leslie N 
and Topin, Nicholay},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Machine Learning for Multi-Domain Operations Applications},\n\tvolume       = 11006,\n\tpages        = 1100612,\n\torganization = {International Society for Optics and Photonics}\n}\n@book{smolensky86rbm,\n\ttitle        = {Parallel Distributed Processing: Volume 1: Foundations},\n\tauthor       = {P. Smolensky},\n\tyear         = 1986,\n\tpublisher    = {MIT Press},\n\tpages        = {194--281}\n}\n@inproceedings{smtjr18,\n\ttitle        = {Learning Without Mixing: Towards A Sharp Analysis of Linear System Identification},\n\tauthor       = {Simchowitz, Max and Mania, Horia and Tu, Stephen and Jordan, Michael I and Recht, Benjamin},\n\tyear         = 2018,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpublisher    = {arXiv preprint arXiv:1802.08334}\n}\n@misc{Snap-data,\n\ttitle        = {{SNAP Datasets}: {Stanford} Large Network Dataset Collection},\n\tauthor       = {Jure Leskovec and Andrej Krevl},\n\tyear         = 2014,\n\tmonth        = jun,\n\thowpublished = {\\url{http://snap.stanford.edu/data}}\n}\n@inproceedings{snoek12hyper,\n\ttitle        = {Practical {B}ayesian Optimization of Machine Learning Algorithms},\n\tauthor       = {Jasper Snoek and Hugo Larochelle and Ryan P. 
Adams},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{snover2006ter,\n\ttitle        = {A Study of Translation Edit Rate with Targeted Human Annotation},\n\tauthor       = {Matthew Snover and Bonnie Dorr and Richard Schwartz and Linnea Micciulla and John Makhoul},\n\tyear         = 2006,\n\tbooktitle    = {Association for Machine Translation in the Americas},\n\tpages        = {223--231}\n}\n@inproceedings{snow2004learning,\n\ttitle        = {Learning syntactic patterns for automatic hypernym discovery},\n\tauthor       = {Rion Snow and Daniel Jurafsky and Andrew Y Ng},\n\tyear         = 2004,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{snyder07database,\n\ttitle        = {Database-Text Alignment via Structured Multilabel Classification},\n\tauthor       = {Benjamin Snyder and Regina Barzilay},\n\tyear         = 2007,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {1713--1718}\n}\n@inproceedings{snyder2010climbing,\n\ttitle        = {Climbing the tower of Babel: Unsupervised multilingual learning},\n\tauthor       = {Benjamin Snyder and Regina Barzilay},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{so2008unified,\n\ttitle        = {A unified theorem on {SDP} rank reduction},\n\tauthor       = {A. So and Y. Ye and J. 
Zhang},\n\tyear         = 2008,\n\tjournal      = {Mathematics of Operations Research},\n\tvolume       = 33,\n\tnumber       = 4,\n\tpages        = {910--920}\n}\n@inproceedings{socher2011paraphrase,\n\ttitle        = {Dynamic pooling and unfolding recursive autoencoders for paraphrase detection},\n\tauthor       = {Richard Socher and Eric H Huang and Jeffrey Pennin and Christopher D Manning and Andrew Ng},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {801--809}\n}\n@inproceedings{socher2011parsing,\n\ttitle        = {Parsing natural scenes and natural language with recursive neural networks},\n\tauthor       = {Richard Socher and Cliff C Lin and Chris Manning and Andrew Y Ng},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {129--136}\n}\n@inproceedings{socher2012mvrnn,\n\ttitle        = {Semantic compositionality through recursive matrix-vector spaces},\n\tauthor       = {Richard Socher and Brody Huval and Christopher D Manning and Andrew Y Ng},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {1201--1211}\n}\n@inproceedings{socher2013reasoning,\n\ttitle        = {Reasoning with neural tensor networks for knowledge base completion},\n\tauthor       = {Richard Socher and Danqi Chen and Christopher D Manning and Andrew Ng},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {926--934}\n}\n@inproceedings{socher2013recursive,\n\ttitle        = {Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank},\n\tauthor       = {Richard Socher and Alex Perelygin and Jean Y Wu and Jason Chuang and Christopher D Manning and Andrew Y Ng and Christopher Potts},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in 
Natural Language Processing (EMNLP)}\n}\n@article{socher2014grounded,\n\ttitle        = {Grounded compositional semantics for finding and describing images with sentences},\n\tauthor       = {Richard Socher and Andrej Karpathy and Quoc V Le and Christopher D Manning and Andrew Y Ng},\n\tyear         = 2014,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 2,\n\tpages        = {207--218}\n}\n@article{soderstrom1982some,\n\ttitle        = {Some properties of the output error method},\n\tauthor       = {S{\\\"o}derstr{\\\"o}m, Torsten and Stoica, Petre},\n\tyear         = 1982,\n\tjournal      = {Automatica},\n\tpublisher    = {Elsevier},\n\tvolume       = 18,\n\tnumber       = 1,\n\tpages        = {93--99}\n}\n@inproceedings{sogaard2016deep,\n\ttitle        = {Deep multi-task learning with low level tasks supervised at lower layers},\n\tauthor       = {Anders S{\\o}gaard and Yoav Goldberg},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@misc{soh2017tagui,\n\ttitle        = {{TagUI}: {RPA} / {CLI} tool for automating user interactions},\n\tauthor       = {Ken Soh},\n\tyear         = 2017,\n\thowpublished = {\\url{https://github.com/kelaberetiv/TagUI}}\n}\n@inproceedings{sohn07ihmm,\n\ttitle        = {A Hidden {M}arkov {D}irichlet Process Model for Genetic Recombination in Open Ancestral Space},\n\tauthor       = {K. Sohn and E. P. 
Xing},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{sohn2016improved,\n\ttitle        = {Improved deep metric learning with multi-class n-pair loss objective},\n\tauthor       = {Sohn, Kihyuk},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 30th International Conference on Neural Information Processing Systems},\n\tpages        = {1857--1865}\n}\n@article{sohn2020fixmatch,\n\ttitle        = {FixMatch: Simplifying Semi-Supervised Learning with Consistency and Confidence},\n\tauthor       = {Kihyuk Sohn and David Berthelot and Chun-Liang Li and Zizhao Zhang and Nicholas Carlini and Ekin D. Cubuk and Alex Kurakin and Han Zhang and Colin Raffel},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@article{sohoni2020no,\n\ttitle        = {No Subclass Left Behind: Fine-Grained Robustness in Coarse-Grained Classification Problems},\n\tauthor       = {Nimit S Sohoni and Jared A Dunnmon and Geoffrey Angus and Albert Gu and Christopher R{\\'e}},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.12945}\n}\n@inproceedings{solar05sketching,\n\ttitle        = {Programming by Sketching for Bit-Streaming Programs},\n\tauthor       = {Armando Solar-Lezama and Rodric Rabbah and Rastislav Bodík and Kemal Ebcioglu},\n\tyear         = 2005,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@inproceedings{solar2006combinatorial,\n\ttitle        = {Combinatorial Sketching for Finite Programs},\n\tauthor       = {Armando Solar-Lezama and Liviu Tancau and Rastislav Bodik and Vijay Saraswat and Sanjit Seshia},\n\tyear         = 2006,\n\tbooktitle    = {Architectural Support for Programming Languages and Operating Systems (ASPLOS)}\n}\n@article{solomon2006norms,\n\ttitle        = {Norms of epistemic diversity},\n\tauthor       = {Miriam Solomon},\n\tyear         = 2006,\n\tjournal      = {Episteme},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages   
     = {23--36}\n}\n@inproceedings{soltanolkotabi2017learning,\n\ttitle        = {Learning {ReLUs} via gradient descent},\n\tauthor       = {Soltanolkotabi, Mahdi},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2007--2017}\n}\n@article{soltanolkotabi2018theoretical,\n\ttitle        = {Theoretical insights into the optimization landscape of over-parameterized shallow neural networks},\n\tauthor       = {Soltanolkotabi, Mahdi and Javanmard, Adel and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {IEEE},\n\tvolume       = 65,\n\tnumber       = 2,\n\tpages        = {742--769}\n}\n@inproceedings{song10kernel,\n\ttitle        = {{H}ilbert Space Embeddings of Hidden {M}arkov Models},\n\tauthor       = {L. Song and B. Boots and S. Siddiqi and G. Gordon and A. Smola},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{song2004learning,\n\ttitle        = {Learning block importance models for web pages},\n\tauthor       = {Ruihua Song and Haifeng Liu and Ji-Rong Wen and Wei-Ying Ma},\n\tyear         = 2004,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {203--211}\n}\n@inproceedings{song2011kernel,\n\ttitle        = {Kernel embeddings of latent tree graphical models},\n\tauthor       = {Le Song and Eric P Xing and Ankur P Parikh},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2708--2716}\n}\n@inproceedings{song2011spectral,\n\ttitle        = {A spectral algorithm for latent tree graphical models},\n\tauthor       = {Le Song and E. P Xing and A. 
P Parikh},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{song2016retrieval,\n\ttitle        = {Two are Better than One: An Ensemble of Retrieval- and Generation-Based Dialog Systems},\n\tauthor       = {Yiping Song and Rui Yan and Xiang Li and Dongyan Zhao and Ming Zhang},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1610.07149}\n}\n@article{song2018learning,\n\ttitle        = {Learning Controllable Fair Representations},\n\tauthor       = {Song, Jiaming and Kalluri, Pratyusha and Grover, Aditya and Zhao, Shengjia and Ermon, Stefano},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.04218}\n}\n@inproceedings{song2019distribution,\n\ttitle        = {Distribution calibration for regression},\n\tauthor       = {Song, Hao and Diethe, Tom and Kull, Meelis and Flach, Peter},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {5897--5906},\n\torganization = {PMLR}\n}\n@article{SongEtal:NonparametricTensorDecomp,\n\ttitle        = {Nonparametric Estimation of Multi-View Latent Variable Models},\n\tauthor       = {L. Song and A. Anandkumar and B. Dai and B. Xie},\n\tyear         = 2013,\n\tmonth        = nov,\n\tjournal      = {Available on arXiv:1311.3287}\n}\n@inproceedings{sontag2008outer,\n\ttitle        = {New Outer Bounds on the Marginal Polytope},\n\tauthor       = {D. Sontag and T. Jaakkola},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1393--1400}\n}\n@inproceedings{sontag2008tightening,\n\ttitle        = {Tightening {LP} Relaxations for {MAP} using Message-Passing},\n\tauthor       = {D. Sontag and T. Meltzer and A. Globerson and Y. Weiss and T. 
Jaakkola},\n\tyear         = 2008,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {503--510}\n}\n@phdthesis{sontag2010approximate,\n\ttitle        = {Approximate inference in graphical models using {LP} relaxations},\n\tauthor       = {David Sontag},\n\tyear         = 2010,\n\tschool       = {Massachusetts Institute of Technology}\n}\n@inproceedings{sontag2011complexity,\n\ttitle        = {Complexity of Inference in Latent {D}irichlet Allocation},\n\tauthor       = {David Sontag and Dan Roy},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{sordoni2015neural,\n\ttitle        = {A neural network approach to context-sensitive generation of conversational responses},\n\tauthor       = {Alessandro Sordoni and Michel Galley and Michael Auli and Chris Brockett and Yangfeng Ji and Margaret Mitchell and Jian-Yun Nie and Jianfeng Gao and Bill Dolan},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{soricut06discourse,\n\ttitle        = {Discourse Generation Using Utility-Trained Coherence Models},\n\tauthor       = {Radu Soricut and Daniel Marcu},\n\tyear         = 2006,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {803--810}\n}\n@inproceedings{soricut06widl,\n\ttitle        = {Stochastic Language Generation Using {WIDL}-Expressions and its Application in Machine Translation and Summarization},\n\tauthor       = {Radu Soricut and Daniel Marcu},\n\tyear         = 2006,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1105--1112}\n}\n@article{soricut2006automatic,\n\ttitle        = {Automatic question answering using the web: Beyond the factoid},\n\tauthor       = {Radu Soricut and Eric Brill},\n\tyear         = 2006,\n\tjournal      = {Information Retrieval},\n\tvolume       = 
9\n}\n@inproceedings{sorokin2018modeling,\n\ttitle        = {Modeling semantics with gated graph neural networks for knowledge base question answering},\n\tauthor       = {Daniil Sorokin and Iryna Gurevych},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{soudry2017exponentially,\n\ttitle        = {Exponentially vanishing sub-optimal local minima in multilayer neural networks},\n\tauthor       = {Soudry, Daniel and Hoffer, Elad},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.05777}\n}\n@article{soudry2018implicit,\n\ttitle        = {The implicit bias of gradient descent on separable data},\n\tauthor       = {Soudry, Daniel and Hoffer, Elad and Nacson, Mor Shpigel and Gunasekar, Suriya and Srebro, Nathan},\n\tyear         = 2018,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 19,\n\tnumber       = 1,\n\tpages        = {2822--2878}\n}\n@inproceedings{souloumiac2009joint,\n\ttitle        = {Joint diagonalization: Is non-orthogonal always preferable to orthogonal?},\n\tauthor       = {A. 
Souloumiac},\n\tyear         = 2009,\n\tbooktitle    = {Computational Advances in Multi-Sensor Adaptive Processing},\n\tpages        = {305--308}\n}\n@article{sp97,\n\ttitle        = {Bidirectional recurrent neural networks},\n\tauthor       = {Schuster, Mike and Paliwal, Kuldip K},\n\tyear         = 1997,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tpublisher    = {IEEE},\n\tvolume       = 45,\n\tnumber       = 11,\n\tpages        = {2673--2681}\n}\n@inproceedings{spalteholz2008keysurf,\n\ttitle        = {Keysurf: a character controlled browser for people with physical disabilities},\n\tauthor       = {Leo Spalteholz and Kin Fun Li and Nigel Livingston and Foad Hamidi},\n\tyear         = 2008,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@article{sparck1975report,\n\ttitle        = {Report on the Need for and Provision of an ``Ideal'' test collection},\n\tauthor       = {K Sparck Jones and C Van Rijsbergen},\n\tyear         = 1975,\n\tjournal      = {Information Retrieval Test Collection}\n}\n@inproceedings{SpectralLDA,\n\ttitle        = {A Spectral Algorithm for Latent {D}irichlet Allocation},\n\tauthor       = {Anima Anandkumar and Dean P. Foster and Daniel Hsu and Sham M. 
Kakade and Yi-Kai Liu},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems 25}\n}\n@incollection{speech,\n\ttitle        = {Readings in speech recognition},\n\tauthor       = {Rabiner, Lawrence R.},\n\tyear         = 1990,\n\tpublisher    = {Morgan Kaufmann Publishers Inc.},\n\taddress      = {San Francisco, CA, USA},\n\tpages        = {267--296},\n\tisbn         = {1-55860-124-4},\n\turl          = {http://dl.acm.org/citation.cfm?id=108235.108253},\n\tchapter      = {A tutorial on hidden Markov models and selected applications in speech recognition},\n\teditor       = {Waibel, Alex and Lee, Kai-Fu},\n\tnumpages     = 30,\n\tacmid        = 108253\n}\n@misc{speer2018wordfreq,\n\ttitle        = {Luminoso{I}nsight/wordfreq: v2.2},\n\tauthor       = {Robyn Speer and Joshua Chin and Andrew Lin and Sara Jewett and Lance Nathan},\n\tyear         = 2018,\n\thowpublished = {\\url{https://doi.org/10.5281/zenodo.1443582}}\n}\n@article{spengler2010document,\n\ttitle        = {Document structure meets page layout: loopy random fields for web news content extraction},\n\tauthor       = {Alex Spengler and Patrick Gallinari},\n\tyear         = 2010,\n\tjournal      = {ACM Symposium on Document Engineering}\n}\n@article{SphericalGaussian2012,\n\ttitle        = {{Learning Mixtures of Spherical Gaussians: Moment Methods and Spectral Decompositions}},\n\tauthor       = {D. Hsu and S. M. 
Kakade},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.5766}\n}\n@misc{Spielman-lecture,\n\ttitle        = {Spectral Graph Theory: Lecture 7},\n\tauthor       = {Daniel Spielman},\n\tyear         = 2012,\n\tnote         = {\\url{http://www.cs.cmu.edu/~15859n/RelatedWork/Spielman-SpectralClass/lect07-12-3.pdf}},\n\thowpublished = {Lecture notes}\n}\n@inproceedings{spielman1996spectral,\n\ttitle        = {Spectral partitioning works: Planar graphs and finite element meshes},\n\tauthor       = {Spielman, Daniel A and Teng, Shang-Hua},\n\tyear         = 1996,\n\tbooktitle    = {Proceedings of 37th Conference on Foundations of Computer Science},\n\tpages        = {96--105},\n\torganization = {IEEE}\n}\n@article{spielman2004smoothed,\n\ttitle        = {Smoothed analysis of algorithms: Why the simplex algorithm usually takes polynomial time},\n\tauthor       = {Spielman, Daniel A and Teng, Shang-Hua},\n\tyear         = 2004,\n\tjournal      = {Journal of the ACM (JACM)},\n\tpublisher    = {ACM},\n\tvolume       = 51,\n\tnumber       = 3,\n\tpages        = {385--463}\n}\n@article{SpielmanSrivastava2011,\n\ttitle        = {{Graph Sparsification by Effective Resistances}},\n\tauthor       = {Spielman, Daniel A. 
and Srivastava, Nikhil},\n\tyear         = 2011,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 40,\n\tnumber       = 6,\n\tpages        = {1913--1926},\n\tdoi          = {10.1137/080734029},\n\tissn         = {0097-5397},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:0803.0929v4},\n\teprint       = {arXiv:0803.0929v4},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Spielman - 2009 - Graph Sparsification by Effective Resistances ∗.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@inproceedings{spitkovsky2012cross,\n\ttitle        = {A Cross-Lingual Dictionary for {E}nglish {W}ikipedia Concepts},\n\tauthor       = {Valentin I Spitkovsky and Angel X Chang},\n\tyear         = 2012,\n\tbooktitle    = {Language Resources and Evaluation (LREC)},\n\tpages        = {3168--3175}\n}\n@inproceedings{spoon04ddp,\n\ttitle        = {Demand-Driven Type Inference with Subgoal Pruning: Trading Precision for Scalability},\n\tauthor       = {S. 
Alexander Spoon and Olin Shivers},\n\tyear         = 2004,\n\tbooktitle    = {European Conference on Object-Oriented Programming (ECOOP)}\n}\n@inproceedings{sprague2019interpretable,\n\ttitle        = {Interpretable {AI} for Deep Learning-Based Meteorological Applications},\n\tauthor       = {Conner Sprague and Eric B Wendoloski and Ingrid Guch},\n\tyear         = 2019,\n\tbooktitle    = {99th American Meteorological Society Annual Meeting}\n}\n@article{springenberg2014striving,\n\ttitle        = {Striving for simplicity: The all convolutional net},\n\tauthor       = {Jost Tobias Springenberg and Alexey Dosovitskiy and Thomas Brox and Martin Riedmiller},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.6806}\n}\n@article{sprott2010biomarkers,\n\ttitle        = {Biomarkers of aging and disease: introduction and definitions},\n\tauthor       = {Richard L Sprott},\n\tyear         = 2010,\n\tjournal      = {Experimental gerontology},\n\tvolume       = 45,\n\tnumber       = 1,\n\tpages        = {2--4}\n}\n@misc{squad2016url,\n\ttitle        = {{SQuAD}},\n\tauthor       = {Pranav Rajpurkar},\n\tyear         = 2016,\n\thowpublished = {\\url{https://rajpurkar.github.io/SQuAD-explorer/}}\n}\n@inproceedings{SR,\n\ttitle        = {Complexity of inference in Latent Dirichlet Allocation},\n\tauthor       = {D. Sontag and D. 
Roy},\n\tyear         = 2011,\n\tbooktitle    = {NIPS},\n\tpages        = {1008--1016}\n}\n@book{sra2006nonnegative,\n\ttitle        = {Nonnegative matrix approximation: Algorithms and applications},\n\tauthor       = {Sra, Suvrit and Dhillon, Inderjit S},\n\tyear         = 2006,\n\tpublisher    = {Computer Science Department, University of Texas at Austin}\n}\n@article{srd18,\n\ttitle        = {Mathematical Models of Physiological Responses to Exercise},\n\tauthor       = {Sojoudi, Somayeh and Recht, Benjamin and Doyle, John C},\n\tyear         = 2018,\n\tjournal      = {https://people.eecs.berkeley.edu/~sojoudi/SRD_2018.pdf}\n}\n@inproceedings{srebro06mixture,\n\ttitle        = {An investigation of computational and informational limits in {G}aussian mixture clustering},\n\tauthor       = {Nathan Srebro and Gregory Shakhnarovich and Sam Roweis},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {865--872}\n}\n@techreport{srebro10stochastic,\n\ttitle        = {Stochastic optimization and online learning with smooth loss functions},\n\tauthor       = {Nati Srebro and Karthik Sridharan and Ambuj Tewari},\n\tyear         = 2010,\n\tinstitution  = {TTI Chicago}\n}\n@inproceedings{srebro10tutorial,\n\ttitle        = {Stochastic optimization for machine learning (tutorial)},\n\tauthor       = {Nati Srebro and Ambuj Tewari},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{srebro2003weighted,\n\ttitle        = {Weighted low-rank approximations},\n\tauthor       = {Srebro, Nathan and Jaakkola, Tommi},\n\tyear         = 2003,\n\tbooktitle    = {ICML}\n}\n@inproceedings{srebro2004maximum,\n\ttitle        = {Maximum-margin matrix factorization},\n\tauthor       = {Srebro, Nathan and Rennie, Jason D. M. and Jaakkola, Tommi S},\n\tyear         = 2004,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = 
{1329--1336}\n}\n@inproceedings{srebro2005rank,\n\ttitle        = {Rank, trace-norm and max-norm},\n\tauthor       = {Srebro, Nathan and Shraibman, Adi},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Computational Learning Theory},\n\tpages        = {545--560},\n\torganization = {Springer}\n}\n@inproceedings{srikumar2011joint,\n\ttitle        = {A joint model for extended semantic role labeling},\n\tauthor       = {Vivek Srikumar and Dan Roth},\n\tyear         = 2011,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{srinivas2009gaussian,\n\ttitle        = {Gaussian process optimization in the bandit setting: No regret and experimental design},\n\tauthor       = {Srinivas, Niranjan and Krause, Andreas and Kakade, Sham M and Seeger, Matthias},\n\tyear         = 2009,\n\tjournal      = {arXiv preprint arXiv:0912.3995}\n}\n@article{srinivas2012,\n\ttitle        = {Information-Theoretic Regret Bounds for Gaussian Process Optimization in the Bandit Setting},\n\tauthor       = {Srinivas, Niranjan and Krause, Andreas and Kakade, Sham M. and Seeger, Matthias W.},\n\tyear         = 2012,\n\tmonth        = may,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tpublisher    = {Institute of Electrical and Electronics Engineers (IEEE)},\n\tvolume       = 58,\n\tnumber       = 5,\n\tpages        = {3250--3265},\n\tdoi          = {10.1109/tit.2011.2182033},\n\tissn         = {1557-9654},\n\turl          = {http://dx.doi.org/10.1109/TIT.2011.2182033}\n}\n@inproceedings{srinivas2020curl,\n\ttitle        = {Curl: Contrastive unsupervised representations for reinforcement learning},\n\tauthor       = {Srinivas, Aravind and Laskin, Michael and Abbeel, Pieter},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@article{srinivasa2010herb,\n\ttitle        = {{HERB}: a home exploring robotic butler},\n\tauthor       = {S. Srinivasa and D. Ferguson and C. Helfrich and D. 
Berenson and A. Collet and R. Diankov and G. Gallagher and G. Hollinger and J. Kuffner and M. Weghe},\n\tyear         = 2010,\n\tjournal      = {Autonomous Robots},\n\tvolume       = 28,\n\tnumber       = 1,\n\tpages        = {5--20}\n}\n@article{srinivasan2017nli,\n\ttitle        = {Natural Language Interfaces for Data Analysis with Visualization: Considering What Has and Could Be Asked},\n\tauthor       = {Arjun Srinivasan and John Stasko},\n\tyear         = 2017,\n\tjournal      = {EuroVis}\n}\n@article{srinivasan2020learning,\n\ttitle        = {Learning to be Safe: Deep RL with a Safety Critic},\n\tauthor       = {Srinivasan, Krishnan and Eysenbach, Benjamin and Ha, Sehoon and Tan, Jie and Finn, Chelsea},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.14603}\n}\n@inproceedings{srivasta2020human,\n\ttitle        = {Robustness to Spurious Correlations via Human Annotations},\n\tauthor       = {Megha Srivastava and Tatsunori Hashimoto and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{srivastava2012multimodal,\n\ttitle        = {Multimodal Learning with Deep {B}oltzmann Machines},\n\tauthor       = {Nitish Srivastava and Ruslan R Salakhutdinov},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2222--2230}\n}\n@article{srivastava2014dropout,\n\ttitle        = {Dropout: a simple way to prevent neural networks from overfitting},\n\tauthor       = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},\n\tyear         = 2014,\n\tjournal      = {The journal of machine learning research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {1929--1958}\n}\n@article{srivastava2015highway,\n\ttitle        = {Highway networks},\n\tauthor       = {Srivastava, Rupesh Kumar and Greff, Klaus and Schmidhuber, J{\\\"u}rgen},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1505.00387}\n}\n@inproceedings{srivastava2017joint,\n\ttitle        = {Joint concept learning and semantic parsing from natural language explanations},\n\tauthor       = {Shashank Srivastava and Igor Labutov and Tom Mitchell},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1528--1537}\n}\n@article{srivastava2020sequence,\n\ttitle        = {Sequence and chromatin determinants of transcription factor binding and the establishment of cell type-specific binding patterns},\n\tauthor       = {Divyanshi Srivastava and Shaun Mahony},\n\tyear         = 2020,\n\tjournal      = {Biochimica et Biophysica Acta (BBA)-Gene Regulatory Mechanisms},\n\tvolume       = 1863,\n\tnumber       = 6\n}\n@inproceedings{SS08,\n\ttitle        = {{SVM} optimization: inverse dependence on training set size},\n\tauthor       = {Shai {Shalev-Shwartz} and Nathan Srebro},\n\tyear         = 2008,\n\tbooktitle    = {ICML}\n}\n@inproceedings{ss18,\n\ttitle        = {Spurious Local Minima are Common in Two-Layer {R}e{LU} Neural Networks},\n\tauthor       = {Itay Safran and Ohad Shamir},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {http://arxiv.org/abs/1712.08968}\n}\n@book{SS90,\n\ttitle        = {Matrix Perturbation Theory},\n\tauthor       = {G. W. 
Stewart and Ji-Guang Sun},\n\tyear         = 1990,\n\tpublisher    = {Academic Press}\n}\n@article{ss91,\n\ttitle        = {Turing computability with neural nets},\n\tauthor       = {Siegelmann, Hava T and Sontag, Eduardo D},\n\tyear         = 1991,\n\tjournal      = {Applied Mathematics Letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 4,\n\tnumber       = 6,\n\tpages        = {77--80}\n}\n@inproceedings{ssb14,\n\ttitle        = {Long short-term memory recurrent neural network architectures for large scale acoustic modeling},\n\tauthor       = {Sak, Ha{\\c{s}}im and Senior, Andrew and Beaufays, Fran{\\c{c}}oise},\n\tyear         = 2014,\n\tbooktitle    = {Fifteenth annual conference of the international speech communication association}\n}\n@inproceedings{ssn12,\n\ttitle        = {LSTM neural networks for language modeling},\n\tauthor       = {Sundermeyer, Martin and Schl{\\\"u}ter, Ralf and Ney, Hermann},\n\tyear         = 2012,\n\tbooktitle    = {Thirteenth annual conference of the international speech communication association}\n}\n@inproceedings{st04,\n\ttitle        = {{Nearly-linear time algorithms for graph partitioning, graph sparsification, and solving linear systems}},\n\tauthor       = {Spielman, Daniel A. and Teng, Shang-Hua},\n\tyear         = 2004,\n\tbooktitle    = {Proceedings of the thirty-sixth annual ACM symposium on Theory of computing - STOC '04},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, New York, USA},\n\tpages        = 81,\n\tdoi          = {10.1145/1007352.1007372},\n\tisbn         = 1581138520,\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@article{ST08a,\n\ttitle        = {{A Local Clustering Algorithm for Massive Graphs and Its Application to Nearly Linear Time Graph Partitioning}},\n\tauthor       = {Spielman, Daniel A. 
and Teng, Shang-Hua},\n\tyear         = 2013,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 42,\n\tnumber       = 1,\n\tpages        = {1--26},\n\tdoi          = {10.1137/080744888},\n\tissn         = {0097-5397},\n\tabstract     = {We study the design of local algorithms for massive graphs. A local algorithm is one that finds a solution containing or near a given vertex without looking at the whole graph. We present a local clustering algorithm. Our algorithm finds a good cluster--a subset of vertices whose internal connections are significantly richer than its external connections--near a given vertex. The running time of our algorithm, when it finds a non-empty local cluster, is nearly linear in the size of the cluster it outputs. Our clustering algorithm could be a useful primitive for handling massive graphs, such as social networks and web-graphs. As an application of this clustering algorithm, we present a partitioning algorithm that finds an approximate sparsest cut with nearly optimal balance. Our algorithm takes time nearly linear in the number edges of the graph. Using the partitioning algorithm of this paper, we have designed a nearly-linear time algorithm for constructing spectral sparsifiers of graphs, which we in turn use in a nearly-linear time algorithm for solving linear systems in symmetric, diagonally-dominant matrices. The linear system solver also leads to a nearly linear-time algorithm for approximating the second-smallest eigenvalue and corresponding eigenvector of the Laplacian matrix of a graph. 
These other results are presented in two companion papers.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {0809.3232},\n\teprint       = {0809.3232},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Spielman, Teng - 2013 - A Local Clustering Algorithm for Massive Graphs and Its Application to Nearly Linear Time Graph Partitioning.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsest Cut/Local Clustering,Algorithms/Sparsification}\n}\n@article{ST08b,\n\ttitle        = {{Spectral Sparsification of Graphs}},\n\tauthor       = {Spielman, Daniel A. and Teng, Shang-Hua},\n\tyear         = 2011,\n\tmonth        = jan,\n\tjournal      = {SIAM Journal on Computing},\n\tvolume       = 40,\n\tnumber       = 4,\n\tpages        = {981--1025},\n\tdoi          = {10.1137/08074489X},\n\tissn         = {0097-5397},\n\tabstract     = {We introduce a new notion of graph sparsificaiton based on spectral similarity of graph Laplacians: spectral sparsification requires that the Laplacian quadratic form of the sparsifier approximate that of the original. This is equivalent to saying that the Laplacian of the sparsifier is a good preconditioner for the Laplacian of the original. We prove that every graph has a spectral sparsifier of nearly linear size. Moreover, we present an algorithm that produces spectral sparsifiers in time \\$\\backslash softO\\{m\\}\\$, where \\$m\\$ is the number of edges in the original graph. This construction is a key component of a nearly-linear time algorithm for solving linear equations in diagonally-dominant matrcies. 
Our sparsification algorithm makes use of a nearly-linear time algorithm for graph partitioning that satisfies a strong guarantee: if the partition it outputs is very unbalanced, then the larger part is contained in a subgraph of high conductance.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {0808.4134},\n\teprint       = {0808.4134},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Spielman, Teng - 2008 - Spectral Sparsification of Graphs.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification}\n}\n@article{ST08c,\n\ttitle        = {{Nearly Linear Time Algorithms for Preconditioning and Solving Symmetric, Diagonally Dominant Linear Systems}},\n\tauthor       = {Spielman, Daniel A. and Teng, Shang-Hua},\n\tyear         = 2014,\n\tmonth        = jul,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 35,\n\tnumber       = 3,\n\tpages        = {835--885},\n\tdoi          = {10.1137/090771430},\n\tissn         = {0895-4798},\n\tabstract     = {We present a randomized algorithm that, on input a symmetric, weakly diagonally dominant n-by-n matrix A with m nonzero entries and an n-vector b, produces a y such that \\$\\backslash norm\\{y - \\backslash pinv\\{A\\} b\\}\\_\\{A\\} \\backslash leq \\backslash epsilon \\backslash norm\\{\\backslash pinv\\{A\\} b\\}\\_\\{A\\}\\$ in expected time \\$O (m \\backslash log\\^{}\\{c\\}n \\backslash log (1/\\backslash epsilon)),\\$ for some constant c. By applying this algorithm inside the inverse power method, we compute approximate Fiedler vectors in a similar amount of time. The algorithm applies subgraph preconditioners in a recursive fashion. These preconditioners improve upon the subgraph preconditioners first introduced by Vaidya (1990). 
For any symmetric, weakly diagonally-dominant matrix A with non-positive off-diagonal entries and \\$k \\backslash geq 1\\$, we construct in time \\$O (m \\backslash log\\^{}\\{c\\} n)\\$ a preconditioner B of A with at most \\$2 (n - 1) + O ((m/k) \\backslash log\\^{}\\{39\\} n)\\$ nonzero off-diagonal entries such that the finite generalized condition number \\$\\backslash kappa\\_\\{f\\} (A,B)\\$ is at most k, for some other constant c. In the special case when the nonzero structure of the matrix is planar the corresponding linear system solver runs in expected time \\$ O (n \\backslash log\\^{}\\{2\\} n + n \\backslash log n \\backslash \\backslash log \\backslash log n \\backslash \\backslash log (1/\\backslash epsilon))\\$. We hope that our introduction of algorithms of low asymptotic complexity will lead to the development of algorithms that are also fast in practice.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {cs/0607105},\n\teprint       = {0607105},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Spielman, Teng - 2006 - Nearly-Linear Time Algorithms for Preconditioning and Solving Symmetric, Diagonally Dominant Linear Systems.pdf:pdf},\n\tmendeley-groups = {Algorithms/Sparsification},\n\tprimaryclass = {cs}\n}\n@article{stack4816,\n\ttitle        = {What are the sharpest known tail bounds for ${\\cal X}_k^2$ distributed variables?},\n\tauthor       = {Robin Girard},\n\tyear         = 2010,\n\tjournal      = {StackExchange},\n\tpublisher    = {\\url{https://stats.stackexchange.com/questions/4816/what-are-the-sharpest-known-tail-bounds-for-chi-k2-distributed-variables}}\n}\n@article{stadie2017third,\n\ttitle        = {Third-Person Imitation Learning},\n\tauthor       = {B. C. Stadie and P. Abbeel and I. 
Sutskever},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.01703}\n}\n@inproceedings{stadie2018importance,\n\ttitle        = {The importance of sampling in meta-reinforcement learning},\n\tauthor       = {Bradly Stadie and Ge Yang and Rein Houthooft and Peter Chen and Yan Duan and Yuhuai Wu and Pieter Abbeel and Ilya Sutskever},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {9280--9290}\n}\n@inproceedings{stallkamp2011german,\n\ttitle        = {The German traffic sign recognition benchmark: a multi-class classification competition},\n\tauthor       = {Johannes Stallkamp and Marc Schlipsing and Jan Salmen and Christian Igel},\n\tyear         = 2011,\n\tbooktitle    = {The 2011 international joint conference on neural networks},\n\tpages        = {1453--1460}\n}\n@inproceedings{stanford2017kbp,\n\ttitle        = {Stanford at {TAC} {KBP} 2017: Building a Trilingual Relational Knowledge Graph},\n\tauthor       = {Arun Tejasvi Chaganty and Ashwin Paranjape and Jason Bolton and Matthew Lamm and Jinhao Lei and Abigail See and Kevin Clark and Yuhao Zhang and Peng Qi and Christopher D Manning},\n\tyear         = 2017,\n\tbooktitle    = {Text Analytics Conference}\n}\n@article{stanojevic2008reference,\n\ttitle        = {Reference ranges for spirometry across all ages: a new approach},\n\tauthor       = {Sanja Stanojevic and Angie Wade and Janet Stocks and John Hankinson and Allan L Coates and Huiqi Pan and Mark Rosenthal and Mary Corey and Patrick Lebecque and Tim J Cole},\n\tyear         = 2008,\n\tjournal      = {American Journal of Respiratory and Critical Care Medicine},\n\tvolume       = 177,\n\tnumber       = 3,\n\tpages        = {253--260}\n}\n@article{stanton2018deep,\n\ttitle        = {Deep curiosity search: Intra-life exploration improves performance on challenging deep reinforcement learning problems},\n\tauthor       = {Christopher Stanton and Jeff Clune},\n\tyear    
     = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.00553}\n}\n@article{star_model,\n\ttitle        = {Smooth Transition Autoregressive Models -- A Survey Of Recent Developments},\n\tauthor       = {Dick van Dijk and Timo Ter{\\\"a}svirta and Philip Hans Franses},\n\tyear         = 2002,\n\tjournal      = {Econometric Reviews},\n\tvolume       = 21,\n\tpages        = {1--47}\n}\n@article{stat-linear-regression,\n\ttitle        = {A Statistical Perspective on Algorithmic Leveraging},\n\tauthor       = {Ma, Ping and Mahoney, Michael and Yu, Bin},\n\tyear         = 2013,\n\tjournal      = {arXiv:1306.5362}\n}\n@book{steedman00ccg,\n\ttitle        = {The Syntactic Process},\n\tauthor       = {Mark Steedman},\n\tyear         = 2000,\n\tpublisher    = {MIT Press}\n}\n@book{steedman1996surface,\n\ttitle        = {Surface structure and interpretation},\n\tauthor       = {M. Steedman},\n\tyear         = 1996,\n\tpublisher    = {MIT press}\n}\n@article{steel2018multiple,\n\ttitle        = {Multiple diversity concepts and their ethical-epistemic implications},\n\tauthor       = {Daniel Steel and Sina Fazelpour and Kinley Gillette and Bianca Crewe and Michael Burgess},\n\tyear         = 2018,\n\tjournal      = {European journal for philosophy of science},\n\tvolume       = 8,\n\tnumber       = 3,\n\tpages        = {761--780}\n}\n@book{stein2011fourier,\n\ttitle        = {Fourier Analysis: an Introduction},\n\tauthor       = {Elias M Stein and Rami Shakarchi},\n\tyear         = 2011,\n\tpublisher    = {Princeton University Press},\n\tvolume       = 1\n}\n@inproceedings{stein61quadratic,\n\ttitle        = {Estimation with Quadratic Loss},\n\tauthor       = {W. James and C. Stein},\n\tyear         = 1961,\n\tbooktitle    = {Fourth Berkeley Symposium in Mathematics, Statistics, and Probability},\n\tpages        = {361--380}\n}\n@article{stein81sure,\n\ttitle        = {Estimation of the mean of a multivariate normal distribution},\n\tauthor       = {C. M. 
Stein},\n\tyear         = 1981,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 9,\n\tnumber       = 6,\n\tpages        = {1135--1151}\n}\n@mastersthesis{steinberg2005computation,\n\ttitle        = {Computation of Matrix Norms with Applications to Robust Optimization},\n\tauthor       = {Daureen Steinberg},\n\tyear         = 2005,\n\tschool       = {Technion -- Israel Institute of Technology}\n}\n@article{steinhardt2009coloring,\n\ttitle        = {On coloring the odd-distance graph},\n\tauthor       = {Jacob Steinhardt},\n\tyear         = 2009,\n\tjournal      = {Electronic Journal of Combinatorics},\n\tvolume       = 16\n}\n@article{steinhardt2010permutations,\n\ttitle        = {Permutations with ascending and descending blocks},\n\tauthor       = {Jacob Steinhardt},\n\tyear         = 2010,\n\tjournal      = {Electronic Journal of Combinatorics},\n\tvolume       = 17\n}\n@inproceedings{steinhardt2011finite,\n\ttitle        = {Finite-time regional verification of stochastic nonlinear systems},\n\tauthor       = {Jacob Steinhardt and Russ Tedrake},\n\tyear         = 2011,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{steinhardt2012flexible,\n\ttitle        = {Flexible martingale priors for deep hierarchies},\n\tauthor       = {Jacob Steinhardt and Zoubin Ghahramani},\n\tyear         = 2012,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{steinhardt2014eg,\n\ttitle        = {Adaptivity and Optimism: An Improved Exponentiated Gradient Algorithm},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{steinhardt2014filtering,\n\ttitle        = {Filtering with Abstract Particles},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = 
{727--735}\n}\n@article{steinhardt2014sparse,\n\ttitle        = {The Statistics of Streaming Sparse Regression},\n\tauthor       = {Jacob Steinhardt and Stefan Wager and Percy Liang},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.4182}\n}\n@inproceedings{steinhardt2015fast,\n\ttitle        = {Learning Fast-Mixing Models for Structured Prediction},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1063--1072}\n}\n@inproceedings{steinhardt2015minimax,\n\ttitle        = {Minimax rates for memory-constrained sparse linear regression},\n\tauthor       = {Jacob Steinhardt and John Duchi},\n\tyear         = 2015,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{steinhardt2015rcm,\n\ttitle        = {Reified Context Models},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{steinhardt2015relaxed,\n\ttitle        = {Learning with Relaxed Supervision},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{steinhardt2016avoiding,\n\ttitle        = {Avoiding Imposters and Delinquents: Adversarial Crowdsourcing and Peer Prediction},\n\tauthor       = {Jacob Steinhardt and Gregory Valiant and Moses Charikar},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{steinhardt2016memory,\n\ttitle        = {Memory, Communication, and Statistical Queries},\n\tauthor       = {Jacob Steinhardt and Gregory Valiant and Stefan Wager},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{steinhardt2016risk,\n\ttitle        = {Unsupervised Risk Estimation Using Only 
Conditional Independence Structure},\n\tauthor       = {Jacob Steinhardt and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{steinhardt2017certified,\n\ttitle        = {Certified Defenses for Data Poisoning Attacks},\n\tauthor       = {Jacob Steinhardt and Pang Wei Koh and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{steinhardt2017clique,\n\ttitle        = {Does robustness imply tractability? {A} lower bound for planted clique in the semi-random model},\n\tauthor       = {Jacob Steinhardt},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{steinhardt2018resilience,\n\ttitle        = {Resilience: A Criterion for Learning in the Presence of Arbitrary Outliers},\n\tauthor       = {Jacob Steinhardt and Moses Charikar and Gregory Valiant},\n\tyear         = 2018,\n\tbooktitle    = {Innovations in Theoretical Computer Science (ITCS)}\n}\n@phdthesis{steinhardt2018thesis,\n\ttitle        = {Robust Learning: Information Theory and Algorithms},\n\tauthor       = {Jacob Steinhardt},\n\tyear         = 2018,\n\tschool       = {Stanford University}\n}\n@article{steinwart2007how,\n\ttitle        = {How to Compare Different Loss Functions and Their Risks},\n\tauthor       = {Ingo Steinwart},\n\tyear         = 2007,\n\tjournal      = {Constructive Approximation},\n\tvolume       = 26\n}\n@inproceedings{steinwart2009optimal,\n\ttitle        = {Optimal Rates for Regularized Least Squares Regression.},\n\tauthor       = {Steinwart, Ingo and Hush, Don R and Scovel, Clint and others},\n\tyear         = 2009,\n\tbooktitle    = {COLT}\n}\n@article{stephenson08brier,\n\ttitle        = {Two Extra Components in the Brier Score Decomposition},\n\tauthor       = {D. B. Stephenson and C. A. S. Coelho and I. T. 
Jolliffe},\n\tyear         = 2008,\n\tjournal      = {Weather Forecasting},\n\tvolume       = 23,\n\tpages        = {752--757}\n}\n@inproceedings{stepputtis2020lcil,\n\ttitle        = {Language-Conditioned Imitation Learning for Robot Manipulation Tasks},\n\tauthor       = {Simon Stepputtis and J. Campbell and Mariano Phielipp and Stefan Lee and Chitta Baral and H. B. Amor},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{stern2010rulechaining,\n\ttitle        = {Rule Chaining and Approximate Match in Textual Inference},\n\tauthor       = {Asher Stern and Eyal Shnarch and Amnon Lotan and Shachar Mirkin and Lili Kotlerman and Naomi Zeichner and Jonathan Berant and Ido Dagan},\n\tyear         = 2010,\n\tbooktitle    = {Text Analysis Conference}\n}\n@inproceedings{stern2011knowledge,\n\ttitle        = {Knowledge and Tree-Edits in Learnable Entailment Proofs},\n\tauthor       = {Asher Stern and Amnon Lotan and Shachar Mirkin and Eyal Shnarch and Lili Kotlerman and Jonathan Berant and Ido Dagan},\n\tyear         = 2011,\n\tbooktitle    = {Text Analysis Conference}\n}\n@inproceedings{stern2011transformation,\n\ttitle        = {A Confidence Model for Syntactically-Motivated Entailment Proofs},\n\tauthor       = {Asher Stern and Ido Dagan},\n\tyear         = 2011,\n\tbooktitle    = {Recent Advances in Natural Language Processing},\n\tpages        = {455--462}\n}\n@article{stern2019insertion,\n\ttitle        = {Insertion Transformer: Flexible Sequence Generation via Insertion Operations},\n\tauthor       = {Mitchell Stern and William Chan and Jamie Kiros and Jakob Uszkoreit},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.03249}\n}\n@article{stetter1993multivariate,\n\ttitle        = {Multivariate polynomial equations as matrix eigenproblems},\n\tauthor       = {Hans J Stetter},\n\tyear         = 1993,\n\tjournal      = {WSSIA},\n\tvolume       = 2,\n\tpages        = 
{355--371}\n}\n@book{stetter2004numerical,\n\ttitle        = {Numerical polynomial algebra},\n\tauthor       = {Hans J Stetter},\n\tyear         = 2004,\n\tpublisher    = {SIAM}\n}\n@inproceedings{Steurer2010,\n\ttitle        = {{Fast SDP algorithms for constraint satisfaction problems}},\n\tauthor       = {Steurer, David},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the Twenty-First Annual ACM-SIAM Symposium on Discrete Algorithms - SODA '10},\n\tpages        = {684--697},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Steurer - 2010 - Fast SDP algorithms for constraint satisfaction problems.pdf:pdf},\n\tmendeley-groups = {Optimization/Multiplicative Weight/SDP}\n}\n@inproceedings{stevens2012exploring,\n\ttitle        = {Exploring Topic Coherence over Many Models and Many Topics},\n\tauthor       = {Keith Stevens and Philip Kegelmeyer and David Andrzejewski and David Buttler},\n\tyear         = 2012,\n\tbooktitle    = {EMNLP}\n}\n@article{stewart1990matrix,\n\ttitle        = {Matrix perturbation theory},\n\tauthor       = {Stewart, Gilbert W},\n\tyear         = 1990,\n\tpublisher    = {Citeseer}\n}\n@techreport{stewart1998perturbation,\n\ttitle        = {Perturbation theory for the singular value decomposition},\n\tauthor       = {Stewart, Gilbert W},\n\tyear         = 1998\n}\n@incollection{steyvers2006probabilistic,\n\ttitle        = {Probabilistic Topic Models},\n\tauthor       = {M. Steyvers and T. Griffiths},\n\tyear         = 2006,\n\tbooktitle    = {Latent Semantic Analysis: A Road to Meaning.},\n\tpublisher    = {Laurence Erlbaum},\n\turl          = {http://cocosci.berkeley.edu/tom/papers/SteyversGriffiths.pdf},\n\teditor       = {Landauer, T. and Mcnamara, D. and Dennis, S. 
and Kintsch, W.}\n}\n@article{steyvers2007probabilistic,\n\ttitle        = {Probabilistic topic models},\n\tauthor       = {Steyvers, Mark and Griffiths, Tom},\n\tyear         = 2007,\n\tjournal      = {Handbook of latent semantic analysis},\n\tvolume       = 427,\n\tnumber       = 7,\n\tpages        = {424--440}\n}\n@inproceedings{stolcke94merge,\n\ttitle        = {Inducing Probabilistic Grammars by {B}ayesian Model Merging},\n\tauthor       = {A. Stolcke and S. Omohundro},\n\tyear         = 1994,\n\tbooktitle    = {International Colloquium on Grammatical Inference and Applications},\n\tpages        = {106--118}\n}\n@inproceedings{stolle2002options,\n\ttitle        = {Learning Options in Reinforcement Learning},\n\tauthor       = {Martin Stolle and Doina Precup},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the 5th International Symposium on Abstraction, Reformulation and Approximation}\n}\n@article{stone1977,\n\ttitle        = {Consistent Nonparametric Regression},\n\tauthor       = {Charles J Stone},\n\tyear         = 1977,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 5\n}\n@article{stone1998towards,\n\ttitle        = {Towards collaborative and adversarial learning: A case study in robotic soccer},\n\tauthor       = {Peter Stone and Manuela Veloso},\n\tyear         = 1998,\n\tjournal      = {International Journal of Human-Computer Studies},\n\tvolume       = 48,\n\tnumber       = 1,\n\tpages        = {83--104}\n}\n@book{stone2014amazon,\n\ttitle        = {Amazon Unveils a Listening, Talking, Music-Playing Speaker for Your Home},\n\tauthor       = {Brad Stone and Spencer Soper},\n\tyear         = 2014,\n\tpublisher    = {Bloomberg L. 
P.}\n}\n@inproceedings{stooke2020responsive,\n\ttitle        = {Responsive safety in reinforcement learning by pid lagrangian methods},\n\tauthor       = {Stooke, Adam and Achiam, Joshua and Abbeel, Pieter},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {9133--9143},\n\torganization = {PMLR}\n}\n@article{storvik2002particle,\n\ttitle        = {Particle Filters for state-space models with the presence of unknown static parameters},\n\tauthor       = {Storvik, Geir},\n\tyear         = 2002,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 50,\n\tnumber       = 2,\n\tpages        = {281--289}\n}\n@inproceedings{stoyanov2011empirical,\n\ttitle        = {Empirical risk minimization of graphical model parameters given approximate inference, decoding, and model structure},\n\tauthor       = {Veselin Stoyanov and Alexander Ropson and Jason Eisner},\n\tyear         = 2011,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)},\n\tpages        = {725--733}\n}\n@inproceedings{stoyanov2012fast,\n\ttitle        = {Fast and Accurate Prediction via Evidence-Specific {MRF} Structure},\n\tauthor       = {Veselin Stoyanov and Jason Eisner},\n\tyear         = 2012,\n\tbooktitle    = {ICML Workshop on Inferning: Interactions between Inference and Learning}\n}\n@article{strack2014impact,\n\ttitle        = {Impact of {HbA1c} measurement on hospital readmission rates: {Analysis} of 70,000 clinical database patient records},\n\tauthor       = {Beata Strack and Jonathan P DeShazo and Chris Gennings and Juan L Olmo and Sebastian Ventura and Krzysztof J Cios and John N Clore},\n\tyear         = 2014,\n\tjournal      = {BioMed Research International},\n\tvolume       = 2014\n}\n@article{strehl09reinforcement,\n\ttitle        = {Reinforcement Learning in Finite {MDP}s: {PAC} Analysis},\n\tauthor       = {Alexander L. Strehl and Lihong Li and Michael L. 
Littman},\n\tyear         = 2009,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 10,\n\tnumber       = {Nov},\n\tpages        = {2413--2444}\n}\n@inproceedings{strehl2004empirical,\n\ttitle        = {An empirical evaluation of interval estimation for markov decision processes},\n\tauthor       = {Strehl, Alexander L and Littman, Michael L},\n\tyear         = 2004,\n\tbooktitle    = {Tools with Artificial Intelligence, 2004. ICTAI 2004. 16th IEEE International Conference on},\n\tpages        = {128--135},\n\torganization = {IEEE}\n}\n@inproceedings{strehl2005theoretical,\n\ttitle        = {A theoretical analysis of model-based interval estimation},\n\tauthor       = {Strehl, Alexander L and Littman, Michael L},\n\tyear         = 2005,\n\tjournal      = {Proceedings of the 22nd international conference on Machine learning},\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {856--863},\n\torganization = {ACM}\n}\n@inproceedings{strehl2006pac,\n\ttitle        = {PAC model-free reinforcement learning},\n\tauthor       = {Strehl, Alexander L and Li, Lihong and Wiewiora, Eric and Langford, John and Littman, Michael L},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 23rd international conference on Machine learning},\n\tpages        = {881--888}\n}\n@book{strehl2007probably,\n\ttitle        = {Probably approximately correct (PAC) exploration in reinforcement learning},\n\tauthor       = {Strehl, Alexander L},\n\tyear         = 2007,\n\tpublisher    = {ProQuest}\n}\n@article{strehl2008analysis,\n\ttitle        = {An analysis of model-based interval estimation for Markov decision processes},\n\tauthor       = {Strehl, Alexander L and Littman, Michael L},\n\tyear         = 2008,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = {Academic Press},\n\tvolume       = 74,\n\tnumber       = 8,\n\tpages        = 
{1309--1331}\n}\n@article{strehl2009reinforcement,\n\ttitle        = {Reinforcement Learning in Finite MDPs: PAC Analysis.},\n\tauthor       = {Strehl, Alexander L and Li, Lihong and Littman, Michael L},\n\tyear         = 2009,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 10,\n\tnumber       = 11,\n\tpages        = {2413--2444}\n}\n@article{strohmer09kaczmarz,\n\ttitle        = {A randomized Kaczmarz algorithm with exponential convergence},\n\tauthor       = {Thomas Strohmer and Roman Vershynin},\n\tyear         = 2009,\n\tjournal      = {Journal of Fourier Analysis and Applications},\n\tvolume       = 15,\n\tpages        = {262--278}\n}\n@article{StrohmerVershynin2009,\n\ttitle        = {A randomized Kaczmarz algorithm with exponential convergence},\n\tauthor       = {Strohmer, Thomas and Vershynin, Roman},\n\tyear         = 2009,\n\tjournal      = {Journal of Fourier Analysis and Applications},\n\tpublisher    = {Springer},\n\tvolume       = 15,\n\tnumber       = 2,\n\tpages        = {262--278}\n}\n@article{stroop1935studies,\n\ttitle        = {Studies of interference in serial verbal reactions.},\n\tauthor       = {Stroop, J Ridley},\n\tyear         = 1935,\n\tjournal      = {Journal of experimental psychology},\n\tpublisher    = {Psychological Review Company},\n\tvolume       = 18,\n\tnumber       = 6,\n\tpages        = 643\n}\n@article{strub2017end,\n\ttitle        = {End-to-end optimization of goal-driven and visually grounded dialogue systems},\n\tauthor       = {Florian Strub and Harm De Vries and Jeremie Mary and Bilal Piot and Aaron Courville and Olivier Pietquin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.05423}\n}\n@article{sturm1999guide,\n\ttitle        = {Using {SeDuMi} 1.02, a {MATLAB} toolbox for optimization over symmetric cones},\n\tauthor       = {Jos F. 
Sturm},\n\tyear         = 1999,\n\tjournal      = {Optimization Methods and Software},\n\tvolume       = 11,\n\tpages        = {625--653}\n}\n@book{sturmfels2002solving,\n\ttitle        = {Solving systems of polynomial equations},\n\tauthor       = {Bernd Sturmfels},\n\tyear         = 2002,\n\tpublisher    = {American Mathematical Society}\n}\n@book{sturmfels2008algorithms,\n\ttitle        = {Algorithms in invariant theory},\n\tauthor       = {Bernd Sturmfels},\n\tyear         = 2008,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@misc{sturmfels2011binary,\n\ttitle        = {Binary cumulant varieties},\n\tauthor       = {Sturmfels, B. and Zwiernik, P.},\n\tyear         = 2013,\n\tjournal      = {Ann. Comb.},\n\tnumber       = 17,\n\tpages        = {229--250}\n}\n@article{stutz2018disentangling,\n\ttitle        = {Disentangling adversarial robustness and generalization},\n\tauthor       = {David Stutz and Matthias Hein and Bernt Schiele},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.00740}\n}\n@inproceedings{su2014differential,\n\ttitle        = {A Differential Equation for Modeling Nesterov’s Accelerated Gradient Method: Theory and Insights},\n\tauthor       = {Su, Weijie and Boyd, Stephen and Candes, Emmanuel},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2510--2518}\n}\n@article{su2016continuous,\n\ttitle        = {Continuously Learning Neural Dialogue Management},\n\tauthor       = {Pei-hao Su and Milica Gasic and Nikola Mrksic and Lina Maria Rojas-Barahona and Stefan Ultes and David Vandyke and Tsung-Hsien Wen and Steve J. Young},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1606.02689}\n}\n@inproceedings{su2016graphquestions,\n\ttitle        = {On Generating Characteristic-rich Question Sets for {QA} Evaluation},\n\tauthor       = {Yu Su and Huan Sun and Brian M. 
Sadler and Mudhakar Srivatsa and Izzeddin Gur and Zenghui Yan and Xifeng Yan},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{su2017building,\n\ttitle        = {Building Natural Language Interfaces to Web APIs},\n\tauthor       = {Yu Su and Ahmed Hassan Awadallah and Madian Khabsa and Patrick Pantel and Michael Gamon and Mark J. Encarnaci\\'{o}n},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)}\n}\n@inproceedings{su2017cross,\n\ttitle        = {Cross-domain Semantic Parsing via Paraphrasing},\n\tauthor       = {Yu Su and Xifeng Yan},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{su2019risk,\n\ttitle        = {Controlling Risk of Web Question Answering},\n\tauthor       = {Lixin Su and Jiafeng Guo and Yixin Fan and Yanyan Lan and Xueqi Cheng},\n\tyear         = 2019,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@article{su2020sanity,\n\ttitle        = {Sanity-Checking Pruning Methods: Random Tickets can Win the Jackpot},\n\tauthor       = {Su, Jingtong and Chen, Yihang and Cai, Tianle and Wu, Tianhao and Gao, Ruiqi and Wang, Liwei and Lee, Jason D},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@article{subbaswamy2020evaluating,\n\ttitle        = {Evaluating Model Robustness to Dataset Shift},\n\tauthor       = {Adarsh Subbaswamy and Roy Adams and Suchi Saria},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.15100}\n}\n@book{SubIDMoor,\n\ttitle        = {Subspace Identification of Linear Systems},\n\tauthor       = {P. V. Overschee and B. 
De Moor},\n\tyear         = 1996,\n\tpublisher    = {Kluwer Academic Publishers}\n}\n@inproceedings{subramanian2020interpretability,\n\ttitle        = {Achieving Interpretability in Compositional Neural Networks},\n\tauthor       = {Sanjay Subramanian and Ben Bogin and Nitish Gupta and Tomer Wolfson and Sameer Singh and Jonathan Berant and Matt Gardner},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{such2017deep,\n\ttitle        = {Deep neuroevolution: Genetic algorithms are a competitive alternative for training deep neural networks for reinforcement learning},\n\tauthor       = {Such, Felipe Petroski and Madhavan, Vashisht and Conti, Edoardo and Lehman, Joel and Stanley, Kenneth O and Clune, Jeff},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1712.06567}\n}\n@inproceedings{suchanek2007yago,\n\ttitle        = {{YAGO}: a core of semantic knowledge},\n\tauthor       = {Fabian M Suchanek and Gjergji Kasneci and Gerhard Weikum},\n\tyear         = 2007,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {697--706}\n}\n@article{suciu2018does,\n\ttitle        = {When Does Machine Learning FAIL? Generalized Transferability for Evasion and Poisoning Attacks},\n\tauthor       = {Octavian Suciu and Radu M{\\u{a}}rginean and Yi{\\u{g}}itcan Kaya and Hal Daum{\\'e} III and Tudor Dumitra{\\c{s}}},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.06975}\n}\n@inproceedings{sudderth06transformed,\n\ttitle        = {Describing Visual Scenes using Transformed {D}irichlet Processes},\n\tauthor       = {E. B. Sudderth and A. B. Torralba and W. T. Freeman and A. S. 
Willsky},\n\tyear         = 2006,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1297--1304}\n}\n@inproceedings{sudderth2008shared,\n\ttitle        = {\n\t\tShared Segmentation of Natural Scenes Using Dependent {P}itman--{Y}or\n\n\t\tProcesses\n\t},\n\tauthor       = {\n\t\tErik B. Sudderth and\n\n\t\tMichael I. Jordan\n\t},\n\tyear         = 2008,\n\tbooktitle    = {NIPS},\n\tpages        = {1585--1592},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de},\n\tee           = {http://books.nips.cc/papers/files/nips21/NIPS2008_1027.pdf}\n}\n@article{sudhakar2015bayesian,\n\ttitle        = {{B}ayesian Error-Based Sequences of Statistical Information Bounds},\n\tauthor       = {Sudhakar Prasad},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 19,\n\tpages        = {5052--5062}\n}\n@article{sudlow2015uk,\n\ttitle        = {{UK Biobank}: an open access resource for identifying the causes of a wide range of complex diseases of middle and old age},\n\tauthor       = {Cathie Sudlow and John Gallacher and Naomi Allen and Valerie Beral and Paul Burton and John Danesh and Paul Downey and Paul Elliott and Jane Green and Martin Landray and others},\n\tyear         = 2015,\n\tjournal      = {PLoS Medicine},\n\tvolume       = 12,\n\tnumber       = 3\n}\n@article{sugiyama2007covariate,\n\ttitle        = {Covariate shift adaptation by importance weighted cross validation},\n\tauthor       = {Sugiyama, Masashi and Krauledat, Matthias and M{\\\"u}ller, Klaus-Robert},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 8,\n\tnumber       = {May},\n\tpages        = {985--1005}\n}\n@inproceedings{sugiyama2008direct,\n\ttitle        = {Direct importance estimation with model selection and its application to covariate shift adaptation},\n\tauthor       = {Sugiyama, Masashi and Nakajima, Shinichi and Kashima, Hisashi and Buenau, 
Paul V and Kawanabe, Motoaki},\n\tyear         = 2008,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1433--1440}\n}\n@inproceedings{suhr2017nlvr,\n\ttitle        = {A Corpus of Natural Language for Visual Reasoning},\n\tauthor       = {Alane  Suhr and Mike   Lewis and James   Yeh and Yoav  Artzi},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{suhr2018situated,\n\ttitle        = {Situated Mapping of Sequential Instructions to Actions with Single-step Reward Observation},\n\tauthor       = {Alane Suhr and Yoav Artzi},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{suhr2019nlvr2,\n\ttitle        = {A corpus for reasoning about natural language grounded in photographs},\n\tauthor       = {Alane Suhr and Stephanie Zhou and Ally Zhang and Iris Zhang and Huajun Bai and Yoav Artzi},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{sukhbaatar2015end,\n\ttitle        = {End-to-end memory networks},\n\tauthor       = {Sukhbaatar, Sainbayar and Szlam, Arthur and Weston, Jason and Fergus, Rob},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1503.08895},\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{sukhbaatar2016learning,\n\ttitle        = {Learning multiagent communication with backpropagation},\n\tauthor       = {Sainbayar Sukhbaatar and Rob Fergus and others},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2244--2252}\n}\n@inproceedings{sumita1991experiments,\n\ttitle        = {Experiments and prospects of Example-Based Machine Translation},\n\tauthor       = {Eiichiro Sumita and Hitoshi Iida},\n\tyear         = 1991,\n\tbooktitle    = {Association for Computational Linguistics 
(ACL)}\n}\n@inproceedings{Summa2015,\n\ttitle        = {On Largest Volume Simplices and Sub-determinants},\n\tauthor       = {Summa, Marco Di and Eisenbrand, Friedrich and Faenza, Yuri and Moldenhauer, Carsten},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Twenty-Sixth Annual ACM-SIAM Symposium on Discrete Algorithms},\n\tlocation     = {San Diego, California},\n\tpublisher    = {SIAM},\n\tseries       = {SODA '15},\n\tpages        = {315--323},\n\turl          = {http://dl.acm.org/citation.cfm?id=2722129.2722152},\n\tnumpages     = 9,\n\tacmid        = 2722152\n}\n@inproceedings{sun2006beyond,\n\ttitle        = {Beyond streams and graphs: dynamic tensor analysis},\n\tauthor       = {Sun, Jimeng and Tao, Dacheng and Faloutsos, Christos},\n\tyear         = 2006,\n\tbooktitle    = {\n\t\tProceedings of the 12th ACM SIGKDD international conference on Knowledge\n\n\t\tdiscovery and data mining\n\t},\n\tlocation     = {Philadelphia, PA, USA},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {KDD '06},\n\tpages        = {374--383},\n\tdoi          = {http://doi.acm.org/10.1145/1150402.1150445},\n\tisbn         = {1-59593-339-5},\n\tacmid        = 1150445,\n\tnumpages     = 10\n}\n@inproceedings{sun2006distributed,\n\ttitle        = {Distributed Pattern Discovery in Multiple Streams},\n\tauthor       = {Jimeng Sun and Spiros Papadimitriou and Christos Faloutsos},\n\tyear         = 2006,\n\tjournal      = {PAKDD},\n\taddress      = {Singapore},\n\tpages        = {713--718},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{sun2006window,\n\ttitle        = {\n\t\tWindow-based Tensor Analysis on High-dimensional and Multi-aspect\n\n\t\tStreams\n\t},\n\tauthor       = {Sun, Jimeng and Papadimitriou, S. and Yu, P. S.},\n\tyear         = 2006,\n\tbooktitle    = {ICDM '06. 
Sixth International Conference on Data Mining},\n\tpages        = {1076--1080},\n\tdoi          = {10.1109/ICDM.2006.169},\n\tissn         = {1550-4786},\n\tabstract     = {\n\t\tData stream values are often associated with multiple aspects. For\n\n\t\texample, each value from environmental sensors may have an associated\n\n\t\ttype (e.g., temperature, humidity, etc) as well as location. Aside\n\n\t\tfrom timestamp, type and location are the two additional aspects.\n\n\t\tHow to model such streams? How to simultaneously find patterns within\n\n\t\tand across the multiple aspects? How to do it incrementally in a\n\n\t\tstreaming fashion? In this paper, all these problems are addressed\n\n\t\tthrough a general data model, tensor streams, and an effective algorithmic\n\n\t\tframework, window-based tensor analysis (WTA). Two variations of\n\n\t\tWTA, independent- window tensor analysis (IW) and moving-window tensor\n\n\t\tanalysis (MW), are presented and evaluated extensively on real datasets.\n\n\t\tFinally, we illustrate one important application, multi-aspect correlation\n\n\t\tanalysis (MACA), which uses WTA and we demonstrate its effectiveness\n\n\t\ton an environmental monitoring application.\n\t},\n\tkeywords     = {\n\t\tdata mining, environmental science computing, environmental monitoring\n\n\t\tapplication, high-dimensional streams, multi-aspect correlation analysis,\n\n\t\tmulti-aspect streams, window-based tensor analysis\n\t},\n\towner        = {leili},\n\ttimestamp    = {2010.02.03}\n}\n@inproceedings{sun2007less,\n\ttitle        = {Less is more: Compact matrix decomposition for large sparse graphs},\n\tauthor       = {Jimeng Sun and Yinglian Xie and Hui Zhang and Christos Faloutsos},\n\tyear         = 2007,\n\tbooktitle    = {In Proceeding SIAM International Conference on Data Mining}\n}\n@article{sun2008incremental,\n\ttitle        = {Incremental tensor analysis: Theory and applications},\n\tauthor       = {\n\t\tSun, Jimeng and Tao, Dacheng and 
Papadimitriou, Spiros and Yu, Philip\n\n\t\tS. and Faloutsos, Christos\n\t},\n\tyear         = 2008,\n\tjournal      = {ACM Trans. Knowl. Discov. Data},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 2,\n\tnumber       = 3,\n\tpages        = {1--37},\n\tdoi          = {http://doi.acm.org/10.1145/1409620.1409621},\n\tissn         = {1556-4681},\n\tabstract     = {\n\t\tHow do we find patterns in author-keyword associations, evolving over\n\n\t\ttime Or in data cubes (tensors), with product-branchcustomer sales\n\n\t\tinformation And more generally, how to summarize high-order data\n\n\t\tcubes (tensors) How to incrementally update these patterns over time\n\n\t\tMatrix decompositions, like principal component analysis (PCA) and\n\n\t\tvariants, are invaluable tools for mining, dimensionality reduction,\n\n\t\tfeature selection, rule identification in numerous settings like\n\n\t\tstreaming data, text, graphs, social networks, and many more settings.\n\n\t\tHowever, they have only two orders (i.e., matrices, like author and\n\n\t\tkeyword in the previous example). We propose to envision such higher-order\n\n\t\tdata as tensors, and tap the vast literature on the topic. However,\n\n\t\tthese methods do not necessarily scale up, let alone operate on semi-infinite\n\n\t\tstreams. Thus, we introduce a general framework, incremental tensor\n\n\t\tanalysis (ITA), which efficiently computes a compact summary for\n\n\t\thigh-order and high-dimensional data, and also reveals the hidden\n\n\t\tcorrelations. Three variants of ITA are presented: (1) dynamic tensor\n\n\t\tanalysis (DTA); (2) streaming tensor analysis (STA); and (3) window-based\n\n\t\ttensor analysis (WTA). In paricular, we explore several fundamental\n\n\t\tdesign trade-offs such as space efficiency, computational cost, approximation\n\n\t\taccuracy, time dependency, and model complexity. 
We implement all\n\n\t\tour methods and apply them in several real settings, such as network\n\n\t\tanomaly detection, multiway latent semantic indexing on citation\n\n\t\tnetworks, and correlation study on sensor measurements. Our empirical\n\n\t\tstudies show that the proposed methods are fast and accurate and\n\n\t\tthat they find interesting patterns and outliers on the real datasets.\n\t},\n\towner        = {leili},\n\ttimestamp    = {2010.02.05}\n}\n@article{sun2009strategies,\n\ttitle        = {On Strategies for Imbalanced Text Classification Using SVM: A Comparative Study},\n\tauthor       = {Aixin Sun and Ee-Peng Lim and Ying Liu},\n\tyear         = 2009,\n\tjournal      = {Decision Support Systems},\n\tvolume       = 48,\n\tnumber       = 1\n}\n@inproceedings{sun2011two,\n\ttitle        = {A two-stage weighting framework for multi-source domain adaptation},\n\tauthor       = {Sun, Qian and Chattopadhyay, Rita and Panchanathan, Sethuraman and Ye, Jieping},\n\tyear         = 2011,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {505--513}\n}\n@inproceedings{sun2012joint,\n\ttitle        = {Joint learning of a dual {SMT} system for paraphrase generation},\n\tauthor       = {Hong Sun and Ming Zhou},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{sun2013answer,\n\ttitle        = {Answer Extraction from Passage Graph for Question Answering},\n\tauthor       = {Hong Sun and Nan Duan and Yajuan Duan and Ming Zhou},\n\tyear         = 2013,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@article{sun2015complete1,\n\ttitle        = {Complete Dictionary Recovery over the Sphere {I}: Overview and the Geometric Picture},\n\tauthor       = {Sun, Ju and Qu, Qing and Wright, John},\n\tyear         = 2015,\n\tjournal      = {arXiv:1511.03607},\n\tpublisher    = {IEEE},\n\tvolume       = 63,\n\tnumber       = 2,\n\tpages        
= {853--884},\n\tdate-modified = {2016-02-15 19:36:19 +0000}\n}\n@article{sun2015complete2,\n\ttitle        = {Complete Dictionary Recovery over the Sphere {II}: Recovery by {R}iemannian Trust-region Method},\n\tauthor       = {Sun, Ju and Qu, Qing and Wright, John},\n\tyear         = 2015,\n\tjournal      = {arXiv:1511.04777},\n\tdate-modified = {2016-02-15 19:36:24 +0000}\n}\n@inproceedings{sun2015guaranteed,\n\ttitle        = {Guaranteed matrix completion via nonconvex factorization},\n\tauthor       = {Sun, Ruoyu and Luo, Zhi-Quan},\n\tyear         = 2015,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tbooktitle    = {Foundations of Computer Science (FOCS), 2015 IEEE 56th Annual Symposium on},\n\tpublisher    = {IEEE},\n\tvolume       = 62,\n\tnumber       = 11,\n\tpages        = {270--289},\n\torganization = {IEEE}\n}\n@article{sun2015nonconvex,\n\ttitle        = {When Are Nonconvex Problems Not Scary?},\n\tauthor       = {Sun, Ju and Qu, Qing and Wright, John},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1510.06096}\n}\n@inproceedings{sun2016deep,\n\ttitle        = {Deep coral: Correlation alignment for deep domain adaptation},\n\tauthor       = {Sun, Baochen and Saenko, Kate},\n\tyear         = 2016,\n\tbooktitle    = {European conference on computer vision},\n\tpages        = {443--450},\n\torganization = {Springer}\n}\n@inproceedings{sun2016geometric,\n\ttitle        = {A geometric analysis of phase retrieval},\n\tauthor       = {Sun, Ju and Qu, Qing and Wright, John},\n\tyear         = 2016,\n\tjournal      = {Forthcoming},\n\tbooktitle    = {Information Theory (ISIT), 2016 IEEE International Symposium on},\n\tpages        = {2379--2383},\n\torganization = {IEEE}\n}\n@inproceedings{sun2016return,\n\ttitle        = {Return of Frustratingly Easy Domain Adaptation},\n\tauthor       = {Baochen Sun and Jiashi Feng and Kate Saenko},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial 
Intelligence (AAAI)}\n}\n@incollection{sun2017correlation,\n\ttitle        = {Correlation alignment for unsupervised domain adaptation},\n\tauthor       = {Sun, Baochen and Feng, Jiashi and Saenko, Kate},\n\tyear         = 2017,\n\tbooktitle    = {Domain Adaptation in Computer Vision Applications},\n\tpublisher    = {Springer},\n\tpages        = {153--171}\n}\n@inproceedings{sun2017deeply,\n\ttitle        = {Deeply AggreVaTeD: Differentiable Imitation Learning for Sequential Prediction},\n\tauthor       = {Wen Sun and Arun Venkatraman and Geoffrey J. Gordon and Byron Boots and J. Andrew Bagnell},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{sun2017revisiting,\n\ttitle        = {Revisiting Unreasonable Effectiveness of Data in Deep Learning Era},\n\tauthor       = {Chen Sun and Abhinav Shrivastava and Saurabh Singh and Abhinav Gupta},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@article{sun2018contextual,\n\ttitle        = {Contextual memory trees},\n\tauthor       = {Wen Sun and Alina Beygelzimer and Hal {Daum{\\'e} III} and John Langford and Paul Mineiro},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.06473}\n}\n@article{sun2018improving,\n\ttitle        = {Improving Machine Reading Comprehension with General Reading Strategies},\n\tauthor       = {Kai Sun and Dian Yu and Dong Yu and Claire Cardie},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.13441}\n}\n@inproceedings{sun2018model,\n\ttitle        = {Model-based {RL} in contextual decision processes: {PAC} bounds and exponential improvements over model-free approaches},\n\tauthor       = {Sun, Wen and Jiang, Nan and Krishnamurthy, Akshay and Agarwal, Alekh and Langford, John},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Learning Theory},\n\tpages        = {2898--2933},\n\torganization = 
{PMLR}\n}\n@article{sun2019mitigating,\n\ttitle        = {Mitigating gender bias in natural language processing: Literature review},\n\tauthor       = {Tony Sun and Andrew Gaut and Shirlyn Tang and Yuxin Huang and Mai ElSherief and Jieyu Zhao and Diba Mirza and Elizabeth Belding and Kai-Wei Chang and William Yang Wang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.08976}\n}\n@article{sun2019udss,\n\ttitle        = {Unsupervised Domain Adaptation through Self-Supervision},\n\tauthor       = {Yu Sun and Eric Tzeng and Trevor Darrell and Alexei A. Efros},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{sun2020test,\n\ttitle        = {Test-time training with self-supervision for generalization under distribution shifts},\n\tauthor       = {Yu Sun and Xiaolong Wang and Zhuang Liu and John Miller and Alexei A Efros and Moritz Hardt},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{sun2020zeroth,\n\ttitle        = {Zeroth-Order Supervised Policy Improvement},\n\tauthor       = {Sun, Hao and Xu, Ziping and Song, Yuhang and Fang, Meng and Xiong, Jiechao and Dai, Bo and Zhang, Zhengyou and Zhou, Bolei},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.06600}\n}\n@inproceedings{sundararajan2017axiomatic,\n\ttitle        = {Axiomatic attribution for deep networks},\n\tauthor       = {Mukund Sundararajan and Ankur Taly and Qiqi Yan},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {3319--3328}\n}\n@inproceedings{sung2014learning,\n\ttitle        = {Synthesizing Manipulation Sequences for Under-Specified Tasks using Unrolled {M}arkov Random Fields},\n\tauthor       = {J. Sung and B. Selman and A. 
Saxena},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Intelligent Robots and Systems (IROS)}\n}\n@article{sung2015robobarista,\n\ttitle        = {Robobarista: Object Part based Transfer of Manipulation Trajectories from Crowd-sourcing in 3{D} Pointclouds},\n\tauthor       = {Jaeyong Sung and Seok Hyun Jin and Ashutosh Saxena},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1504.03071}\n}\n@inproceedings{surdeanu2012multi,\n\ttitle        = {Multi-instance multi-label learning for relation extraction},\n\tauthor       = {Mihai Surdeanu and Julie Tibshirani and Ramesh Nallapati and Christopher D Manning},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {455--465}\n}\n@book{surowiecki2004wisdom,\n\ttitle        = {The wisdom of crowds: Why the many are smarter than the few and how collective wisdom shapes business, economies, societies, and nations},\n\tauthor       = {James Surowiecki},\n\tyear         = 2004,\n\tpublisher    = {Doubleday and Co}\n}\n@article{survey,\n\ttitle        = {Introduction to probabilistic topic models},\n\tauthor       = {D. 
Blei},\n\tyear         = 2012,\n\tjournal      = {Communications of the ACM},\n\tpages        = {77--84}\n}\n@misc{sussmann2007robust,\n\ttitle        = {Building Robust Systems: An essay},\n\tauthor       = {Gerald Jay Sussman},\n\tyear         = 2007,\n\thowpublished = {\\url{https://groups.csail.mit.edu/mac/users/gjs/6.945/readings/robust-systems.pdf}}\n}\n@inproceedings{sutskever2009modelling,\n\ttitle        = {Modelling Relational Data using {B}ayesian Clustered Tensor Factorization},\n\tauthor       = {Ilya Sutskever and Ruslan Salakhutdinov and Joshua B Tenenbaum},\n\tyear         = 2009,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1821--1828}\n}\n@inproceedings{sutskever2011generating,\n\ttitle        = {Generating text with recurrent neural networks},\n\tauthor       = {Ilya Sutskever and James Martens and Geoffrey E Hinton},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1017--1024}\n}\n@inproceedings{sutskever2013importance,\n\ttitle        = {On the importance of initialization and momentum in deep learning},\n\tauthor       = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey},\n\tyear         = 2013,\n\tbooktitle    = {International conference on machine learning},\n\tpages        = {1139--1147}\n}\n@book{Sutton:1998:IRL:551283,\n\ttitle        = {Introduction to Reinforcement Learning},\n\tauthor       = {Sutton, Richard S. 
and Barto, Andrew G.},\n\tyear         = 1998,\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA, USA},\n\tisbn         = {0262193981},\n\tedition      = {1st},\n\tdate-added   = {2018-02-14 09:43:11 +0000},\n\tdate-modified = {2018-02-14 09:43:11 +0000}\n}\n@inproceedings{sutton05piecewise,\n\ttitle        = {Piecewise Training of Undirected Models},\n\tauthor       = {Charles Sutton and Andrew McCallum},\n\tyear         = 2005,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@inproceedings{sutton07pwpl,\n\ttitle        = {Piecewise Pseudolikelihood for Efficient {CRF} Training},\n\tauthor       = {Charles Sutton and Andrew McCallum},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@incollection{sutton1990integrated,\n\ttitle        = {Integrated architectures for learning, planning, and reacting based on approximating dynamic programming},\n\tauthor       = {Sutton, Richard S},\n\tyear         = 1990,\n\tjournal      = {Machine Learning Proceedings},\n\tbooktitle    = {Machine learning proceedings 1990},\n\tpublisher    = {Elsevier},\n\tpages        = {216--224}\n}\n@article{sutton1991dyna,\n\ttitle        = {Dyna, an integrated architecture for learning, planning, and reacting},\n\tauthor       = {Sutton, Richard S},\n\tyear         = 1991,\n\tjournal      = {ACM Sigart Bulletin},\n\tpublisher    = {ACM New York, NY, USA},\n\tvolume       = 2,\n\tnumber       = 4,\n\tpages        = {160--163}\n}\n@article{sutton1995td,\n\ttitle        = {{TD} models: Modeling the world at a mixture of time scales},\n\tauthor       = {R. S. 
Sutton},\n\tyear         = 1995,\n\tjournal      = {Machine Learning Proceedings},\n\tpages        = {531--539}\n}\n@book{sutton1998reinforcement,\n\ttitle        = {Reinforcement learning: An introduction},\n\tauthor       = {Sutton, Richard S and Barto, Andrew G},\n\tyear         = 1998,\n\tpublisher    = {MIT press},\n\tvolume       = 1,\n\tnumber       = 1\n}\n@article{sutton1999between,\n\ttitle        = {Between MDPs and semi-MDPs: A framework for temporal abstraction in reinforcement learning},\n\tauthor       = {R. S. Sutton and D. Precup and S. Singh},\n\tyear         = 1999,\n\tjournal      = {Artificial intelligence},\n\tvolume       = 112,\n\tpages        = {181--211}\n}\n@inproceedings{sutton1999policy,\n\ttitle        = {Policy gradient methods for reinforcement learning with function approximation.},\n\tauthor       = {Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay},\n\tyear         = 1999,\n\tbooktitle    = {NIPs},\n\tvolume       = 99,\n\tpages        = {1057--1063},\n\torganization = {Citeseer}\n}\n@inproceedings{sutton2005joint,\n\ttitle        = {Joint Parsing and Semantic Role Labeling},\n\tauthor       = {Charles Sutton and Andrew McCallum},\n\tyear         = 2005,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@inproceedings{sutton2009convergent,\n\ttitle        = {A Convergent $ O (n) $ Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation},\n\tauthor       = {Sutton, Richard S and Maei, Hamid R and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2009,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1609--1616}\n}\n@article{sutton88learning,\n\ttitle        = {Learning to Predict by the Methods of Temporal Differences},\n\tauthor       = {Richard S. 
Sutton},\n\tyear         = 1988,\n\tjournal      = {Machine Learning},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {9--44}\n}\n@inproceedings{suzuki07hybrid,\n\ttitle        = {Semi-Supervised Structured Output Learning based on a Hybrid Generative and Discriminative Approach},\n\tauthor       = {Jun Suzuki and Akinori Fujino and Hideki Isozaki},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)}\n}\n@incollection{svitkina2004min,\n\ttitle        = {Min-max multiway cut},\n\tauthor       = {Svitkina, Zoya and Tardos, {\\'E}va},\n\tyear         = 2004,\n\tbooktitle    = {Approximation, Randomization, and Combinatorial Optimization. Algorithms and Techniques},\n\tpublisher    = {Springer},\n\tpages        = {207--218}\n}\n@inproceedings{svl14,\n\ttitle        = {Sequence to sequence learning with neural networks},\n\tauthor       = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},\n\tyear         = 2014,\n\tbooktitle    = {Advances in neural information processing systems (NIPS)},\n\tpages        = {3104--3112}\n}\n@inproceedings{SVM,\n\ttitle        = {A training algorithm for optimal margin classifiers},\n\tauthor       = {Boser, Bernhard E. and Guyon, Isabelle M. 
and Vapnik, Vladimir N.},\n\tyear         = 1992,\n\tbooktitle    = {Proceedings of the fifth annual workshop on Computational learning theory},\n\tlocation     = {Pittsburgh, Pennsylvania, United States},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {COLT '92},\n\tpages        = {144--152},\n\tdoi          = {10.1145/130385.130401},\n\tisbn         = {0-89791-497-X},\n\turl          = {http://doi.acm.org/10.1145/130385.130401},\n\tnumpages     = 9,\n\tacmid        = 130401\n}\n@inproceedings{svwx17,\n\ttitle        = {On the complexity of learning neural networks},\n\tauthor       = {Song, Le and Vempala, Santosh and Wilmes, John and Xie, Bo},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpages        = {5514--5522}\n}\n@inproceedings{svyatkovskiy2019pythia,\n\ttitle        = {Pythia: {AI}-assisted Code Completion System},\n\tauthor       = {Alexey Svyatkovskiy and Ying Zhao and Shengyo Fu and Neel Sundaresan},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)}\n}\n@misc{swalin2010evaluating,\n\ttitle        = {\n\t\tEvaluating Microsoft Hyper-V Live Migration Performance Using IBM\n\n\t\tSystem x3650 M3 and IBM System Storage DS3400\n\t},\n\tauthor       = {Kent R. 
Swalin},\n\tyear         = 2010,\n\thowpublished = {Available at \\url{ftp://public.dhe.ibm.com/common/ssi/ecm/en/xsw03091usen/XSW03091USEN.PD}}\n}\n@inproceedings{swayamdipta2018multi,\n\ttitle        = {Multi-Mention Learning for Reading Comprehension with Neural Cascades},\n\tauthor       = {Swabha Swayamdipta and Ankur P Parikh and Tom Kwiatkowski},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{swayamdipta2020dataset,\n\ttitle        = {Dataset cartography: Mapping and diagnosing datasets with training dynamics},\n\tauthor       = {Swabha Swayamdipta and Roy Schwartz and Nicholas Lourie and Yizhong Wang and Hannaneh Hajishirzi and Noah A. Smith and Yejin Choi},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{swendsen87,\n\ttitle        = {Nonuniversal critical dynamics in {MC} simulations},\n\tauthor       = {R. H. Swendsen and J. S. Wang},\n\tyear         = 1987,\n\tjournal      = {Physics Review Letters},\n\tvolume       = 58,\n\tpages        = {86--88}\n}\n@inproceedings{SWW,\n\ttitle        = {Exact recovery of sparsely-used dictionaries},\n\tauthor       = {D. Spielman and H. Wang and J. 
Wright},\n\tyear         = 2012,\n\tbooktitle    = {Journal of Machine Learning Research}\n}\n@inproceedings{syed2010exploiting,\n\ttitle        = {Exploiting a web of semantic data for interpreting tables},\n\tauthor       = {Zareen Syed and Tim Finin and Varish Mulwad and Anupam Joshi},\n\tyear         = 2010,\n\tbooktitle    = {Proceedings of the Second Web Science Conference}\n}\n@inproceedings{SYG07,\n\ttitle        = {A stochastic quasi-Newton method for online convex optimization},\n\tauthor       = {Schraudolph, Nicol N and Yu, Jin and G{\\\"u}nter, Simon},\n\tyear         = 2007,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {436--443}\n}\n@article{sylvester1857question,\n\ttitle        = {A question in the geometry of situation},\n\tauthor       = {Sylvester, James Joseph},\n\tyear         = 1857,\n\tjournal      = {Quarterly Journal of Pure and Applied Mathematics},\n\tvolume       = 1\n}\n@article{Szarek1991-EigenDistribution,\n\ttitle        = {Condition numbers of random matrices},\n\tauthor       = {Szarek, Stanislaw J},\n\tyear         = 1991,\n\tjournal      = {Journal of Complexity},\n\tpublisher    = {Elsevier},\n\tvolume       = 7,\n\tnumber       = 2,\n\tpages        = {131--149}\n}\n@inproceedings{szarvas2013learning,\n\ttitle        = {Learning to rank lexical substitutions},\n\tauthor       = {Szarvas, Gy{\\\"o}rgy and Busa-Fekete, R{\\'o}bert and H{\\\"u}llermeier, Eyke},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{szegedy2014intriguing,\n\ttitle        = {Intriguing properties of neural networks},\n\tauthor       = {Christian Szegedy and Wojciech Zaremba and Ilya Sutskever and Joan Bruna and Dumitru Erhan and Ian Goodfellow and Rob Fergus},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{szegedy2015googlenet,\n\ttitle        = {Going 
deeper with convolutions},\n\tauthor       = {Christian Szegedy and Wei Liu and Yangqing Jia and Pierre Sermanet and Scott E. Reed and Dragomir Anguelov and Dumitru Erhan and Vincent Vanhoucke and Andrew Rabinovich},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{szegedy2016inception,\n\ttitle        = {Inception-v4, inception-resnet and the impact of residual connections on learning},\n\tauthor       = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke, Vincent},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1602.07261},\n\tbooktitle    = {AAAI},\n\tpages        = {4278--4284}\n}\n@inproceedings{szegedy2016rethinking,\n\ttitle        = {Rethinking the inception architecture for computer vision},\n\tauthor       = {Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1512.00567},\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {2818--2826}\n}\n@inproceedings{szita2010model,\n\ttitle        = {Model-based reinforcement learning with nearly tight exploration complexity bounds},\n\tauthor       = {Szita, Istv{\\'a}n and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2010,\n\tbooktitle    = {ICML}\n}\n@inproceedings{T,\n\ttitle        = {Greed is good: Algorithmic results for sparse approximation},\n\tauthor       = {J. 
Tropp},\n\tyear         = 2004,\n\tbooktitle    = {IEEE Transactions on Information Theory},\n\tpages        = {2231--2242}\n}\n@article{t10,\n\ttitle        = {254A, Notes 3 : The operator norm of a random matrix},\n\tauthor       = {Terence Tao},\n\tyear         = 2010,\n\tjournal      = {https://terrytao.wordpress.com/2010/01/09/254a-notes-3-the-operator-norm-of-a-random-matrix/}\n}\n@inproceedings{t16,\n\ttitle        = {Benefits of depth in neural networks},\n\tauthor       = {Telgarsky, Matus},\n\tyear         = 2016,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpublisher    = {arXiv preprint arXiv:1602.04485}\n}\n@inproceedings{t17,\n\ttitle        = {An Analytical Formula of Population Gradient for two-layered {R}e{LU} network and its Applications in Convergence and Critical Point Analysis},\n\tauthor       = {Yuandong Tian},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {http://arxiv.org/abs/1703.00560}\n}\n@inproceedings{tachet2020domain,\n\ttitle        = {Domain Adaptation with Conditional Distribution Matching and Generalized Label Shift},\n\tauthor       = {Tachet des Combes, Remi and Zhao, Han and Wang, Yu-Xiang and Gordon, Geoffrey J},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {19276--19289},\n\turl          = {https://proceedings.neurips.cc/paper/2020/file/dfbfa7ddcfffeb581f50edcf9a0204bb-Paper.pdf},\n\teditor       = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}\n}\n@inproceedings{taghipour2016neural,\n\ttitle        = {A neural approach to automated essay scoring},\n\tauthor       = {Kaveh Taghipour and Hwee Tou Ng},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 2016 conference on empirical methods in natural language processing},\n\tpages        = {1882--1891}\n}\n@article{tagorti2014rate,\n\ttitle        = {Rate of Convergence and Error Bounds for LSTD ($\\lambda$)},\n\tauthor       = {Tagorti, Manel and Scherrer, Bruno and others},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1405.3229}\n}\n@inproceedings{tagorti2015rate,\n\ttitle        = {On the Rate of Convergence and Error Bounds for {LSTD}($\\lambda$)},\n\tauthor       = {Tagorti, Manel and Scherrer, Bruno},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {1521--1529}\n}\n@inproceedings{tai2015improved,\n\ttitle        = {Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks},\n\tauthor       = {Kai Shen Tai and Richard Socher and Christopher D. Manning},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{tak2005physically,\n\ttitle        = {A physically-based motion retargeting filter},\n\tauthor       = {Tak, Seyoon and Ko, Hyeong-Seok},\n\tyear         = 2005,\n\tmonth        = jan,\n\tjournal      = {ACM Trans. 
Graph.},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tvolume       = 24,\n\tpages        = {98--117},\n\tdoi          = {http://doi.acm.org/10.1145/1037957.1037963},\n\tissn         = {0730-0301},\n\tacmid        = 1037963,\n\tissue        = 1,\n\tkeywords     = {Motion retargeting, animation w/constraints, physically based animation},\n\tnumpages     = 20\n}\n@inproceedings{TakacBRS2013,\n\ttitle        = {Mini-Batch Primal and Dual Methods for SVMs},\n\tauthor       = {Takac, Martin and Bijral, Avleen and Richtarik, Peter and Srebro, Nati},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of The 30th International Conference on Machine Learning},\n\tpages        = {1022--1030}\n}\n@inproceedings{takamatsu2012reducing,\n\ttitle        = {Reducing wrong labels in distant supervision for relation extraction},\n\tauthor       = {Shingo Takamatsu and Issei Sato and Hiroshi Nakagawa},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {721--729}\n}\n@article{talagrand1996majorizing,\n\ttitle        = {Majorizing measures: the generic chaining},\n\tauthor       = {Michel Talagrand},\n\tyear         = 1996,\n\tjournal      = {The Annals of Probability},\n\tpages        = {1049--1103}\n}\n@article{talebi2018variance,\n\ttitle        = {Variance-aware regret bounds for undiscounted reinforcement learning in mdps},\n\tauthor       = {Talebi, Mohammad Sadegh and Maillard, Odalric-Ambrym},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1803.01626}\n}\n@article{taleghan2015pac,\n\ttitle        = {{PAC} Optimal MDP Planning with Application to Invasive Species Management},\n\tauthor       = {Majid Alkaee Taleghan and Thomas G. Dietterich and Mark Crowley and Kim Hall and H. 
Jo Albers},\n\tyear         = 2015,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 16,\n\tpages        = {3877--3903}\n}\n@book{talluri2006theory,\n\ttitle        = {The theory and practice of revenue management},\n\tauthor       = {Talluri, Kalyan T and Van Ryzin, Garrett J},\n\tyear         = 2006,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume       = 68\n}\n@inproceedings{talmor2017evaluating,\n\ttitle        = {Evaluating Semantic Parsing against a Simple Web-based Question Answering Model},\n\tauthor       = {Alon Talmor and Mor Geva and Jonathan Berant},\n\tyear         = 2017,\n\tbooktitle    = {*SEM}\n}\n@article{talmor2018repartitioning,\n\ttitle        = {Repartitioning of the {ComplexWebQuestions} Dataset},\n\tauthor       = {Alon Talmor and Jonathan Berant},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.09623}\n}\n@inproceedings{talmor2018web,\n\ttitle        = {The Web as Knowledge-base for Answering Complex Questions},\n\tauthor       = {Alon Talmor and Jonathan Berant},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{talmor2019commonsenseqa,\n\ttitle        = {CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge},\n\tauthor       = {Alon Talmor and Jonathan Herzig and Nicholas Lourie and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{talmor2019generalization,\n\ttitle        = {Multi{QA}: An Empirical Investigation of Generalization and Transfer in Reading Comprehension},\n\tauthor       = {Alon Talmor and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{talmor2020olmpics,\n\ttitle        = {oLMpics -- On what Language Model Pre-training Captures},\n\tauthor       = {Alon Talmor and Yanai Elazar and Yoav 
Goldberg and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 3\n}\n@article{talmor2020teaching,\n\ttitle        = {Teaching Pre-Trained Models to Systematically Reason Over Implicit Knowledge},\n\tauthor       = {Alon Talmor and Oyvind Tafjord and Peter Clark and Yoav Goldberg and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.06609}\n}\n@inproceedings{talvitie2014model,\n\ttitle        = {Model Regularization for Stable Sample Rollouts},\n\tauthor       = {E. Talvitie},\n\tyear         = 2014,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {780--789}\n}\n@inproceedings{talvitie2015agnostic,\n\ttitle        = {Agnostic System Identification for Monte Carlo Planning},\n\tauthor       = {Erik Talvitie},\n\tyear         = 2015,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {2986--2992}\n}\n@inproceedings{talvitie2017self,\n\ttitle        = {Self-Correcting Models for Model-Based Reinforcement Learning},\n\tauthor       = {E. 
Talvitie},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {2597--2603}\n}\n@inproceedings{tamar2012policy,\n\ttitle        = {Policy gradients with variance related risk criteria},\n\tauthor       = {Tamar, Aviv and Di Castro, Dotan and Mannor, Shie},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 29th International Conference on Machine Learning},\n\tpages        = {1651--1658}\n}\n@inproceedings{tamar2015optimizing,\n\ttitle        = {Optimizing the CVaR via sampling},\n\tauthor       = {Tamar, Aviv and Glassner, Yonatan and Mannor, Shie},\n\tyear         = 2015,\n\tbooktitle    = {Twenty-Ninth AAAI Conference on Artificial Intelligence}\n}\n@article{tamhane1981randomized,\n\ttitle        = {Randomized response techniques for multiple sensitive attributes},\n\tauthor       = {Ajit C Tamhane},\n\tyear         = 1981,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 76,\n\tnumber       = 376,\n\tpages        = {916--923}\n}\n@inproceedings{tan1993multi,\n\ttitle        = {Multi-Agent Reinforcement Learning: Independent vs. 
Cooperative Agents},\n\tauthor       = {Ming Tan},\n\tyear         = 1993,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {330--337}\n}\n@article{tan2015lstm,\n\ttitle        = {{LSTM}-based deep learning models for non-factoid answer selection},\n\tauthor       = {Ming Tan and Cicero dos Santos and Bing Xiang and Bowen Zhou},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.04108}\n}\n@inproceedings{tan2018s,\n\ttitle        = {{S}-{N}et: From answer extraction to answer generation for machine reading comprehension},\n\tauthor       = {Chuanqi Tan and Furu Wei and Nan Yang and Weifeng Lv and Ming Zhou},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{tan2018source,\n\ttitle        = {Source-target inference models for spatial instruction understanding},\n\tauthor       = {Hao Tan and Mohit Bansal},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{tan2019lxmert,\n\ttitle        = {{LXMERT}: Learning Cross-Modality Encoder Representations from Transformers},\n\tauthor       = {Hao Hao Tan and Mohit Bansal},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{tan2020coal,\n\ttitle        = {Class-imbalanced Domain Adaptation: An Empirical Odyssey},\n\tauthor       = {Shuhan Tan and Xingchao Peng and Kate Saenko},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:1910.10320}\n}\n@inproceedings{tang01ilp,\n\ttitle        = {Using multiple clause constructors in inductive logic programming for semantic parsing},\n\tauthor       = {L. R. Tang and R. J. 
Mooney},\n\tyear         = 2001,\n\tbooktitle    = {European Conference on Machine Learning (ECML)},\n\tpages        = {466--477}\n}\n@article{tang2008energy,\n\ttitle        = {\n\t\tEnergy-efficient thermal-aware task scheduling for homogeneous high-performance\n\n\t\tcomputing data centers: A cyber-physical approach\n\t},\n\tauthor       = {Q. Tang and S. K. S. Gupta and G. Varsamopoulos},\n\tyear         = 2008,\n\tjournal      = {IEEE Transactions on Parallel and Distributed Systems},\n\tvolume       = 19,\n\tnumber       = 11,\n\tpages        = {1458--1472}\n}\n@inproceedings{tang2017exploration,\n\ttitle        = {\\#{E}xploration: A study of count-based exploration for deep reinforcement learning},\n\tauthor       = {Tang, Haoran and Houthooft, Rein and Foote, Davis and Stooke, Adam and Chen, Xi and Duan, Yan and Schulman, John and DeTurck, Filip and Abbeel, Pieter},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2753--2762}\n}\n@inproceedings{tang2020long,\n\ttitle        = {Long-Tailed Classification by Keeping the Good and Removing the Bad Momentum Causal Effect},\n\tauthor       = {Tang, Kaihua and Huang, Jianqiang and Zhang, Hanwang},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {1513--1524}\n}\n@article{tanner87da,\n\ttitle        = {The calculation of posterior distributions by data augmentation},\n\tauthor       = {M. A. Tanner and W. H. 
Wong},\n\tyear         = 1987,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 82,\n\tpages        = {528--540}\n}\n@inproceedings{tao2004prediction,\n\ttitle        = {Prediction and indexing of moving objects with unknown motion patterns},\n\tauthor       = {\n\t\tTao, Yufei and Faloutsos, Christos and Papadias, Dimitris and Liu,\n\n\t\tBin\n\t},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tSIGMOD '04: Proceedings of the 2004 ACM SIGMOD international conference\n\n\t\ton Management of data\n\t},\n\tpublisher    = {ACM Press},\n\taddress      = {New York, NY, USA},\n\tpages        = {611--622},\n\tdoi          = {http://dx.doi.org/10.1145/1007568.1007637},\n\tisbn         = 1581138598,\n\tciteulike-article-id = 1053264,\n\tkeywords     = {location-prediction, sota},\n\towner        = {leili},\n\tposted-at    = {2007-01-19 16:27:54},\n\tpriority     = 2,\n\ttimestamp    = {2011.07.28}\n}\n@article{tao2010random,\n\ttitle        = {Random matrices: The distribution of the smallest singular values},\n\tauthor       = {Tao, Terence and Vu, Van},\n\tyear         = 2010,\n\tjournal      = {Geometric And Functional Analysis},\n\tpublisher    = {Springer},\n\tvolume       = 20,\n\tnumber       = 1,\n\tpages        = {260--297}\n}\n@book{tao2012random,\n\ttitle        = {Topics in random matrix theory},\n\tauthor       = {Terence Tao},\n\tyear         = 2012,\n\tpublisher    = {American Mathematical Society}\n}\n@article{taori2020measuring,\n\ttitle        = {Measuring Robustness to Natural Distribution Shifts in Image Classification},\n\tauthor       = {Rohan Taori and Achal Dave and Vaishaal Shankar and Nicholas Carlini and Benjamin Recht and Ludwig Schmidt},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.00644}\n}\n@inproceedings{tapaswi2016movieqa,\n\ttitle        = {Movieqa: Understanding stories in movies through question-answering},\n\tauthor       = {Makarand Tapaswi and Yukun Zhu and Rainer 
Stiefelhagen and Antonio Torralba and Raquel Urtasun and Sanja Fidler},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4631--4640}\n}\n@article{tarbouriech2021stochastic,\n\ttitle        = {Stochastic Shortest Path: Minimax, Parameter-Free and Towards Horizon-Free Regret},\n\tauthor       = {Tarbouriech, Jean and Zhou, Runlong and Du, Simon S and Pirotta, Matteo and Valko, Michal and Lazaric, Alessandro},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.11186}\n}\n@inproceedings{tarvainen2017mean,\n\ttitle        = {Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results},\n\tauthor       = {Antti Tarvainen and Harri Valpola},\n\tyear         = 2017,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1195--1204}\n}\n@inproceedings{taskar03maxmargin,\n\ttitle        = {Max-margin {M}arkov Networks},\n\tauthor       = {Ben Taskar and Carlos Guestrin and Daphne Koller},\n\tyear         = 2003,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{taskar2005learning,\n\ttitle        = {Learning structured prediction models: A large margin approach},\n\tauthor       = {Ben Taskar and Vassil Chatalbashev and Daphne Koller and Carlos Guestrin},\n\tyear         = 2005,\n\tbooktitle    = {Proceedings of the 22nd international conference on Machine learning},\n\tpages        = {896--903}\n}\n@inproceedings{tate2009equality,\n\ttitle        = {Equality Saturation: a New Approach to Optimization},\n\tauthor       = {Ross Tate and Michael Stepp and Zachary Tatlock and Sorin Lerner},\n\tyear         = 2009,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@inproceedings{tatman2017,\n\ttitle        = {Gender and Dialect Bias in {Y}ou{T}ube's Automatic Captions},\n\tauthor       = {Rachael Tatman},\n\tyear         = 2017,\n\tbooktitle  
  = {Workshop on Ethics in Natural Language Processing},\n\tvolume       = 1,\n\tpages        = {53--59}\n}\n@inproceedings{taylor12value,\n\ttitle        = {Value Function Approximation in Noisy Environments Using Locally Smoothed Regularized Approximate Linear Programs},\n\tauthor       = {Gavin Taylor and Ronald Parr},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 28th Conference on Uncertainty in Artificial Intelligence},\n\tpages        = {835--842}\n}\n@article{taylor1953cloze,\n\ttitle        = {``{C}loze procedure'': A new tool for measuring readability},\n\tauthor       = {Wilson L. Taylor},\n\tyear         = 1953,\n\tjournal      = {Journalism Bulletin},\n\tvolume       = 30,\n\tnumber       = 4,\n\tpages        = {415--433}\n}\n@incollection{taylor2007modeling,\n\ttitle        = {Modeling Human Motion Using Binary Latent Variables},\n\tauthor       = {Graham W. Taylor and Geoffrey E. Hinton and Sam T. Roweis},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems 19},\n\tpublisher    = {MIT Press},\n\taddress      = {Cambridge, MA},\n\tpages        = {1345--1352},\n\teditor       = {B. Sch\\\"{o}lkopf and J. Platt and T. Hoffman},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@incollection{taylor2008medical,\n\ttitle        = {Medical robotics and computer-integrated surgery},\n\tauthor       = {Russell H. 
Taylor and Arianna Menciassi and Gabor Fichtinger and Paolo Dario},\n\tyear         = 2008,\n\tbooktitle    = {Springer Handbook of Robotics},\n\tpages        = {1199--1222}\n}\n@inproceedings{taylor2016alignment,\n\ttitle        = {Alignment for Advanced Machine Learning Systems},\n\tauthor       = {Jessica Taylor and Eliezer Yudkowsky and Patrick LaVictoire and Andrew Critch},\n\tyear         = 2016,\n\tbooktitle    = {Ethics of Artificial Intelligence}\n}\n@book{taylor2018americans,\n\ttitle        = {Americans With Disabilities: 2014},\n\tauthor       = {Danielle M Taylor},\n\tyear         = 2018,\n\tpublisher    = {US Census Bureau}\n}\n@inproceedings{taylor2019episodic,\n\ttitle        = {Episodic Learning with Control Lyapunov Functions for Uncertain Robotic Systems},\n\tauthor       = {Taylor, Andrew J and Dorobantu, Victor D and Le, Hoang M and Yue, Yisong and Ames, Aaron D},\n\tyear         = 2019,\n\tbooktitle    = {2019 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},\n\tpages        = {6878--6884},\n\torganization = {IEEE}\n}\n@article{tedrake2010lqrtrees,\n\ttitle        = {{LQR}-Trees: Feedback motion planning via sums of squares verification},\n\tauthor       = {Russ Tedrake and Ian R. Manchester and Mark M. Tobenkin and John W. Roberts},\n\tyear         = 2010,\n\tjournal      = {International Journal of Robotics Research},\n\tvolume       = 29,\n\tpages        = {1038--1052}\n}\n@article{teh06hdp,\n\ttitle        = {Hierarchical {D}irichlet processes},\n\tauthor       = {Y. W. Teh and M. I. Jordan and M. Beal and D. Blei},\n\tyear         = 2006,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 101,\n\tpages        = {1566--1581}\n}\n@inproceedings{teh06pitmanyor,\n\ttitle        = {A Hierarchical {B}ayesian Language Model Based On {P}itman-{Y}or Processes},\n\tauthor       = {Y. W. 
Teh},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Computational Linguistics and Association for Computational Linguistics (COLING/ACL)},\n\tpages        = {985--992}\n}\n@inproceedings{teh07collapsed,\n\ttitle        = {A Collapsed Variational {B}ayesian Inference Algorithm for {L}atent {D}irichlet {A}llocation},\n\tauthor       = {Y. W. Teh and D. Newman and M. Welling},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1353--1360}\n}\n@article{teh2016consistency,\n\ttitle        = {Consistency and fluctuations for stochastic gradient Langevin dynamics},\n\tauthor       = {Teh, Yee Whye and Thiery, Alexandre H and Vollmer, Sebastian J},\n\tyear         = 2016,\n\tjournal      = {The Journal of Machine Learning Research},\n\tpublisher    = {JMLR. org},\n\tvolume       = 17,\n\tnumber       = 1,\n\tpages        = {193--225}\n}\n@article{teigen2015framing,\n\ttitle        = {Framing of Numeric Quantities},\n\tauthor       = {Karl Halvor Teigen},\n\tyear         = 2015,\n\tjournal      = {The Wiley Blackwell Handbook of Judgment and Decision Making},\n\tpages        = {568--589}\n}\n@inproceedings{tellex2009grounding,\n\ttitle        = {Grounding spatial prepositions for video search},\n\tauthor       = {Stefanie Tellex and Deb Roy},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Multimodal Interfaces (ICMI)},\n\tpages        = {253--260}\n}\n@inproceedings{tellex2011understanding,\n\ttitle        = {Understanding Natural Language Commands for Robotic Navigation and Mobile Manipulation},\n\tauthor       = {Stefanie Tellex and Thomas Kollar and Steven Dickerson and Matthew R Walter and Ashis Gopal Banerjee and Seth J Teller and Nicholas Roy},\n\tyear         = 2011,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{tellex2014asking,\n\ttitle        = {Asking for help using inverse 
semantics},\n\tauthor       = {Stefanie Tellex and Ross Knepper and Adrian Li and Daniela Rus and Nicholas Roy},\n\tyear         = 2014,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@article{tellex2020robonlp,\n\ttitle        = {Robots That Use Language},\n\tauthor       = {Stefanie Tellex and Nakul Gopalan and Hadas Kress-Gazit and Cynthia Matuszek},\n\tyear         = 2020,\n\tjournal      = {Annual Review of Control, Robotics, and Autonomous Systems},\n\tvolume       = 3,\n\tnumber       = 1,\n\tpages        = {25--55}\n}\n@article{tellez2018whole,\n\ttitle        = {Whole-slide mitosis detection in H\\&E breast histology using PHH3 as a reference to train distilled stain-invariant convolutional networks},\n\tauthor       = {David Tellez and Maschenka Balkenhol and Irene Otte-H{\\\"o}ller and Rob van de Loo and Rob Vogels and Peter Bult and Carla Wauters and Willem Vreuls and Suzanne Mol and Nico Karssemeijer and others},\n\tyear         = 2018,\n\tjournal      = {IEEE transactions on medical imaging},\n\tvolume       = 37,\n\tnumber       = 9,\n\tpages        = {2126--2136}\n}\n@article{tellez2019quantifying,\n\ttitle        = {Quantifying the effects of data augmentation and stain color normalization in convolutional neural networks for computational pathology},\n\tauthor       = {David Tellez and Geert Litjens and P{\\'e}ter B{\\'a}ndi and Wouter Bulten and John-Melle Bokhorst and Francesco Ciompi and Jeroen van der Laak},\n\tyear         = 2019,\n\tjournal      = {Medical Image Analysis},\n\tvolume       = 58\n}\n@inproceedings{temizer2010collision,\n\ttitle        = {Collision avoidance for unmanned aircraft using {M}arkov decision processes},\n\tauthor       = {Selim Temizer and Mykel J. Kochenderfer and Leslie P. Kaelbling and Tomas Lozano-P{\\'e}rez and James K. 
Kuchar},\n\tyear         = 2010,\n\tbooktitle    = {AIAA Guidance, Navigation, and Control Conference}\n}\n@article{tenenbaum2000global,\n\ttitle        = {A Global Geometric Framework for Nonlinear Dimensionality Reduction},\n\tauthor       = {Joshua B Tenenbaum and Vin De Silva and John C Langford},\n\tyear         = 2000,\n\tjournal      = {Science},\n\tpages        = {2319--2323}\n}\n@inproceedings{teney2018tips,\n\ttitle        = {Tips and Tricks for Visual Question Answering: Learnings from the 2017 Challenge},\n\tauthor       = {Damien Teney and Peter Anderson and Xiaodong He and Anton V. D. Hengel},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4223--4232}\n}\n@article{tenney2019bert,\n\ttitle        = {BERT rediscovers the classical NLP pipeline},\n\tauthor       = {Tenney, Ian and Das, Dipanjan and Pavlick, Ellie},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.05950}\n}\n@article{tenney2019you,\n\ttitle        = {What do you learn from context? probing for sentence structure in contextualized word representations},\n\tauthor       = {Tenney, Ian and Xia, Patrick and Chen, Berlin and Wang, Alex and Poliak, Adam and McCoy, R Thomas and Kim, Najoung and Van Durme, Benjamin and Bowman, Samuel R and Das, Dipanjan and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.06316}\n}\n@inproceedings{tenorth2010knowrob,\n\ttitle        = {{KNOWROB}-{MAP}-knowledge-linked semantic object maps},\n\tauthor       = {M. Tenorth and L. Kunze and D. Jain and M. Beetz},\n\tyear         = 2010,\n\tbooktitle    = {Humanoids}\n}\n@article{tensor_rank_increase,\n\ttitle        = {Subtracting a best rank-1 approximation may increase tensor rank},\n\tauthor       = {A. Stegeman and P. 
Comon},\n\tyear         = 2010,\n\tjournal      = {Linear Algebra and Its Applications},\n\tvolume       = 433,\n\tpages        = {1276--1300}\n}\n@misc{tensorflow2015whitepaper,\n\ttitle        = {{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},\n\tauthor       = {Mart\\'{\\i}n~Abadi and Ashish~Agarwal and Paul~Barham and Eugene~Brevdo and Zhifeng~Chen and Craig~Citro and Greg~S.~Corrado and Andy~Davis and Jeffrey~Dean and Matthieu~Devin and Sanjay~Ghemawat and Ian~Goodfellow and Andrew~Harp and Geoffrey~Irving and Michael~Isard and Yangqing Jia and Rafal~Jozefowicz and Lukasz~Kaiser and Manjunath~Kudlur and Josh~Levenberg and Dandelion~Man\\'{e} and Rajat~Monga and Sherry~Moore and Derek~Murray and Chris~Olah and Mike~Schuster and Jonathon~Shlens and Benoit~Steiner and Ilya~Sutskever and Kunal~Talwar and Paul~Tucker and Vincent~Vanhoucke and Vijay~Vasudevan and Fernanda~Vi\\'{e}gas and Oriol~Vinyals and Pete~Warden and Martin~Wattenberg and Martin~Wicke and Yuan~Yu and Xiaoqiang~Zheng},\n\tyear         = 2015,\n\turl          = {https://www.tensorflow.org/},\n\tnote         = {Software available from tensorflow.org}\n}\n@misc{tensorhard,\n\ttitle        = {Most tensor problems are {NP} hard},\n\tauthor       = {Christopher Hillar and Lek-Heng Lim},\n\tyear         = 2012,\n\tnote         = {arXiv:0911.1393v3},\n\teprint       = {arXiv:0911.1393v3}\n}\n@article{TensorPCA2014,\n\ttitle        = {A statistical model for tensor PCA},\n\tauthor       = {Andrea Montanari and Emile Richard},\n\tyear         = 2014,\n\tmonth        = nov,\n\tjournal      = {arXiv preprint arXiv:1411.1076}\n}\n@article{TenSparsification,\n\ttitle        = {{ Tensor sparsification via a bound on the spectral norm of random tensors}},\n\tauthor       = {N. H. Nguyen and P. Drineas and T. D. 
Tran},\n\tyear         = 2010,\n\tmonth        = may,\n\tjournal      = {arXiv preprint arXiv:1005.4732}\n}\n@article{teshima2020universal,\n\ttitle        = {Universal Approximation Property of Neural Ordinary Differential Equations},\n\tauthor       = {Teshima, Takeshi and Tojo, Koichi and Ikeda, Masahiro and Ishikawa, Isao and Oono, Kenta},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2012.02414}\n}\n@article{tessler2018reward,\n\ttitle        = {Reward constrained policy optimization},\n\tauthor       = {Tessler, Chen and Mankowitz, Daniel J and Mannor, Shie},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.11074}\n}\n@inproceedings{tevet2019evaluating,\n\ttitle        = {Evaluating Text {GAN}s as Language Models},\n\tauthor       = {Guy Tevet and Gavriel Habib and Vered Shwartz and Jonathan Berant},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{tevet2020diversity,\n\ttitle        = {Evaluating the Evaluation of Diversity in Natural Language Generation},\n\tauthor       = {Guy Tevet and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.02990}\n}\n@inproceedings{tewari2008optimistic,\n\ttitle        = {Optimistic linear programming gives logarithmic regret for irreducible {MDPs}},\n\tauthor       = {Tewari, Ambuj and Bartlett, Peter L},\n\tyear         = 2008,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1505--1512}\n}\n@inproceedings{TGMS,\n\ttitle        = {Improved sparse approximation over quasi-incoherent dictionaries},\n\tauthor       = {J. Tropp and A. Gilbert and S. Muthukrishnan and M. Strauss},\n\tyear         = 2003,\n\tbooktitle    = {IEEE International Conf. 
on Image Processing}\n}\n@inproceedings{thagaard2020can,\n\ttitle        = {Can you trust predictive uncertainty under real dataset shifts in digital pathology?},\n\tauthor       = {Jeppe Thagaard and S{\\o}ren Hauberg and Bert van der Vegt and Thomas Ebstrup and Johan D Hansen and Anders B Dahl},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Medical Image Computing and Computer-Assisted Intervention},\n\tpages        = {824--833}\n}\n@article{thananjeyan2021recovery,\n\ttitle        = {Recovery rl: Safe reinforcement learning with learned recovery zones},\n\tauthor       = {Thananjeyan, Brijen and Balakrishna, Ashwin and Nair, Suraj and Luo, Michael and Srinivasan, Krishnan and Hwang, Minho and Gonzalez, Joseph E and Ibarz, Julian and Finn, Chelsea and Goldberg, Ken},\n\tyear         = 2021,\n\tjournal      = {IEEE Robotics and Automation Letters},\n\tpublisher    = {IEEE},\n\tvolume       = 6,\n\tnumber       = 3,\n\tpages        = {4915--4922}\n}\n@inproceedings{thater2010contextualizing,\n\ttitle        = {Contextualizing semantic representations using syntactically enriched vector models},\n\tauthor       = {Thater, Stefan and F{\\\"u}rstenau, Hagen and Pinkal, Manfred},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{theano2016theano,\n\ttitle        = {{Theano: A {Python} framework for fast computation of mathematical expressions}},\n\tauthor       = {{Theano Development Team}},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.02688}\n}\n@article{theis2015note,\n\ttitle        = {A note on the evaluation of generative models},\n\tauthor       = {Lucas Theis and Aaron van den Oord and Matthias Bethge},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.01844}\n}\n@article{theocharous2017posterior,\n\ttitle        = {Posterior sampling for large scale reinforcement learning},\n\tauthor       = {Theocharous, Georgios and Wen, Zheng and 
Abbasi-Yadkori, Yasin and Vlassis, Nikos},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.07979}\n}\n@inproceedings{thibaux07hbp,\n\ttitle        = {Hierarchical {B}eta Processes and the {I}ndian Buffet Process},\n\tauthor       = {Romain Thibaux and Michael I. Jordan},\n\tyear         = 2007,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{thiebaux2006decision,\n\ttitle        = {Decision-theoretic planning with non-Markovian rewards},\n\tauthor       = {Thi{\\'e}baux, Sylvie and Gretton, Charles and Slaney, John and Price, David and Kabanza, Froduald},\n\tyear         = 2006,\n\tjournal      = {Journal of Artificial Intelligence Research},\n\tvolume       = 25,\n\tpages        = {17--74}\n}\n@article{thomas1990assessing,\n\ttitle        = {Assessing influence on predictions from generalized linear models},\n\tauthor       = {William Thomas and R Dennis Cook},\n\tyear         = 1990,\n\tjournal      = {Technometrics},\n\tvolume       = 32,\n\tnumber       = 1,\n\tpages        = {59--65}\n}\n@phdthesis{thomas2015safe,\n\ttitle        = {Safe reinforcement learning},\n\tauthor       = {Thomas, Philip S},\n\tyear         = 2015,\n\tschool       = {University of Massachusetts Libraries}\n}\n@inproceedings{thomason2015learning,\n\ttitle        = {Learning to Interpret Natural Language Commands through Human-Robot Dialog},\n\tauthor       = {Jesse Thomason and Shiqi Zhang and Raymond J. Mooney and Peter Stone},\n\tyear         = 2015,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)}\n}\n@inproceedings{thomason2019improving,\n\ttitle        = {Improving Grounded Natural Language Understanding through Human-Robot Dialog},\n\tauthor       = {Jesse Thomason and Aishwarya Padmakumar and Jivko Sinapov and Nick Walker and Yuqian Jiang and Harel Yedidsion and Justin W. Hart and Peter Stone and Raymond J. 
Mooney},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@article{thompson1933likelihood,\n\ttitle        = {On the likelihood that one unknown probability exceeds another in view of the evidence of two samples},\n\tauthor       = {William R Thompson},\n\tyear         = 1933,\n\tjournal      = {Biometrika},\n\tvolume       = 25,\n\tnumber       = 3,\n\tpages        = {285--294}\n}\n@article{thompson1975rel,\n\ttitle        = {Practical Natural Language Processing: The {REL} System as Prototype},\n\tauthor       = {Frederick B. Thompson and Bozena Henisz Thompson},\n\tyear         = 1975,\n\tjournal      = {Advances in Computers},\n\tvolume       = 13,\n\tpages        = {109--168}\n}\n@inproceedings{thompson97parse,\n\ttitle        = {Learning to Parse Natural Language Database Queries into Logical Form},\n\tauthor       = {Cynthia A. Thompson and Raymond J. Mooney and Lappoon R. Tang},\n\tyear         = 1997,\n\tbooktitle    = {ML-97 Workshop on Automata Induction, Grammatical Inference, and Language Acquisition}\n}\n@article{thon2015links,\n\ttitle        = {Links between multiplicity automata, observable operator models and predictive state representations: a unified learning framework},\n\tauthor       = {Thon, Michael and Jaeger, Herbert},\n\tyear         = 2015,\n\tjournal      = {The Journal of Machine Learning Research}\n}\n@inproceedings{thorne2018fever,\n\ttitle        = {FEVER: a large-scale dataset for Fact Extraction and VERification},\n\tauthor       = {James Thorne and Andreas Vlachos and Christos Christodoulopoulos and Arpit Mittal},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{thota2021cda,\n\ttitle        = {Contrastive Domain Adaptation},\n\tauthor       = {Mamatha Thota and Georgios Leontidis},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@article{thrun1998lifelong,\n\ttitle        = {Lifelong learning 
algorithms},\n\tauthor       = {Sebastian Thrun},\n\tyear         = 1998,\n\tjournal      = {Learning to learn},\n\tpages        = {181--209}\n}\n@book{thrun2012learning,\n\ttitle        = {Learning to learn},\n\tauthor       = {Sebastian Thrun and Lorien Pratt},\n\tyear         = 2012,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@conference{thurau10cikm,\n\ttitle        = {Yes We Can -- Simplex Volume Maximization for Descriptive Web{--}Scale Matrix Factorization},\n\tauthor       = {C. Thurau and K. Kersting and C. Bauckhage},\n\tyear         = 2010,\n\tbooktitle    = {CIKM{--}10}\n}\n@inproceedings{thurau2012deterministic,\n\ttitle        = {Deterministic {CUR} for Improved Large-Scale Data Analysis: An Empirical Study.},\n\tauthor       = {Thurau, Christian and Kersting, Kristian and Bauckhage, Christian},\n\tyear         = 2012,\n\tbooktitle    = {SDM},\n\tpages        = {684--695},\n\torganization = {SIAM}\n}\n@article{tian2017analytical,\n\ttitle        = {An Analytical Formula of Population Gradient for two-layered ReLU network and its Applications in Convergence and Critical Point Analysis},\n\tauthor       = {Tian, Yuandong},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.00560}\n}\n@article{tian2019contrastive,\n\ttitle        = {Contrastive multiview coding},\n\tauthor       = {Tian, Yonglong and Krishnan, Dilip and Isola, Phillip},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.05849}\n}\n@article{tian2020makes,\n\ttitle        = {What makes for good views for contrastive learning},\n\tauthor       = {Tian, Yonglong and Sun, Chen and Poole, Ben and Krishnan, Dilip and Schmid, Cordelia and Isola, Phillip},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2005.10243}\n}\n@article{tian2020posterior,\n\ttitle        = {Posterior re-calibration for imbalanced datasets},\n\tauthor       = {Tian, Junjiao and Liu, Yen-Cheng and Glaser, Nathan and Hsu, 
Yen-Chang and Kira, Zsolt},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.11820}\n}\n@article{tian2020understanding,\n\ttitle        = {Understanding self-supervised learning with dual deep networks},\n\tauthor       = {Tian, Yuandong and Yu, Lantao and Chen, Xinlei and Ganguli, Surya},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.00578}\n}\n@article{tibshirani1996regression,\n\ttitle        = {Regression shrinkage and selection via the lasso},\n\tauthor       = {Tibshirani, Robert},\n\tyear         = 1996,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Methodological)},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 58,\n\tnumber       = 1,\n\tpages        = {267--288}\n}\n@article{tibshirani2014adaptive,\n\ttitle        = {Adaptive piecewise polynomial estimation via trend filtering},\n\tauthor       = {Tibshirani, Ryan J},\n\tyear         = 2014,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 42,\n\tnumber       = 1,\n\tpages        = {285--323},\n\tdate-added   = {2020-06-01 22:32:54 -0400},\n\tdate-modified = {2020-06-01 22:32:54 -0400}\n}\n@article{tiecke2017population,\n\ttitle        = {Mapping the world population one building at a time},\n\tauthor       = {Tobias G. Tiecke and Xianming Liu and Amy Zhang and Andreas Gros and Nan Li and Gregory Yetman and Talip Kilic and Siobhan Murray and Brian Blankespoor and Espen B. Prydz and Hai-Anh H. 
Dang},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{tieleman2012lecture,\n\ttitle        = {Lecture 6.5-rmsprop, coursera: Neural networks for machine learning},\n\tauthor       = {Tieleman, Tijmen and Hinton, Geoffrey},\n\tyear         = 2012,\n\tjournal      = {University of Toronto, Technical Report}\n}\n@article{tierney1994markov,\n\ttitle        = {{M}arkov Chains for Exploring Posterior Distributions},\n\tauthor       = {Tierney, Luke},\n\tyear         = 1994,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics},\n\tvolume       = 22,\n\tnumber       = 4,\n\tpages        = {1701--1728},\n\tdoi          = {10.2307/2242477},\n\tissn         = {00905364},\n\turl          = {http://dx.doi.org/10.2307/2242477},\n\tabstract     = {Several Markov chain methods are available for sampling from a posterior distribution. Two important examples are the Gibbs sampler and the Metropolis algorithm. In addition, several strategies are available for constructing hybrid algorithms. This paper outlines some of the basic methods and strategies and discusses some related theoretical and practical issues. On the theoretical side, results from the theory of general state space Markov chains can be used to obtain convergence rates, laws of large numbers and central limit theorems for estimates obtained from Markov chain methods. These theoretical results can be used to guide the construction of more efficient algorithms. 
For the practical use of Markov chain methods, standard simulation methodology provides several variance reduction techniques and also give guidance on the choice of sample size and allocation.},\n\tciteulike-article-id = 432149,\n\tciteulike-linkout-0 = {http://dx.doi.org/10.2307/2242477},\n\tciteulike-linkout-1 = {http://www.jstor.org/stable/2242477},\n\tkeywords     = {markov-chains, probability, statistics},\n\tposted-at    = {2008-11-30 02:31:25},\n\tpriority     = 2\n}\n@article{Tippet2000,\n\ttitle        = {Conditioning of the Stable, Discrete-Time Lyapunov Operator},\n\tauthor       = {Michael K. Tippett and Stephen E. Cohn and Ricardo Todling and Dan Marchesin},\n\tyear         = 2000,\n\tjournal      = {{SIAM} J. Matrix Analysis Applications},\n\tvolume       = 22,\n\tnumber       = 1,\n\tpages        = {56--65},\n\tdoi          = {10.1137/S0895479899354822},\n\turl          = {http://dx.doi.org/10.1137/S0895479899354822},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/journals/siammax/TippettCTM00},\n\ttimestamp    = {Tue, 21 Jul 2015 18:50:35 +0200},\n\tbdsk-url-1   = {http://dx.doi.org/10.1137/S0895479899354822}\n}\n@article{tipping1999probabilistic,\n\ttitle        = {Probabilistic Principal Component Analysis},\n\tauthor       = {Michael E. Tipping and Chris M. 
Bishop},\n\tyear         = 1999,\n\tjournal      = {Journal of the Royal Statistical Society, Series B},\n\tvolume       = 61,\n\tpages        = {611--622},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@book{titterington1985statistical,\n\ttitle        = {Statistical analysis of finite mixture distributions},\n\tauthor       = {D Michael Titterington and Adrian FM Smith and Udi E Makov},\n\tyear         = 1985,\n\tpublisher    = {Wiley New York},\n\tvolume       = 7\n}\n@article{tiulpin2018automatic,\n\ttitle        = {Automatic knee osteoarthritis diagnosis from plain radiographs: A deep learning-based approach},\n\tauthor       = {Aleksei Tiulpin and J{\\'e}r{\\^o}me Thevenot and Esa Rahtu and Petri Lehenkari and Simo Saarakkala},\n\tyear         = 2018,\n\tjournal      = {Scientific Reports},\n\tvolume       = 8,\n\tnumber       = 1,\n\tpages        = {1--10}\n}\n@article{tjeng2017verifying,\n\ttitle        = {Verifying Neural Networks with Mixed Integer Programming},\n\tauthor       = {Vincent Tjeng and Russ Tedrake},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.07356}\n}\n@book{TMS,\n\ttitle        = {Tensor Methods in Statistics},\n\tauthor       = {P. McCullagh},\n\tyear         = 1987,\n\tpublisher    = {Chapman and Hall}\n}\n@article{tobenkin2011invariant,\n\ttitle        = {Invariant funnels around trajectories using sum-of-squares programming},\n\tauthor       = {Mark M. Tobenkin and Ian R. 
Manchester and Russ Tedrake},\n\tyear         = 2011,\n\tjournal      = {IFAC Proceedings Volumes},\n\tvolume       = 44\n}\n@inproceedings{todorov2012mujoco,\n\ttitle        = {Mujoco: A physics engine for model-based control},\n\tauthor       = {Todorov, Emanuel and Erez, Tom and Tassa, Yuval},\n\tyear         = 2012,\n\tbooktitle    = {2012 IEEE/RSJ International Conference on Intelligent Robots and Systems},\n\tpages        = {5026--5033},\n\torganization = {IEEE}\n}\n@inproceedings{tolstikhin2016minimax,\n\ttitle        = {Minimax Estimation of Maximum Mean Discrepancy With Radial Kernels},\n\tauthor       = {Ilya Tolstikhin and Bharath K. Sriperumbudur and Bernhard Scholkopf},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1930--1938}\n}\n@inproceedings{tomioka2011statistical,\n\ttitle        = {Statistical performance of convex tensor decomposition},\n\tauthor       = {Tomioka, Ryota and Suzuki, Taiji and Hayashi, Kohei and Kashima, Hisashi},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpages        = 137\n}\n@article{tomizawa1971techniques,\n\ttitle        = {On some techniques useful for solution of transportation network problems},\n\tauthor       = {N. 
Tomizawa},\n\tyear         = 1971,\n\tjournal      = {Networks},\n\tvolume       = 1,\n\tnumber       = 2,\n\tpages        = {173--194}\n}\n@inproceedings{toneva2019empirical,\n\ttitle        = {An empirical study of example forgetting during deep neural network learning},\n\tauthor       = {Mariya Toneva and Alessandro Sordoni and Remi Tachet des Combes and Adam Trischler and Yoshua Bengio and Geoffrey J Gordon},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@book{tong1990non,\n\ttitle        = {Non-linear Time Series: {A} Dynamical System Approach},\n\tauthor       = {Howell Tong},\n\tyear         = 1990,\n\tpublisher    = {Clarendon Press},\n\taddress      = {Oxford},\n\tisbn         = 9780198523000,\n\tlccn         = 89029697,\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{tong2000active,\n\ttitle        = {Active learning for parameter estimation in {B}ayesian networks},\n\tauthor       = {Simon Tong and Daphne Koller},\n\tyear         = 2000,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tvolume       = 13,\n\tpages        = {647--653}\n}\n@article{tong2001support,\n\ttitle        = {Support vector machine active learning with applications to text classification},\n\tauthor       = {Simon Tong and Daphne Koller},\n\tyear         = 2001,\n\tjournal      = {Journal of machine learning research},\n\tvolume       = 2,\n\tnumber       = {0},\n\tpages        = {45--66}\n}\n@article{Topic-SCORE,\n\ttitle        = {A new {SVD} approach to optimal topic estimation},\n\tauthor       = {Ke, Zheng Tracy and Wang, Minzhe},\n\tyear         = 2017,\n\tjournal      = {arXiv:1704.07016}\n}\n@article{toplak2014assessment,\n\ttitle        = {Assessment of Machine Learning Reliability Methods for Quantifying the Applicability Domain of {QSAR} Regression Models},\n\tauthor       = {Marko Toplak and Rok Mo\\v{c}nik and Matija Polajnar and Zoran 
Bosni\\'{c} and Lars Carlsson and Catrin Hasselgren and Janez Dem\\v{s}ar and Scott Boyer and Blaz Zupan and Jonna St{\\aa}lring},\n\tyear         = 2014,\n\tjournal      = {Journal of Chemical Information and Modeling},\n\tvolume       = 54\n}\n@inproceedings{toro2018teaching,\n\ttitle        = {Teaching multiple tasks to an RL agent using LTL},\n\tauthor       = {Toro Icarte, Rodrigo and Klassen, Toryn Q and Valenzano, Richard and McIlraith, Sheila A},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the 17th International Conference on Autonomous Agents and MultiAgent Systems},\n\tpages        = {452--461},\n\torganization = {International Foundation for Autonomous Agents and Multiagent Systems}\n}\n@article{torralba2008million,\n\ttitle        = {80 million tiny images: A large data set for nonparametric object and scene recognition},\n\tauthor       = {Antonio Torralba and Rob Fergus and William T Freeman},\n\tyear         = 2008,\n\tjournal      = {IEEE transactions on pattern analysis and machine intelligence},\n\tvolume       = 30,\n\tnumber       = 11,\n\tpages        = {1958--1970}\n}\n@inproceedings{torralba2011unbiased,\n\ttitle        = {Unbiased look at dataset bias},\n\tauthor       = {Antonio Torralba and Alexei A Efros},\n\tyear         = 2011,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {1521--1528}\n}\n@techreport{torrefrade2008guide,\n\ttitle        = {\n\t\tGuide to the Carnegie Mellon University Multimodal Activity (CMU-MMAC)\n\n\t\tDatabase\n\t},\n\tauthor       = {\n\t\tFernando De la Torre Frade and Jessica K. Hodgins and Adam W. Bargteil\n\n\t\tand Xavier Martin Artal and Justin C. 
Macey and Alexandre Collado\n\n\t\tI Castells and Josep Beltran\n\t},\n\tyear         = 2008,\n\tmonth        = apr,\n\taddress      = {Pittsburgh, PA},\n\tnumber       = {CMU-RI-TR-08-22},\n\tinstitution  = {Robotics Institute}\n}\n@inproceedings{tortorella2000optimal,\n\ttitle        = {An optimal reject rule for binary classifiers},\n\tauthor       = {Francesco Tortorella},\n\tyear         = 2000,\n\tbooktitle    = {Advances in Pattern Recognition},\n\tpages        = {611--620}\n}\n@article{tosh2020contrastive,\n\ttitle        = {Contrastive estimation reveals topic posterior information to linear models},\n\tauthor       = {Tosh, Christopher and Krishnamurthy, Akshay and Hsu, Daniel},\n\tyear         = 2020,\n\tjournal      = {arXiv:2003.02234}\n}\n@inproceedings{tosh2021contrastive,\n\ttitle        = {Contrastive learning, multi-view redundancy, and linear models},\n\tauthor       = {Tosh, Christopher and Krishnamurthy, Akshay and Hsu, Daniel},\n\tyear         = 2021,\n\tbooktitle    = {Algorithmic Learning Theory},\n\tpages        = {1179--1206},\n\torganization = {PMLR}\n}\n@inproceedings{toutanova2003tagger,\n\ttitle        = {Feature-Rich Part-of-Speech Tagging with a Cyclic Dependency Network},\n\tauthor       = {Kristina Toutanova and Christopher D. 
Manning},\n\tyear         = 2003,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@inproceedings{toutanova2016dataset,\n\ttitle        = {A Dataset and Evaluation Metrics for Abstractive Compression of Sentences and Short Paragraphs},\n\tauthor       = {Kristina Toutanova and Chris Brockett},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {340--350}\n}\n@inproceedings{tr18,\n\ttitle        = {Least-squares temporal difference learning for the linear quadratic regulator},\n\tauthor       = {Tu, Stephen and Recht, Benjamin},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {arXiv preprint arXiv:1712.08642}\n}\n@inproceedings{traina2000fast,\n\ttitle        = {Fast feature selection using the fractal dimension},\n\tauthor       = {Caetano Traina and Agma Traina and Leejay Wu and Christos Faloutsos},\n\tyear         = 2000,\n\tmonth        = oct,\n\tbooktitle    = {XV Brazilian Symposium on Databases (SBBD)},\n\taddress      = {Paraiba, Brazil},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{tramer2016stealing,\n\ttitle        = {Stealing machine learning models via prediction {API}s},\n\tauthor       = {Florian Tram{\\`e}r and Fan Zhang and Ari Juels and Michael K. 
Reiter and Thomas Ristenpart},\n\tyear         = 2016,\n\tbooktitle    = {USENIX Security}\n}\n@article{tramer2017ensemble,\n\ttitle        = {Ensemble Adversarial Training: Attacks and Defenses},\n\tauthor       = {Florian Tram{\\`e}r and Alexey Kurakin and Nicolas Papernot and Dan Boneh and Patrick McDaniel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.07204}\n}\n@article{TranDinh2015adaptive,\n\ttitle        = {Adaptive Smoothing Algorithms for Nonsmooth Composite Convex Minimization},\n\tauthor       = {Tran-Dinh, Quoc},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1509.00106}\n}\n@article{trapnell2014dynamics,\n\ttitle        = {The dynamics and regulators of cell fate decisions are revealed by pseudotemporal ordering of single cells},\n\tauthor       = {Cole Trapnell and Davide Cacchiarelli and Jonna Grimsby and Prapti Pokharel and Shuqiang Li and Michael Morse and Niall J Lennon and Kenneth J Livak and Tarjei S Mikkelsen and John L Rinn},\n\tyear         = 2014,\n\tjournal      = {Nature Biotechnology},\n\tvolume       = 32,\n\tnumber       = 4\n}\n@inproceedings{traum2008multi,\n\ttitle        = {Multi-party, Multi-issue, Multi-strategy Negotiation for Multi-modal Virtual Agents},\n\tauthor       = {David Traum and Stacy C Marsella and Jonathan Gratch and Jina Lee and Arno Hartholt},\n\tyear         = 2008,\n\tbooktitle    = {International Workshop on Intelligent Virtual Agents},\n\tpages        = {117--130}\n}\n@book{trefethen1997numerical,\n\ttitle        = {Numerical linear algebra},\n\tauthor       = {Trefethen, Lloyd N and Bau III, David},\n\tyear         = 1997,\n\tpublisher    = {Siam},\n\tvolume       = 50\n}\n@book{Trefethen2013,\n\ttitle        = {{Approximation Theory and Approximation Practice}},\n\tauthor       = {Trefethen, Lloyd N.},\n\tyear         = 2013,\n\tpublisher    = {SIAM},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Trefethen - 2013 - Approximation Theory 
and Approximation Practice.pdf:pdf},\n\tmendeley-groups = {Books/Book-Theory,Notes-Tools/Chebyshev,Books/Book-Optimization}\n}\n@article{trejo2015stackelberg,\n\ttitle        = {A Stackelberg security game with random strategies based on the extraproximal theoretic approach},\n\tauthor       = {Trejo, Kristal K and Clempner, Julio B and Poznyak, Alexander S},\n\tyear         = 2015,\n\tjournal      = {Engineering Applications of Artificial Intelligence},\n\tpublisher    = {Elsevier},\n\tvolume       = 37,\n\tpages        = {145--153}\n}\n@article{tretter2006accuracy,\n\ttitle        = {Accuracy of scale conceptions in science: Mental maneuverings across many orders of spatial magnitude},\n\tauthor       = {Thomas R. Tretter and M. Gail Jones and James Minogue},\n\tyear         = 2006,\n\tjournal      = {Journal of Research in Science Teaching},\n\tvolume       = 43,\n\tpages        = {1061--1085}\n}\n@article{Trevisan1998,\n\ttitle        = {{Parallel Approximation Algorithms by Positive Linear Programming}},\n\tauthor       = {Trevisan, Luca},\n\tyear         = 1998,\n\tmonth        = may,\n\tjournal      = {Algorithmica},\n\tvolume       = 21,\n\tnumber       = 1,\n\tpages        = {72--88},\n\tdoi          = {10.1007/PL00009209},\n\tissn         = {0178-4617},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Trevisan - 1998 - Parallel Approximation Algorithms by Positive Linear Programming.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@article{triantafyllopoulos2002moments,\n\ttitle        = {Moments and cumulants of the multivariate real and complex {G}aussian distributions},\n\tauthor       = {Kostas Triantafyllopoulos},\n\tyear         = 2002,\n\tjournal      = {Department of Mathematics, University of Bristol},\n\tvolume       = 12\n}\n@article{tripuraneni2020multitask,\n\ttitle        = {On the Theory of Transfer Learning: The Importance of Task Diversity},\n\tauthor       = {Nilesh Tripuraneni 
and Michael I. Jordan and Chi Jin},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{trischler2017newsqa,\n\ttitle        = {{NewsQA}: A Machine Comprehension Dataset},\n\tauthor       = {Adam Trischler and Tong Wang and Xingdi Yuan and Justin Harris and Alessandro Sordoni and Philip Bachman and Kaheer Suleman},\n\tyear         = 2017,\n\tbooktitle    = {Workshop on Representation Learning for NLP}\n}\n@article{Tropp,\n\ttitle        = {User-friendly tail bounds for sums of random matrices},\n\tauthor       = {Tropp, Joel A},\n\tyear         = 2012,\n\tjournal      = {Foundations of computational mathematics},\n\tpublisher    = {Springer},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {389--434},\n\tdoi          = {10.1007/s10208-011-9099-z},\n\tissn         = {1615-3383},\n\turl          = {http://dx.doi.org/10.1007/s10208-011-9099-z},\n\tabstract     = {This paper presents new probability inequalities for sums of independent, random, self-adjoint matrices. These results place simple and easily verifiable hypotheses on the summands, and they deliver strong conclusions about the large-deviation behavior of the maximum eigenvalue of the sum. Tail bounds for the norm of a sum of random rectangular matrices follow as an immediate corollary. 
The proof techniques also yield some information about matrix-valued martingales.}\n}\n@article{Tropp-book2015,\n\ttitle        = {{An Introduction to Matrix Concentration Inequalities}},\n\tauthor       = {Tropp, Joel A.},\n\tyear         = 2015,\n\tmonth        = jan,\n\tjournal      = {ArXiv e-prints},\n\tpublisher    = {Now Publishers, Inc.},\n\tvolume       = {abs/1501.01571},\n\tnumber       = {1-2},\n\tpages        = {1--230},\n\tarchiveprefix = {arXiv},\n\teprint       = {1501.01571},\n\tprimaryclass = {math.PR},\n\tkeywords     = {Mathematics - Probability, Computer Science - Data Structures and Algorithms, Computer Science - Information Theory, Computer Science - Numerical Analysis, Statistics - Machine Learning, Primary: 60B20.~Secondary: 60F10, 60G50, 60G42},\n\tadsurl       = {http://adsabs.harvard.edu/abs/2015arXiv150101571T},\n\tadsnote      = {Provided by the SAO/NASA Astrophysics Data System}\n}\n@article{tropp:svd,\n\ttitle        = {Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions},\n\tauthor       = {N. Halko and P.-G. Martinsson and J. A. Tropp},\n\tyear         = 2011,\n\tjournal      = {SIAM Review},\n\tvolume       = 53,\n\tnumber       = 2,\n\tpages        = {217--288}\n}\n@misc{tropp2015introduction,\n\ttitle        = {An Introduction to Matrix Concentration Inequalities},\n\tauthor       = {Joel A. Tropp},\n\tyear         = 2015,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = 8,\n\tpages        = {1--230},\n\teprint       = {1501.01571},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {math.PR}\n}\n@article{trunk1979problem,\n\ttitle        = {A Problem of Dimensionality: A Simple Example},\n\tauthor       = {G. V. 
Trunk},\n\tyear         = 1979,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tvolume       = 3,\n\tpages        = {306--307}\n}\n@inproceedings{tsai2010urban,\n\ttitle        = {Urban security: Game-theoretic resource allocation in networked domains},\n\tauthor       = {Tsai, Jason and Yin, Zhengyu and Kwak, Jun-young and Kempe, David and Kiekintveld, Christopher and Tambe, Milind},\n\tyear         = 2010,\n\tbooktitle    = {Twenty-Fourth AAAI Conference on Artificial Intelligence}\n}\n@article{tsai2020self,\n\ttitle        = {Self-supervised learning from a multi-view perspective},\n\tauthor       = {Tsai, Yao-Hung Hubert and Wu, Yue and Salakhutdinov, Ruslan and Morency, Louis-Philippe},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.05576}\n}\n@article{tsai2021mice,\n\ttitle        = {MiCE: Mixture of Contrastive Experts for Unsupervised Image Clustering},\n\tauthor       = {Tsai, Tsung Wei and Li, Chongxuan and Zhu, Jun},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2105.01899}\n}\n@article{tschandl2018ham10000,\n\ttitle        = {The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions},\n\tauthor       = {Tschandl, Philipp and Rosendahl, Cliff and Kittler, Harald},\n\tyear         = 2018,\n\tjournal      = {Scientific data},\n\tpublisher    = {Nature Publishing Group},\n\tvolume       = 5,\n\tnumber       = 1,\n\tpages        = {1--9}\n}\n@techreport{Tse90,\n\ttitle        = {Successive projection under a quasi-cyclic order},\n\tauthor       = {Tseng, Paul},\n\tyear         = 1990,\n\tpublisher    = {Lab. 
for Information and Decision Systems, MIT},\n\tseries       = {LIDS-P-1938},\n\tinstitution  = {DTIC Document},\n\t//address    = {Cambridge, MA, USA}\n}\n@article{tseng1990solving,\n\ttitle        = {Solving H-horizon, stationary Markov decision problems in time proportional to log (H)},\n\tauthor       = {Tseng, Paul},\n\tyear         = 1990,\n\tjournal      = {Operations Research Letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 9,\n\tnumber       = 5,\n\tpages        = {287--297},\n\tdate-added   = {2017-05-19 05:05:08 +0000},\n\tdate-modified = {2017-05-19 05:05:08 +0000}\n}\n@article{tsipras2018there,\n\ttitle        = {There is no free lunch in adversarial robustness (but there are unexpected benefits)},\n\tauthor       = {Dimitris Tsipras and Shibani Santurkar and Logan Engstrom and Alexander Turner and Aleksander Madry},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.12152}\n}\n@inproceedings{tsipras2019robustness,\n\ttitle        = {Robustness may be at odds with accuracy},\n\tauthor       = {Dimitris Tsipras and Shibani Santurkar and Logan Engstrom and Alexander Turner and Aleksander Madry},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{tsirelson1976norms,\n\ttitle        = {Norms of {G}aussian sample functions},\n\tauthor       = {B. S. Tsirelson and I. A. Ibragimov and V. N. 
Sudakov},\n\tyear         = 1976,\n\tbooktitle    = {Proceedings of the Third Japan-USSR Symposium on Probability Theory},\n\tpages        = {20--41}\n}\n@article{tsitsiklis1996feature,\n\ttitle        = {Feature-based methods for large scale dynamic programming},\n\tauthor       = {Tsitsiklis, John N and Van Roy, Benjamin},\n\tyear         = 1996,\n\tjournal      = {Machine Learning},\n\tpublisher    = {Springer},\n\tvolume       = 22,\n\tnumber       = {1-3},\n\tpages        = {59--94}\n}\n@inproceedings{tsitsiklis1997analysis,\n\ttitle        = {Analysis of temporal-difference learning with function approximation},\n\tauthor       = {Tsitsiklis, John N and Van Roy, Benjamin},\n\tyear         = 1997,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1075--1081}\n}\n@book{TSM85,\n\ttitle        = {Statistical analysis of finite mixture distributions},\n\tauthor       = {D. M. Titterington and A. F. M. Smith and U. E. Makov},\n\tyear         = 1985,\n\tpublisher    = {Wiley}\n}\n@article{tsn18,\n\ttitle        = {Tensor Decomposition for Compressing Recurrent Neural Network},\n\tauthor       = {Andros Tjandra and Sakriani Sakti and Satoshi Nakamura},\n\tyear         = 2018,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1802.10410},\n\turl          = {http://arxiv.org/abs/1802.10410}\n}\n@article{tsuchida2017invariance,\n\ttitle        = {Invariance of Weight Distributions in Rectified MLPs},\n\tauthor       = {Tsuchida, Russell and Roosta-Khorasani, Farbod and Gallagher, Marcus},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.09090}\n}\n@article{tsuda2017chemts,\n\ttitle        = {ChemTS: An Efficient {Python} Library for de novo Molecular Generation},\n\tauthor       = {X. Yang and J. Zhang and K. Yoshizoe and K. Terayama and K. 
Tsuda},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@book{tsybakov2008introduction,\n\ttitle        = {Introduction to nonparametric estimation},\n\tauthor       = {Tsybakov, Alexandre B},\n\tyear         = 2008,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@book{tsybakov2009introduction,\n\ttitle        = {Introduction to Nonparametric Estimation},\n\tauthor       = {Alexandre B. Tsybakov},\n\tyear         = 2009,\n\tpublisher    = {Springer}\n}\n@techreport{tu2014practical,\n\ttitle        = {Practical first order methods for large scale semidefinite programming},\n\tauthor       = {Stephen Tu and Jingyan Wang},\n\tyear         = 2014,\n\tinstitution  = {University of California, Berkeley}\n}\n@article{tu2015low,\n\ttitle        = {Low-rank solutions of linear matrix equations via {P}rocrustes flow},\n\tauthor       = {Tu, Stephen and Boczar, Ross and Soltanolkotabi, Mahdi and Recht, Benjamin},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1507.03566},\n\tbooktitle    = {Proceedings of the 33rd International Conference on International Conference on Machine Learning-Volume 48},\n\tpages        = {964--973},\n\tdate-modified = {2016-02-15 19:26:56 +0000},\n\torganization = {JMLR. 
org}\n}\n@inproceedings{tu2016modeling,\n\ttitle        = {Modeling Coverage for Neural Machine Translation},\n\tauthor       = {Zhaopeng Tu and Zhengdong Lu and Yang Liu and Xiaohua Liu and Hang Li},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{tu2018gap,\n\ttitle        = {The gap between model-based and model-free methods on the linear quadratic regulator: An asymptotic viewpoint},\n\tauthor       = {Tu, Stephen and Recht, Benjamin},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1812.03565}\n}\n@article{tu2020empirical,\n\ttitle        = {An empirical study on robustness to spurious correlations using pre-trained language models},\n\tauthor       = {Lifu Tu and Garima Lalwani and Spandana Gella and He He},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 8,\n\tpages        = {621--633}\n}\n@article{tukey1960survey,\n\ttitle        = {A survey of sampling from contaminated distributions},\n\tauthor       = {John W. Tukey},\n\tyear         = 1960,\n\tjournal      = {Contributions to probability and statistics},\n\tvolume       = 2,\n\tpages        = {448--485}\n}\n@inproceedings{tukey1975mathematics,\n\ttitle        = {Mathematics  and  picturing  of data},\n\tauthor       = {John W. 
Tukey},\n\tyear         = 1975,\n\tbooktitle    = {ICM},\n\tvolume       = 6,\n\tpages        = {523--531}\n}\n@article{turchetta2020safe,\n\ttitle        = {Safe reinforcement learning via curriculum induction},\n\tauthor       = {Turchetta, Matteo and Kolobov, Andrey and Shah, Shital and Krause, Andreas and Agarwal, Alekh},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.12136}\n}\n@inproceedings{turian2010word,\n\ttitle        = {Word representations: a simple and general method for semi-supervised learning},\n\tauthor       = {Joseph Turian and Lev Ratinov and Yoshua Bengio},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {384--394}\n}\n@article{turing1950computing,\n\ttitle        = {Computing machinery and intelligence},\n\tauthor       = {Alan M Turing},\n\tyear         = 1950,\n\tjournal      = {Mind},\n\tvolume       = 49,\n\tpages        = {433--460}\n}\n@inproceedings{turner09geo,\n\ttitle        = {Generating approximate geographic descriptions},\n\tauthor       = {Ross Turner and Yaji Sripada and Ehud Reiter},\n\tyear         = 2009,\n\tbooktitle    = {European Workshop on Natural Language Generation},\n\tpages        = {42--49}\n}\n@inproceedings{turner2005supervised,\n\ttitle        = {Supervised and unsupervised learning for sentence compression},\n\tauthor       = {Jenine Turner and Eugene Charniak},\n\tyear         = 2005,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {290--297}\n}\n@article{turney2001mining,\n\ttitle        = {Mining the web for synonyms: {PMI}-{IR} versus {LSA} on {TOEFL}},\n\tauthor       = {Peter Turney},\n\tyear         = 2001,\n\tjournal      = {Machine Learning},\n\tvolume       = 2167,\n\tpages        = {491--502}\n}\n@article{turney2010frequency,\n\ttitle        = {From frequency to meaning: Vector space models of semantics},\n\tauthor       = {Turney, Peter D. 
and Pantel, Patrick},\n\tyear         = 2010,\n\tjournal      = {Journal of Artificial Intelligence Research}\n}\n@article{tversky1986nearest,\n\ttitle        = {Nearest neighbor analysis of psychological spaces},\n\tauthor       = {Amos Tversky and J. Wesley Hutchinson},\n\tyear         = 1986,\n\tjournal      = {Psychological review},\n\tpages        = {1--3}\n}\n@article{Tzen2019NeuralSD,\n\ttitle        = {Neural Stochastic Differential Equations: Deep Latent Gaussian Models in the Diffusion Limit},\n\tauthor       = {Belinda Tzen and M. Raginsky},\n\tyear         = 2019,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1905.09883}\n}\n@article{tzeng2014domain,\n\ttitle        = {Deep Domain Confusion: Maximizing for Domain Invariance},\n\tauthor       = {Eric Tzeng and Judy Hoffman and Ning Zhang and Kate Saenko and Trevor Darrell},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.3474}\n}\n@inproceedings{tzeng2017domain,\n\ttitle        = {Adversarial Discriminative Domain Adaptation},\n\tauthor       = {Eric Tzeng and Judy Hoffman and Kate Saenko and Trevor Darrell},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@misc{tzutalin2015labelimg,\n\ttitle        = {LabelImg},\n\tauthor       = {Tzutalin},\n\tyear         = 2015,\n\thowpublished = {\\url{https://github.com/tzutalin/labelImg}}\n}\n@misc{uci,\n\ttitle        = {{UCI} Machine Learning Repository},\n\tauthor       = {M. 
Lichman},\n\tyear         = 2013,\n\turl          = {http://archive.ics.uci.edu/ml},\n\tinstitution  = {University of California, Irvine, School of Information and Computer Sciences}\n}\n@inproceedings{uesato2019are,\n\ttitle        = {Are Labels Required for Improving Adversarial Robustness?},\n\tauthor       = {Jonathan Uesato and Jean-Baptiste Alayrac and Po-Sen Huang and Robert Stanforth and Alhussein Fawzi and Pushmeet Kohli},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{uesato2019rigorous,\n\ttitle        = {Rigorous Agent Evaluation: An Adversarial Approach to Uncover Catastrophic Failures},\n\tauthor       = {Jonathan Uesato and Ananya Kumar and Csaba Szepesvari and Tom Erez and Avraham Ruderman and Keith Anderson and Krishmamurthy (Dj) Dvijotham and Nicolas Heess and Pushmeet Kohli},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{ullman1985implementation,\n\ttitle        = {Implementation of logical query languages for databases},\n\tauthor       = {Jeffrey D Ullman},\n\tyear         = 1985,\n\tjournal      = {ACM Transactions on Database Systems (TODS)},\n\tvolume       = 10,\n\tnumber       = 3,\n\tpages        = {289--321}\n}\n@inproceedings{ullman2016pragmatics,\n\ttitle        = {The Pragmatics of Spatial Language},\n\tauthor       = {Tomer D Ullman and Yang Xu and Noah D Goodman},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the 38th Annual Conference of the Cognitive Science Society}\n}\n@inproceedings{ullman2018physics,\n\ttitle        = {Learning physical parameters from dynamic scenes},\n\tauthor       = {Tomer D. Ullman and Andreas Stuhmuller and Noah D. Goodman and Joshua B. Tenenbaum},\n\tyear         = 2018,\n\tbooktitle    = {Cognitive Psychology}\n}\n@book{ullmann1962semantics,\n\ttitle        = {Semantics: An Introduction to the Science of Meaning},\n\tauthor       = {S. 
Ullmann},\n\tyear         = 1962,\n\tpublisher    = {W. de Gruyter}\n}\n@inproceedings{ungar2012judgement,\n\ttitle        = {The Good Judgment Project: A Large Scale Test of Different Methods of Combining Expert Predictions},\n\tauthor       = {Lyle Ungar and Barb Mellors and Ville Satopää and Jon Baron and Phil Tetlock and Jaime Ramos and Sam Swift},\n\tyear         = 2012,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{unger2011pythia,\n\ttitle        = {Pythia: compositional meaning construction for ontology-based question answering on the semantic web},\n\tauthor       = {Christina Unger and Philipp Cimiano},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the 16th international conference on Natural language processing and information systems},\n\tpages        = {153--160}\n}\n@inproceedings{unger2012template,\n\ttitle        = {Template-based question answering over {RDF} data},\n\tauthor       = {Christina Unger and Lorenz Bühmann and Jens Lehmann and Axel-Cyrille Ngonga and Daniel Gerber and Philipp Cimiano},\n\tyear         = 2012,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {639--648}\n}\n@inproceedings{unmixing,\n\ttitle        = {Identifiability and unmixing of latent parse trees},\n\tauthor       = {D. Hsu and S. Kakade and P. Liang},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems 25}\n}\n@article{uno1989formation,\n\ttitle        = {\n\t\tFormation and control of optimal trajectory in human multijoint arm\n\n\t\tmovement\n\t},\n\tauthor       = {Uno, Y. and Kawato, M. and Suzuki, R.},\n\tyear         = 1989,\n\tmonth        = jun,\n\tjournal      = {Biological Cybernetics},\n\tvolume       = 61,\n\tnumber       = 2,\n\tpages        = {89--101},\n\tdoi          = {10.1007/BF00204593},\n\tabstract     = {\n\t\tIn this paper, we study trajectory planning and control in voluntary,\n\n\t\thuman arm movements. 
When a hand is moved to a target, the central\n\n\t\tnervous system must select one specific trajectory among an infinite\n\n\t\tnumber of possible trajectories that lead to the target position.\n\n\t\tFirst, we discuss what criterion is adopted for trajectory determination.\n\n\t\tSeveral researchers measured the hand trajectories of skilled movements\n\n\t\tand found common invariant features. For example, when moving the\n\n\t\thand between a pair of targets, subjects tended to generate roughly\n\n\t\tstraight hand paths with bell-shaped speed profiles. On the basis\n\n\t\tof these observations and dynamic optimization theory, we propose\n\n\t\ta mathematical model which accounts for formation of hand trajectories.\n\n\t\tThis model is formulated by defining an objective function, a measure\n\n\t\tof performance for any possible movement: square of the rate of change\n\n\t\tof torque integrated over the entire movement. That is, the objective\n\n\t\tfunction CT is defined as follows: \\$\\$C\\_T = \\frac{1}{2}{}^t\\int\\limits\\_0^f\n\n\t\t{\\sum\\limits\\_{i = 1}^n {\\left( {\\frac{{{\\text{d}}z\\_i }}{{{\\text{d}}t}}}\n\n\t\t\\right)^2 {\\text{d}}t,} } \\$\\$ where z\_i is the torque generated by\n\n\t\tthe i-th actuator (muscle) out of n actuators, and t\_f is the movement\n\n\t\ttime. Since this objective function critically depends on the complex\n\n\t\tnonlinear dynamics of the musculoskeletal system, it is very difficult\n\n\t\tto determine the unique trajectory which yields the best performance.\n\n\t\tWe overcome this difficulty by developing an iterative scheme, with\n\n\t\twhich the optimal trajectory and the associated motor command are\n\n\t\tsimultaneously computed. 
To evaluate our model, human hand trajectories\n\n\t\twere experimentally measured under various behavioral situations.\n\n\t\tThese results supported the idea that the human hand trajectory is\n\n\t\tplanned and controlled in accordance with the minimum torquechange\n\n\t\tcriterion.\n\t},\n\tciteulike-article-id = 2270940,\n\tkeywords     = {movement},\n\tmyurl        = {http://dx.doi.org/10.1007/BF00204593},\n\tpriority     = 2\n}\n@book{uryasev2013stochastic,\n\ttitle        = {Stochastic optimization: algorithms and applications},\n\tauthor       = {Uryasev, Stanislav and Pardalos, Panos M},\n\tyear         = 2013,\n\tpublisher    = {Springer Science \\& Business Media},\n\tvolume       = 54\n}\n@article{uschmajew2012local,\n\ttitle        = {Local convergence of the alternating least squares algorithm for canonical tensor approximation},\n\tauthor       = {Uschmajew, Andr{\\'e}},\n\tyear         = 2012,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tpublisher    = {SIAM},\n\tvolume       = 33,\n\tnumber       = 2,\n\tpages        = {639--652}\n}\n@inproceedings{ustun2019fairness,\n\ttitle        = {Fairness without harm: Decoupled classifiers with preference guarantees},\n\tauthor       = {Berk Ustun and Yang Liu and David Parkes},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {6373--6382}\n}\n@article{utama2020towards,\n\ttitle        = {Towards debiasing {NLU} models from unknown biases},\n\tauthor       = {Prasetya Ajie Utama and Nafise Sadat Moosavi and Iryna Gurevych},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.12303}\n}\n@article{uyumazturk2019deep,\n\ttitle        = {Deep Learning for the Digital Pathologic Diagnosis of Cholangiocarcinoma and Hepatocellular Carcinoma: Evaluating the Impact of a Web-based Diagnostic Assistant},\n\tauthor       = {Bora Uyumazturk and Amirhossein Kiani and Pranav Rajpurkar and Alex Wang and Robyn L. 
Ball and Rebecca Gao and Yifan Yu and Erik Jones and Curtis P. Langlotz and Brock Martin and Gerald J. Berry and Michael G. Ozawa and Florette K. Hazard and Ryanne A. Brown and Simon B. Chen and Mona Wood and Libby S. Allard and Lourdes Ylagan and Andrew Y. Ng and Jeanne Shen},\n\tyear         = 2019,\n\tjournal      = {arXiv}\n}\n@inproceedings{uzkent2020zoom,\n\ttitle        = {Learning When and Where to Zoom with Deep Reinforcement Learning},\n\tauthor       = {Burak Uzkent and Stefano Ermon},\n\tyear         = 2020,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@book{vaart98asymptotic,\n\ttitle        = {Asymptotic statistics},\n\tauthor       = {A. W. van der Vaart},\n\tyear         = 1998,\n\tpublisher    = {Cambridge University Press}\n}\n@inproceedings{vadas2005programming,\n\ttitle        = {Programming With Unrestricted Natural Language},\n\tauthor       = {David Vadas and James R. Curran},\n\tyear         = 2005,\n\tbooktitle    = {Australasian Language Technology Workshop (ALTA)}\n}\n@inproceedings{vaicenavicius2019calibration,\n\ttitle        = {Evaluating model calibration in classification},\n\tauthor       = {Juozas Vaicenavicius and David Widmann and Carl Andersson and Fredrik Lindsten and Jacob Roll and  Thomas B. 
Schön},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{vaish2014twitch,\n\ttitle        = {Twitch crowdsourcing: crowd contributions in short bursts of time},\n\tauthor       = {Rajan Vaish and Keith Wyngarden and Jingshu Chen and Brandon Cheung and Michael S Bernstein},\n\tyear         = 2014,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)},\n\tpages        = {3645--3654}\n}\n@article{valcarcel2015distributed,\n\ttitle        = {Distributed policy evaluation under multiple behavior strategies},\n\tauthor       = {Valcarcel Macua, Sergio and Chen, Jianshu and Zazo, Santiago and Sayed, Ali H},\n\tyear         = 2015,\n\tjournal      = {Automatic Control, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 60,\n\tnumber       = 5,\n\tpages        = {1260--1274}\n}\n@article{valiant84learnable,\n\ttitle        = {A theory of the learnable},\n\tauthor       = {Leslie Valiant},\n\tyear         = 1984,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 27,\n\tnumber       = 11,\n\tpages        = {1134--1142}\n}\n@inproceedings{valitutti2013adult,\n\ttitle        = {``Let Everything Turn Well in Your Wife: Generation of Adult Humor Using Lexical Constraints},\n\tauthor       = {Alessandro Valitutti and Hannu Toivonen and Antoine Doucet and Jukka M. 
Toivanen},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{valko2013finite,\n\ttitle        = {Finite-time analysis of kernelised contextual bandits},\n\tauthor       = {Valko, Michal and Korda, Nathan and Munos, R{\\'e}mi and Flaounas, Ilias and Cristianini, Nello},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence},\n\tpages        = {654--663}\n}\n@article{van1983estimation,\n\ttitle        = {On the estimation of the parameters of {M}arkov probability models using macro data},\n\tauthor       = {Adriaan P Van Der Plas},\n\tyear         = 1983,\n\tjournal      = {Annals of Statistics},\n\tvolume       = 1,\n\tpages        = {78--85}\n}\n@article{van1985computing,\n\ttitle        = {Computing the CS and the generalized singular value decompositions},\n\tauthor       = {Van Loan, Charles},\n\tyear         = 1985,\n\tjournal      = {Numerische Mathematik},\n\tpublisher    = {Springer},\n\tvolume       = 46,\n\tnumber       = 4,\n\tpages        = {479--491}\n}\n@article{van2003questioning,\n\ttitle        = {Questioning to resolve decision problems},\n\tauthor       = {Robert Van Rooy},\n\tyear         = 2003,\n\tjournal      = {Linguistics and Philosophy},\n\tvolume       = 26,\n\tnumber       = 6,\n\tpages        = {727--763}\n}\n@article{van2006performance,\n\ttitle        = {Performance loss bounds for approximate value iteration with state aggregation},\n\tauthor       = {Van Roy, Benjamin},\n\tyear         = 2006,\n\tjournal      = {Math. Oper. 
Res.},\n\tvolume       = 31,\n\tnumber       = 2,\n\tpages        = {234--244},\n\tissn         = {0364-765X},\n\turl          = {https://doi.org/10.1287/moor.1060.0188},\n\tfjournal     = {Mathematics of Operations Research},\n\tmrclass      = {90C39 (60A10 90C40)},\n\tmrnumber     = 2233994\n}\n@inproceedings{van2012effect,\n\ttitle        = {Confounding and effect modification: distribution and measure},\n\tauthor       = {Tyler J. VanderWeele},\n\tyear         = 2012,\n\tbooktitle    = {Epidemiologic Methods}\n}\n@inproceedings{van2013confounder,\n\ttitle        = {On the definition of a confounder},\n\tauthor       = {Tyler J. VanderWeele and Ilya Shpitser},\n\tyear         = 2013,\n\tbooktitle    = {Annals of Statistics}\n}\n@article{van2014transfer,\n\ttitle        = {Transfer learning improves supervised image segmentation across imaging protocols},\n\tauthor       = {Van Opbroek, Annegreet and Ikram, M Arfan and Vernooij, Meike W and De Bruijne, Marleen},\n\tyear         = 2014,\n\tjournal      = {IEEE transactions on medical imaging},\n\tpublisher    = {IEEE},\n\tvolume       = 34,\n\tnumber       = 5,\n\tpages        = {1018--1030}\n}\n@book{van2015causal,\n\ttitle        = {Explanation in Causal Inference: Methods for Mediation and Interaction},\n\tauthor       = {Tyler J. 
VanderWeele},\n\tyear         = 2015,\n\tpublisher    = {Oxford University Press}\n}\n@inproceedings{van2016deep,\n\ttitle        = {Deep Reinforcement Learning with Double {Q}-Learning},\n\tauthor       = {Hado van Hasselt and Arthur Guez and David Silver},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 16,\n\tpages        = {2094--2100}\n}\n@inproceedings{van2018inaturalist,\n\ttitle        = {The inaturalist species classification and detection dataset},\n\tauthor       = {Van Horn, Grant and Mac Aodha, Oisin and Song, Yang and Cui, Yin and Sun, Chen and Shepard, Alex and Adam, Hartwig and Perona, Pietro and Belongie, Serge},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the IEEE conference on computer vision and pattern recognition},\n\tpages        = {8769--8778}\n}\n@article{van2019comments,\n\ttitle        = {Comments on the {D}u-{K}akade-{W}ang-{Y}ang Lower Bounds},\n\tauthor       = {Van Roy, Benjamin and Dong, Shi},\n\tyear         = 2019,\n\tjournal      = {arXiv:1911.07910}\n}\n@misc{vanbriesenchlorine,\n\ttitle        = {Chlorine levels data},\n\tauthor       = {Jeanne M. 
VanBriesen},\n\turl          = {http://www.cs.cmu.edu/afs/cs/project/spirit-1/www/}\n}\n@inproceedings{vanhalteren2003factoid,\n\ttitle        = {Examining the consensus between human summaries: initial experiments with factoid analysis},\n\tauthor       = {Hans Van Halteren and Simone Teufel},\n\tyear         = 2003,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {57--64}\n}\n@article{vanlehn1987version,\n\ttitle        = {A version space approach to learning context-free grammars},\n\tauthor       = {Kurt Vanlehn and William Ball},\n\tyear         = 1987,\n\tjournal      = {Machine learning},\n\tvolume       = 2,\n\tnumber       = 1,\n\tpages        = {39--74}\n}\n@inproceedings{vannella2014validating,\n\ttitle        = {Validating and Extending Semantic Knowledge Bases using Video Games with a Purpose},\n\tauthor       = {Daniele Vannella and David Jurgens and Daniele Scarfini and Domenico Toscani and Roberto Navigli},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1294--1304}\n}\n@book{vapnik00nature,\n\ttitle        = {The Nature of Statistical Learning Theory},\n\tauthor       = {Vladimir N. 
Vapnik},\n\tyear         = 2000,\n\tpublisher    = {Springer-Verlag}\n}\n@article{vapnik1971uniform,\n\ttitle        = {On the uniform convergence of relative frequencies of events to their probabilities},\n\tauthor       = {Vapnik, Vladimir N and Chervonenkis, A Ya},\n\tyear         = 1971,\n\tjournal      = {Theory of Probability \\& Its Applications},\n\tbooktitle    = {Measures of complexity},\n\tpublisher    = {SIAM},\n\tvolume       = 16,\n\tnumber       = 2,\n\tpages        = {264--280},\n\tdoi          = {10.1137/1116025},\n\towner        = {rongge},\n\ttimestamp    = {2013.10.04},\n\tkeywords     = {machine learning, statistics, stl},\n\tposted-at    = {2009-10-11 23:35:59},\n\tpriority     = 2\n}\n@inproceedings{vapnik1992principles,\n\ttitle        = {Principles of risk minimization for learning theory},\n\tauthor       = {Vladimir Vapnik},\n\tyear         = 1992,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {831--838}\n}\n@article{varin08composite,\n\ttitle        = {On composite marginal likelihoods},\n\tauthor       = {Cristiano Varin},\n\tyear         = 2008,\n\tjournal      = {Advances in Statistical Analysis},\n\tvolume       = 92,\n\tpages        = {1--28}\n}\n@article{varma2017socratic,\n\ttitle        = {Socratic Learning: Augmenting Generative Models to Incorporate Latent Subsets in Training Data},\n\tauthor       = {Paroma Varma and Bryan He and Dan Iter and Peng Xu and Rose Yu and C De Sa and C R\\'{e}},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1610.08123}\n}\n@inproceedings{varshney2011risk,\n\ttitle        = {A risk bound for ensemble classification with a reject option},\n\tauthor       = {K. R. 
Varshney},\n\tyear         = 2011,\n\tbooktitle    = {2011 IEEE Statistical Signal Processing Workshop (SSP)}\n}\n@inproceedings{vasic2019neural,\n\ttitle        = {Neural Program Repair by Jointly Learning to Localize and Repair},\n\tauthor       = {Vasic, Marko and Kanade, Aditya and Maniatis, Petros and Bieber, David and Singh, Rishabh},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{vasilescu2003multilinear,\n\ttitle        = {Multilinear subspace analysis of image ensembles},\n\tauthor       = {M. A. O. Vasilescu and D. Terzopoulos},\n\tyear         = 2003,\n\tbooktitle    = {Computer Vision and Pattern Recognition, 2003. Proceedings. 2003 IEEE Computer Society Conference on},\n\tvolume       = 2,\n\tpages        = {II--93},\n\torganization = {IEEE}\n}\n@inproceedings{vasilescu2005multilinear,\n\ttitle        = {Multilinear independent components analysis},\n\tauthor       = {M Alex O Vasilescu and Demetri Terzopoulos},\n\tyear         = 2005,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tvolume       = 1,\n\tpages        = {547--553}\n}\n@inproceedings{vaskevicius2019implicit,\n\ttitle        = {Implicit Regularization for Optimal Sparse Recovery},\n\tauthor       = {Vaskevicius, Tomas and Kanade, Varun and Rebeschini, Patrick},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2968--2979}\n}\n@inproceedings{vassiliadis1993input,\n\ttitle        = {\n\t\tThe Input-State Space Approach to the Prediction of Auroral Geomagnetic\n\n\t\tActivity from Solar Wind Variables\n\t},\n\tauthor       = {Dimitris Vassiliadis},\n\tyear         = 1993,\n\tmonth        = sep,\n\tbooktitle    = {\n\t\tInt. 
Workshop on Applications of Artificial Intelligence in Solar\n\n\t\tTerrestrial Physics\n\t},\n\taddress      = {Lund, Sweden},\n\tkeywords     = {time series},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{vaswani2013decoding,\n\ttitle        = {Decoding with Large-Scale Neural Language Models Improves Translation},\n\tauthor       = {Ashish Vaswani and Yinggong Zhao and Victoria Fossum and David Chiang},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1387--1392}\n}\n@article{vaswani2017attention,\n\ttitle        = {Attention is all you need},\n\tauthor       = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, Lukasz and Polosukhin, Illia},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.03762}\n}\n@inproceedings{vaswani2019fast,\n\ttitle        = {Fast and Faster Convergence of SGD for Over-Parameterized Models and an Accelerated Perceptron},\n\tauthor       = {Vaswani, Sharan and Bach, Francis and Schmidt, Mark},\n\tyear         = 2019,\n\tbooktitle    = {The 22nd International Conference on Artificial Intelligence and Statistics},\n\tpages        = {1195--1204}\n}\n@article{Vav,\n\ttitle        = {On the complexity of nonnegative matrix factorization},\n\tauthor       = {S. Vavasis},\n\tyear         = 2009,\n\tjournal      = {SIAM Journal on Optimization},\n\tpages        = {1364--1377}\n}\n@article{Vavasis,\n\ttitle        = {On the Complexity of Nonnegative Matrix Factorization},\n\tauthor       = {Vavasis, Stephen A.},\n\tyear         = 2009,\n\tmonth        = oct,\n\tjournal      = {SIAM J. 
on Optimization},\n\tpublisher    = {Society for Industrial and Applied Mathematics},\n\taddress      = {Philadelphia, PA, USA},\n\tvolume       = 20,\n\tnumber       = 3,\n\tpages        = {1364--1377},\n\tdoi          = {10.1137/070709967},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/070709967},\n\tissue_date   = {August 2009},\n\tnumpages     = 14,\n\tacmid        = 1898406,\n\tkeywords     = {NP-hard, complexity, data mining, feature detection, nonnegative matrix factorization, nonnegative rank}\n}\n@inproceedings{vavilapalli2013apache,\n\ttitle        = {Apache hadoop yarn: Yet another resource negotiator},\n\tauthor       = {Vavilapalli, Vinod Kumar and Murthy, Arun C and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and others},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 4th annual Symposium on Cloud Computing},\n\tpages        = 5,\n\torganization = {ACM}\n}\n@article{VCA,\n\ttitle        = {Vertex Component Analysis: A Fast Algorithm to Unmix Hyperspectral Data},\n\tauthor       = {J.M. P. Nascimento and J. M. B. Dias},\n\tyear         = 2004,\n\tjournal      = {IEEE TRANS. GEOSCI. REM. 
SENS},\n\tvolume       = 43,\n\tpages        = {898--910}\n}\n@article{veale2004incongruity,\n\ttitle        = {Incongruity in humor: Root cause or epiphenomenon?},\n\tauthor       = {Veale, Tony},\n\tyear         = 2004,\n\tjournal      = {Humor: International Journal of Humor Research},\n\tvolume       = 17\n}\n@article{veatch1996scheduling,\n\ttitle        = {Scheduling a make-to-stock queue: Index policies and hedging points},\n\tauthor       = {Veatch, Michael H and Wein, Lawrence M},\n\tyear         = 1996,\n\tjournal      = {Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 44,\n\tnumber       = 4,\n\tpages        = {634--647}\n}\n@article{veatch2013approximate,\n\ttitle        = {Approximate linear programming for average cost MDPs},\n\tauthor       = {Veatch, Michael H},\n\tyear         = 2013,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 38,\n\tnumber       = 3,\n\tpages        = {535--544}\n}\n@article{vecerik2017leveraging,\n\ttitle        = {Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards},\n\tauthor       = {M. Vecerik and T. Hester and J. Scholz and F. Wang and O. Pietquin and B. Piot and N. Heess and T. Rothorl and T. Lampe and M. Riedmiller},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.08817}\n}\n@inproceedings{vedantam2015cider,\n\ttitle        = {{CIDEr}: Consensus-based image description evaluation},\n\tauthor       = {Ramakrishna Vedantam and C. 
Lawrence Zitnick and Devi Parikh},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4566--4575}\n}\n@inproceedings{veeling2018rotation,\n\ttitle        = {Rotation equivariant CNNs for digital pathology},\n\tauthor       = {Bastiaan S Veeling and Jasper Linmans and Jim Winkens and Taco Cohen and Max Welling},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Medical image computing and computer-assisted intervention},\n\tpages        = {210--218}\n}\n@inproceedings{vempala2002spectral,\n\ttitle        = {A spectral algorithm for learning mixture models},\n\tauthor       = {Santosh Vempala and Grant Wang},\n\tyear         = 2002,\n\tbooktitle    = {Foundations of Computer Science (FOCS)}\n}\n@article{vempala2011structure,\n\ttitle        = {Structure from local optima: Learning subspace juntas via higher order PCA},\n\tauthor       = {Vempala, Santosh S and Xiao, Ying},\n\tyear         = 2011,\n\tjournal      = {arXiv preprint arXiv:1108.3329}\n}\n@inproceedings{VempalaWang:GaussianMixture,\n\ttitle        = {A spectral algorithm for learning mixtures of distributions},\n\tauthor       = {S. Vempala and G. 
Wang},\n\tyear         = 2002,\n\tbooktitle    = {FOCS}\n}\n@article{VempalaXiao,\n\ttitle        = {Structure from Local Optima: Learning Subspace Juntas via Higher Order PCA},\n\tauthor       = {Santosh Vempala and Ying Xiao},\n\tyear         = 2011,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1108.3329},\n\tee           = {http://arxiv.org/abs/1108.3329},\n\tbibsource    = {DBLP, http://dblp.uni-trier.de}\n}\n@inproceedings{venetis2011recovering,\n\ttitle        = {Recovering semantics of tables on the web},\n\tauthor       = {Petros Venetis and Alon Halevy and Jayant Madhavan and Marius Pa{\\c{s}}ca and Warren Shen and Fei Wu and Gengxin Miao and Chung Wu},\n\tyear         = 2011,\n\tbooktitle    = {Very Large Data Bases (VLDB)},\n\tvolume       = 4,\n\tpages        = {528--538}\n}\n@article{venkataraman01word,\n\ttitle        = {A statistical model for word discovery in transcribed speech},\n\tauthor       = {A. Venkataraman},\n\tyear         = 2001,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 27,\n\tpages        = {351--372}\n}\n@inproceedings{venkateswara2017deep,\n\ttitle        = {Deep hashing network for unsupervised domain adaptation},\n\tauthor       = {Hemanth Venkateswara and Jose Eusebio and Shayok Chakraborty and Sethuraman Panchanathan},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {5018--5027}\n}\n@inproceedings{venkatraman2015improving,\n\ttitle        = {Improving Multi-Step Prediction of Learned Time Series Models},\n\tauthor       = {Arun Venkatraman and Martial Hebert and J Andrew Bagnell},\n\tyear         = 2015,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {3024--3030}\n}\n@article{venturi2018neural,\n\ttitle        = {Neural Networks with Finite Intrinsic Dimension have no Spurious Valleys},\n\tauthor       = {Venturi, Luca and Bandeira, Afonso and Bruna, Joan},\n\tyear         = 
2018,\n\tjournal      = {arXiv preprint arXiv:1802.06384}\n}\n@article{verbeke1997linear,\n\ttitle        = {Linear mixed models for longitudinal data},\n\tauthor       = {Geert Verbeke},\n\tyear         = 1997,\n\tjournal      = {Linear Mixed Models in Practice},\n\tpages        = {63--153}\n}\n@article{vergara2012Chemical,\n\ttitle        = {Chemical gas sensor drift compensation using classifier ensembles},\n\tauthor       = {Alexander Vergara and Shankar Vembu and Tuba Ayhan and Margaret A. Ryan and Margie L. Homer and Ramón Huerta},\n\tyear         = 2012,\n\tjournal      = {Journal of the American Statistical Association},\n\tvolume       = {-1},\n\tpages        = {320--329}\n}\n@article{vermaak2005monte,\n\ttitle        = {{M}onte {C}arlo filtering for multi-target tracking and data association},\n\tauthor       = {Jaco Vermaak and Simon J. Godsill and Patrick Perez},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Aerospace and Electronic Systems},\n\tvolume       = 41,\n\tpages        = {309--332}\n}\n@incollection{Vershynin12,\n\ttitle        = {Introduction to the non-asymptotic analysis of random matrices},\n\tauthor       = {R. Vershynin},\n\tyear         = 2012,\n\tbooktitle    = {Compressed Sensing, Theory and Applications},\n\tpublisher    = {Cambridge University Press},\n\tpages        = {210--268},\n\teditor       = {Y. Eldar and G. 
Kutyniok},\n\tchapter      = 5\n}\n@article{vershynin2010introduction,\n\ttitle        = {Introduction to the non-asymptotic analysis of random matrices},\n\tauthor       = {Vershynin, Roman},\n\tyear         = 2010,\n\tjournal      = {arXiv preprint arXiv:1011.3027}\n}\n@book{vershynin2018high,\n\ttitle        = {High-dimensional probability: An introduction with applications in data science},\n\tauthor       = {Vershynin, Roman},\n\tyear         = 2018,\n\tpublisher    = {Cambridge university press},\n\tvolume       = 47\n}\n@article{vershynin2020memory,\n\ttitle        = {Memory capacity of neural networks with threshold and ReLU activations},\n\tauthor       = {Vershynin, Roman},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2001.06938}\n}\n@article{verstynen2014organization,\n\ttitle        = {The organization and dynamics of corticostriatal pathways link the medial orbitofrontal cortex to future behavioral responses},\n\tauthor       = {Verstynen, Timothy D},\n\tyear         = 2014,\n\tjournal      = {Journal of neurophysiology},\n\tpublisher    = {Am Physiological Soc},\n\tvolume       = 112,\n\tnumber       = 10,\n\tpages        = {2457--2469}\n}\n@article{veta2016mitosis,\n\ttitle        = {Mitosis counting in breast cancer: Object-level interobserver agreement and comparison to an automatic method},\n\tauthor       = {Mitko Veta and Paul J Van Diest and Mehdi Jiwa and Shaimaa Al-Janabi and Josien PW Pluim},\n\tyear         = 2016,\n\tjournal      = {PloS one},\n\tvolume       = 11,\n\tnumber       = 8\n}\n@article{veta2019predicting,\n\ttitle        = {Predicting breast tumor proliferation from whole-slide images: the TUPAC16 challenge},\n\tauthor       = {Mitko Veta and Yujing J Heng and Nikolas Stathonikos and Babak Ehteshami Bejnordi and Francisco Beca and Thomas Wollmann and Karl Rohr and Manan A Shah and Dayong Wang and Mikael Rousson and others},\n\tyear         = 2019,\n\tjournal      = {Medical image analysis},\n\tvolume       = 
54,\n\tpages        = {111--121}\n}\n@article{vezhnevets2017feudal,\n\ttitle        = {Feudal networks for hierarchical reinforcement learning},\n\tauthor       = {A. S. Vezhnevets and S. Osindero and T. Schaul and N. Heess and M. Jaderberg and D. Silver and K. Kavukcuoglu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.01161}\n}\n@article{Vidyasagar08,\n\ttitle        = {A learning theory approach to system identification and stochastic adaptive control},\n\tauthor       = {M. Vidyasagar and Rajeeva L.~Karandikar},\n\tyear         = 2008,\n\tjournal      = {Journal of Process Control},\n\tvolume       = 18,\n\tnumber       = 3,\n\tpages        = {421--430},\n\tdate-added   = {2016-04-02 18:43:31 +0000},\n\tdate-modified = {2016-04-02 18:44:14 +0000}\n}\n@article{vieira2017learning,\n\ttitle        = {Learning to Prune: Exploring the Frontier of Fast and Accurate Parsing},\n\tauthor       = {Tim Vieira and Jason Eisner},\n\tyear         = 2017,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 5,\n\tpages        = {263--278}\n}\n@article{viele2002regression,\n\ttitle        = {Modeling with mixtures of linear regressions},\n\tauthor       = {Kert Viele and Barbara Tong},\n\tyear         = 2002,\n\tjournal      = {Statistics and Computing},\n\tvolume       = 12,\n\tnumber       = 4,\n\tpages        = {315--330}\n}\n@article{vijayakumar2016diverse,\n\ttitle        = {Diverse beam search: Decoding diverse solutions from neural sequence models},\n\tauthor       = {Ashwin K Vijayakumar and Michael Cogswell and Ramprasath R Selvaraju and Qing Sun and Stefan Lee and David Crandall and Dhruv Batra},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1610.02424}\n}\n@conference{vijayarangan2017high,\n\ttitle        = {High-throughput Robotic Phenotyping of Energy Sorghum Crops},\n\tauthor       = {Srinivasan Vijayarangan and Paloma Sodhi and Prathamesh Kini and James Bourne and 
Simon Du and Hanqi Sun and Barnabas Poczos and Dimitrios (Dimi) Apostolopoulos and David Wettergreen},\n\tyear         = 2017,\n\tmonth        = sep,\n\tbooktitle    = {Proceedings of 11th International Conference on Field and Service Robotics (FSR '17)},\n\tpages        = {99--113},\n\tkeywords     = {Plant Phenotyping, Computer Vision, Multi-view Reconstruction, Field Robot Design, Machine Learning}\n}\n@inproceedings{vijayarangan2018high,\n\ttitle        = {High-throughput robotic phenotyping of energy sorghum crops},\n\tauthor       = {Vijayarangan, Srinivasan and Sodhi, Paloma and Kini, Prathamesh and Bourne, James and Du, Simon and Sun, Hanqi and Poczos, Barnabas and Apostolopoulos, Dimitrios and Wettergreen, David},\n\tyear         = 2018,\n\tbooktitle    = {Field and Service Robotics},\n\tpages        = {99--113},\n\torganization = {Springer}\n}\n@article{vilnis2014gaussian,\n\ttitle        = {Word Representations via {G}aussian Embedding},\n\tauthor       = {Luke Vilnis and Andrew McCallum},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1412.6623}\n}\n@article{vincent2010stacked,\n\ttitle        = {Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion},\n\tauthor       = {Vincent, Pascal and Larochelle, Hugo and Lajoie, Isabelle and Bengio, Yoshua and Manzagol, Pierre-Antoine},\n\tyear         = 2010,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 11,\n\tnumber       = {Dec},\n\tpages        = {3371--3408}\n}\n@article{vinyals2014show,\n\ttitle        = {Show and tell: A neural image caption generator},\n\tauthor       = {Oriol Vinyals and Alexander Toshev and Samy Bengio and Dumitru Erhan},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1411.4555}\n}\n@article{vinyals2015convo,\n\ttitle        = {A Neural Conversational Model},\n\tauthor       = {Oriol Vinyals and Quoc V. 
Le},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.05869}\n}\n@inproceedings{vinyals2015grammar,\n\ttitle        = {Grammar as a Foreign Language},\n\tauthor       = {Oriol Vinyals and Lukasz Kaiser and Terry Koo and Slav Petrov and Ilya Sutskever and Geoffrey Hinton},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2755--2763}\n}\n@inproceedings{vinyals2015pointer,\n\ttitle        = {Pointer Networks},\n\tauthor       = {Oriol Vinyals and Meire Fortunato and Navdeep Jaitly},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2674--2682}\n}\n@inproceedings{vinyals2016matching,\n\ttitle        = {Matching networks for one shot learning},\n\tauthor       = {Oriol Vinyals and Charles Blundell and Timothy Lillicrap and Daan Wierstra and others},\n\tyear         = 2016,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {3630--3638}\n}\n@article{viola2004robust,\n\ttitle        = {Robust real-time face detection},\n\tauthor       = {Paul Viola and Michael J Jones},\n\tyear         = 2004,\n\tjournal      = {International Journal of Computer Vision},\n\tvolume       = 57,\n\tnumber       = 2,\n\tpages        = {137--154}\n}\n@inproceedings{vipindeep05pruning,\n\ttitle        = {Efficient static analysis with path pruning using coverage data},\n\tauthor       = {V. 
Vipindeep and Pankaj Jalote},\n\tyear         = 2005,\n\tbooktitle    = {International Workshop on Dynamic Analysis (WODA)}\n}\n@inproceedings{vlachos2012investigation,\n\ttitle        = {An Investigation of Imitation Learning Algorithms for Structured Prediction},\n\tauthor       = {Andreas Vlachos},\n\tyear         = 2012,\n\tbooktitle    = {European Workshop on Reinforcement Learning}\n}\n@article{vlachos2014new,\n\ttitle        = {A New Corpus and Imitation Learning Framework for Context-Dependent Semantic Parsing},\n\tauthor       = {Andreas Vlachos and Stephen Clark},\n\tyear         = 2014,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 2,\n\tpages        = {547--559}\n}\n@inproceedings{vlatakis2019efficiently,\n\ttitle        = {Efficiently avoiding saddle points with zero order methods: No gradients required},\n\tauthor       = {Vlatakis-Gkaragkounis, Emmanouil-Vasileios and Flokas, Lampros and Piliouras, Georgios},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 32\n}\n@article{vNe50,\n\ttitle        = {Functional operators},\n\tauthor       = {Neumann, John von},\n\tyear         = 1950,\n\tjournal      = {Ann. 
of Math.},\n\tnumber       = 22,\n\tfjournal     = {Annals of Mathematics}\n}\n@inproceedings{vogel10navigate,\n\ttitle        = {Learning to Follow Navigational Directions},\n\tauthor       = {Adam Vogel and Dan Jurafsky},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {806--814}\n}\n@inproceedings{vogel2013emergence,\n\ttitle        = {Emergence of {Gricean} Maxims from Multi-Agent Decision Theory},\n\tauthor       = {Adam Vogel and Max Bodoia and Christopher Potts and Daniel Jurafsky},\n\tyear         = 2013,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {1072--1081}\n}\n@inproceedings{vogel2013implicatures,\n\ttitle        = {Implicatures and Nested Beliefs in Approximate Decentralized-{POMDP}s},\n\tauthor       = {Adam Vogel and Christopher Potts and Dan Jurafsky},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {74--80}\n}\n@article{vollgraf2006quadratic,\n\ttitle        = {Quadratic optimization for simultaneous matrix diagonalization},\n\tauthor       = {Roland Vollgraf and Klaus Obermayer},\n\tyear         = 2006,\n\tjournal      = {IEEE Transactions on Signal Processing},\n\tvolume       = 54,\n\tnumber       = 9,\n\tpages        = {3270--3278}\n}\n@inproceedings{voloshin2021minimax,\n\ttitle        = {Minimax Model Learning},\n\tauthor       = {Voloshin, Cameron and Jiang, Nan and Yue, Yisong},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {1612--1620},\n\torganization = {PMLR}\n}\n@inproceedings{volpi2018generalizing,\n\ttitle        = {Generalizing to Unseen Domains via Adversarial Data Augmentation},\n\tauthor       = {Riccardo Volpi and Hongseok Namkoong and Ozan Sener and John Duchi and Vittorio Murino and Silvio Savarese},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural 
Information Processing Systems (NeurIPS)}\n}\n@inproceedings{von2006peekaboom,\n\ttitle        = {Peekaboom: a game for locating objects in images},\n\tauthor       = {Luis Von Ahn and Ruoran Liu and Manuel Blum},\n\tyear         = 2006,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)},\n\tpages        = {55--64}\n}\n@article{von2008designing,\n\ttitle        = {Designing games with a purpose},\n\tauthor       = {Luis Von Ahn and Laura Dabbish},\n\tyear         = 2008,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 51,\n\tnumber       = 8,\n\tpages        = {58--67}\n}\n@inproceedings{voorhees1999overview,\n\ttitle        = {Overview of the Eighth Text REtrieval Conference ({TREC}-8)},\n\tauthor       = {Ellen M. Voorhees and Donna Harman},\n\tyear         = 1999,\n\tbooktitle    = {TREC-8}\n}\n@inproceedings{voorhees2000building,\n\ttitle        = {Building a question answering test collection},\n\tauthor       = {Ellen M Voorhees and Dawn M Tice},\n\tyear         = 2000,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)},\n\tpages        = {200--207}\n}\n@article{voorhees2007trec,\n\ttitle        = {TREC: Continuing Information Retrieval's Tradition of Experimentation},\n\tauthor       = {Ellen M. 
Voorhees},\n\tyear         = 2007,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 50,\n\tnumber       = 11,\n\tpages        = {51--54}\n}\n@book{vovk2005algorithmic,\n\ttitle        = {Algorithmic learning in a random world},\n\tauthor       = {Vovk, Vladimir and Gammerman, Alex and Shafer, Glenn},\n\tyear         = 2005,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@inproceedings{vovk2012conditional,\n\ttitle        = {Conditional validity of inductive conformal predictors},\n\tauthor       = {Vovk, Vladimir},\n\tyear         = 2012,\n\tbooktitle    = {Asian conference on machine learning},\n\tpages        = {475--490}\n}\n@incollection{vovk2013kernel,\n\ttitle        = {Kernel ridge regression},\n\tauthor       = {Vovk, Vladimir},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Inference},\n\tpublisher    = {Springer},\n\tpages        = {105--116}\n}\n@inproceedings{VP12,\n\ttitle        = {Krylov Subspace Descent for Deep Learning},\n\tauthor       = {Oriol Vinyals and Daniel Povey},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research - Workshop and Conference Proceedings},\n\tbooktitle    = {Proceedings of the Fifteenth International Conference on Artificial Intelligence and Statistics (AISTATS-12)},\n\tvolume       = 22,\n\tpages        = {1261--1268},\n\turl          = {http://jmlr.csail.mit.edu/proceedings/papers/v22/vinyals12/vinyals12.pdf},\n\teditor       = {Neil D. Lawrence and Mark A. Girolami}\n}\n@article{vrandecic2014wikidata,\n\ttitle        = {Wikidata: A free collaborative knowledgebase},\n\tauthor       = {Denny Vrande\\v{c}i\\'{c} and Markus Kr{\\\"o}tzsch},\n\tyear         = 2014,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 57\n}\n@article{vries2020ecological,\n\ttitle        = {Towards Ecologically Valid Research on Language User Interfaces},\n\tauthor       = {Harm D. Vries and Dzmitry Bahdanau and Christopher D. 
Manning},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.14435}\n}\n@article{vsima2002training,\n\ttitle        = {Training a single sigmoidal neuron is hard},\n\tauthor       = {{\\v{S}}{\\'\\i}ma, Ji{\\v{r}}{\\'\\i}},\n\tyear         = 2002,\n\tjournal      = {Neural Computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 14,\n\tnumber       = 11,\n\tpages        = {2709--2728}\n}\n@article{vstrumbelj2014explaining,\n\ttitle        = {Explaining prediction models and individual predictions with feature contributions},\n\tauthor       = {Erik {\\v{S}}trumbelj and Igor Kononenko},\n\tyear         = 2014,\n\tjournal      = {Knowledge and information systems},\n\tvolume       = 41,\n\tnumber       = 3,\n\tpages        = {647--665}\n}\n@article{vuurens2011spam,\n\ttitle        = {How much spam can you take? {A}n analysis of crowdsourcing results to increase accuracy},\n\tauthor       = {Jeroen Vuurens and Arjen P. de Vries and Carsten Eickhoff},\n\tyear         = 2011,\n\tjournal      = {ACM SIGIR Workshop on Crowdsourcing for Information Retrieval}\n}\n@article{vw18,\n\ttitle        = {Polynomial Convergence of Gradient Descent for Training One-Hidden-Layer Neural Networks},\n\tauthor       = {Vempala, Santosh and Wilmes, John},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.02677}\n}\n@inproceedings{W,\n\ttitle        = {Perturbation bounds in connection with singular value decompositions},\n\tauthor       = {P. 
Wedin},\n\tyear         = 1972,\n\tbooktitle    = {BIT},\n\tpages        = {99--111}\n}\n@article{w90,\n\ttitle        = {Backpropagation through time: what it does and how to do it},\n\tauthor       = {Werbos, Paul J},\n\tyear         = 1990,\n\tjournal      = {Proceedings of the IEEE},\n\tpublisher    = {IEEE},\n\tvolume       = 78,\n\tnumber       = 10,\n\tpages        = {1550--1560}\n}\n@article{WaB12,\n\ttitle        = {Incremental constraint projection methods for variational inequalities},\n\tauthor       = {Wang, Mengdi and Bertsekas, Dimitri P.},\n\tyear         = 2015,\n\tjournal      = {Math. Program.},\n\tvolume       = 150,\n\tnumber       = {2, Ser. A},\n\tpages        = {321--363},\n\tdoi          = {10.1007/s10107-014-0769-x},\n\tissn         = {0025-5610},\n\turl          = {http://dx.doi.org/10.1007/s10107-014-0769-x},\n\tfjournal     = {Mathematical Programming},\n\tmrclass      = {65K15 (62L20 68W27 90C33)},\n\tmrnumber     = 3323620\n}\n@article{WaB13,\n\ttitle        = {Stochastic first-order methods with random constraint projection},\n\tauthor       = {Wang, Mengdi and Bertsekas, Dimitri P.},\n\tyear         = 2016,\n\tjournal      = {SIAM J. 
Optim.},\n\tvolume       = 26,\n\tnumber       = 1,\n\tpages        = {681--717},\n\tdoi          = {10.1137/130931278},\n\tissn         = {1052-6234},\n\turl          = {http://dx.doi.org/10.1137/130931278},\n\tfjournal     = {SIAM Journal on Optimization},\n\tmrclass      = {90C15 (90C25)},\n\tmrnumber     = 3472017,\n\tmrreviewer   = {Kurt Marti}\n}\n@article{waddell2016algorithms,\n\ttitle        = {How algorithms can bring down minorities' credit scores},\n\tauthor       = {Kaveh Waddell},\n\tyear         = 2016,\n\tjournal      = {The Atlantic}\n}\n@book{waddington1940organisers,\n\ttitle        = {Organisers and Genes},\n\tauthor       = {Conrad Hal Waddington},\n\tyear         = 1940,\n\tpublisher    = {University Press; Cambridge}\n}\n@inproceedings{wager2013dropout,\n\ttitle        = {Dropout Training as Adaptive Regularization},\n\tauthor       = {Stefan Wager and Sida I. Wang and Percy Liang},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{wager2014altitude,\n\ttitle        = {Altitude Training: Strong Bounds for Single-Layer Dropout},\n\tauthor       = {Stefan Wager and Will Fithian and Sida I. 
Wang and Percy Liang},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{wager2015estimation,\n\ttitle        = {Estimation and Inference of Heterogeneous Treatment Effects using Random Forests},\n\tauthor       = {Stefan Wager and Susan Athey},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@incollection{wager2016levy,\n\ttitle        = {Data Augmentation via {L}\\'evy Processes},\n\tauthor       = {Stefan Wager and Will Fithian and Percy Liang},\n\tyear         = 2016,\n\tbooktitle    = {Perturbations, Optimization and Statistics}\n}\n@article{wah2011caltech,\n\ttitle        = {The caltech-ucsd birds-200-2011 dataset},\n\tauthor       = {Wah, Catherine and Branson, Steve and Welinder, Peter and Perona, Pietro and Belongie, Serge},\n\tyear         = 2011,\n\tpublisher    = {California Institute of Technology}\n}\n@techreport{wah2011cub,\n\ttitle        = {The {Caltech}-{UCSD} {Birds}-200-2011 dataset},\n\tauthor       = {C Wah and S Branson and P Welinder and P Perona and S Belongie},\n\tyear         = 2011,\n\tinstitution  = {California Institute of Technology}\n}\n@book{wahba1990spline,\n\ttitle        = {Spline models for observational data},\n\tauthor       = {Wahba, Grace},\n\tyear         = 1990,\n\tpublisher    = {Siam},\n\tvolume       = 59\n}\n@article{wain2015novel,\n\ttitle        = {Novel insights into the genetics of smoking behaviour, lung function, and chronic obstructive pulmonary disease ({UK} {B}iLEVE): a genetic association study in {UK} {B}iobank},\n\tauthor       = {Louise V Wain and Nick Shrine and Suzanne Miller and Victoria E Jackson and Ioanna Ntalla and Maria Soler Artigas and Charlotte K Billington and Abdul Kader Kheirallah and Richard Allen and James P Cook and others},\n\tyear         = 2015,\n\tjournal      = {The Lancet Respiratory Medicine},\n\tvolume       = 3,\n\tnumber       = 10,\n\tpages        = {769--781}\n}\n@inproceedings{wainwright03trw,\n\ttitle    
    = {Tree-reweighted belief propagation algorithms and approximate {ML} estimation by pseudo-moment matching},\n\tauthor       = {Martin Wainwright and Tommi Jaakkola and Alan Willsky},\n\tyear         = 2003,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{wainwright06wrong,\n\ttitle        = {Estimating the ``wrong'' graphical model: Benefits in the computation-limited setting},\n\tauthor       = {Martin Wainwright},\n\tyear         = 2006,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 7,\n\tpages        = {1829--1859}\n}\n@article{wainwright08varinf,\n\ttitle        = {Graphical models, exponential families, and variational inference},\n\tauthor       = {Martin Wainwright and Michael I. Jordan},\n\tyear         = 2008,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tvolume       = 1,\n\tpages        = {1--307}\n}\n@article{wainwright2005new,\n\ttitle        = {A new class of upper bounds on the log partition function},\n\tauthor       = {Martin J Wainwright and Tommi S Jaakkola and Alan S Willsky},\n\tyear         = 2005,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 51,\n\tnumber       = 7,\n\tpages        = {2313--2335}\n}\n@book{wainwright2019high,\n\ttitle        = {High-dimensional statistics: A non-asymptotic viewpoint},\n\tauthor       = {Wainwright, Martin J},\n\tyear         = 2019,\n\tpublisher    = {Cambridge University Press},\n\tvolume       = 48\n}\n@article{wald2021calibration,\n\ttitle        = {On Calibration and Out-of-domain Generalization},\n\tauthor       = {Wald, Yoav and Feder, Amir and Greenfield, Daniel and Shalit, Uri},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.10395}\n}\n@article{walker04slice,\n\ttitle        = {Sampling the {D}irichlet Mixture Model with Slices},\n\tauthor       = {Stephen G. 
Walker},\n\tyear         = 2004,\n\tjournal      = {Communications in Statistics - Simulation and Computation},\n\tvolume       = 36,\n\tpages        = {45--54}\n}\n@article{walker2002training,\n\ttitle        = {Training a sentence planner for spoken dialogue using boosting},\n\tauthor       = {Marilyn A Walker and Owen C Rambow and Monica Rogati},\n\tyear         = 2002,\n\tjournal      = {Computer Speech \\& Language},\n\tvolume       = 16,\n\tnumber       = 3,\n\tpages        = {409--433}\n}\n@article{walker2006ace,\n\ttitle        = {{ACE} 2005 multilingual training corpus},\n\tauthor       = {Christopher Walker and Stephanie Strassel and Julie Medero and Kazuaki Maeda},\n\tyear         = 2006,\n\tjournal      = {Linguistic Data Consortium},\n\tvolume       = 1\n}\n@inproceedings{wall2003singular,\n\ttitle        = {Singular Value Decomposition and Principal Component Analysis},\n\tauthor       = {Wall, Michael E. and Rechtsteiner, Andreas and Rocha, Luis M.},\n\tyear         = 2003,\n\tmonth        = mar,\n\tbooktitle    = {A Practical Approach to Microarray Data Analysis},\n\tpublisher    = {Kluwel},\n\taddress      = {Norwell, MA},\n\tpages        = {91--109},\n\teditor       = {Berrar, D. P. and Dubitzky, W. and Granzow, M.},\n\tabstract     = {\n\t\tThis chapter describes gene expression analysis by Singular Value\n\n\t\tDecomposition (SVD), emphasizing initial characterization of the\n\n\t\tdata. We describe SVD methods for visualization of gene expression\n\n\t\tdata, representation of the data using a smaller number of variables,\n\n\t\tand detection of patterns in noisy gene expression data. In addition,\n\n\t\twe describe the precise relation between SVD analysis and Principal\n\n\t\tComponent Analysis (PCA) when PCA is calculated using the covariance\n\n\t\tmatrix, enabling our descriptions to apply equally well to either\n\n\t\tmethod. 
Our aim is to provide definitions, interpretations, examples,\n\n\t\tand references that will serve as resources for understanding and\n\n\t\textending the application of SVD and PCA to gene expression analysis.\n\t},\n\tchapter      = 5,\n\tciteulike-article-id = 352522,\n\teprint       = {physics/0208101},\n\tkeywords     = {\n\t\talgebra, analysis, components, dimension, dimensionality, linear,\n\n\t\tlinearalgebra, pca, principal, svd\n\t},\n\towner        = {leili},\n\tposted-at    = {2007-09-26 05:31:41},\n\tpriority     = 2,\n\ttimestamp    = {2011.07.28}\n}\n@article{wallace2019trick,\n\ttitle        = {Trick Me If You Can: Human-in-the-Loop Generation of Adversarial Examples for Question Answering},\n\tauthor       = {Eric Wallace and Pedro Rodriguez and Shi Feng and Ikuya Yamada and Jordan Boyd-Graber},\n\tyear         = 2019,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 7\n}\n@inproceedings{wallace2019universal,\n\ttitle        = {Universal Adversarial Triggers for Attacking and Analyzing {NLP}},\n\tauthor       = {Eric Wallace and Shi Feng and Nikhil Kandpal and Matt Gardner and Sameer Singh},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{wallach08dependency,\n\ttitle        = {{B}ayesian Modeling of Dependency Trees Using Hierarchical {P}itman-{Y}or Priors},\n\tauthor       = {Hanna Wallach and Charles Sutton and Andrew McCallum},\n\tyear         = 2008,\n\tbooktitle    = {Workshop on Prior Knowledge for Text and Language},\n\tpages        = {15--20}\n}\n@inproceedings{wallach2009evaluation,\n\ttitle        = {Evaluation Methods for Topic Models},\n\tauthor       = {Hanna Wallach and Iain Murray and Ruslan Salakhutdinov and David Mimno},\n\tyear         = 2009,\n\tbooktitle    = {ICML}\n}\n@inproceedings{walter2013learning,\n\ttitle        = {Learning Semantic Maps from Natural Language Descriptions},\n\tauthor       
= {M. Walter and S. Hemachandra and B. Homberg and S. Tellex and S. Teller},\n\tyear         = 2013,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{wan2006dependency,\n\ttitle        = {Using dependency-based features to take the ``para-farce'' out of paraphrase},\n\tauthor       = {Stephen Wan and Mark Dras and Robert Dale and Cécile Paris},\n\tyear         = 2006,\n\tbooktitle    = {Australasian Language Technology Workshop}\n}\n@inproceedings{wan2013regularization,\n\ttitle        = {Regularization of neural networks using dropconnect},\n\tauthor       = {Li Wan and Matthew Zeiler and Sixin Zhang and Yann L Cun and Rob Fergus},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1058--1066}\n}\n@book{wand1994kernel,\n\ttitle        = {Kernel smoothing},\n\tauthor       = {Matt P Wand and M Chris Jones},\n\tyear         = 1994,\n\tpublisher    = {Chapman and Hall/CRC}\n}\n@article{wand87simple,\n\ttitle        = {A Simple Algorithm and Proof for Type Inference},\n\tauthor       = {Mitchell Wand},\n\tyear         = 1987,\n\tjournal      = {Fundamenta Informaticae},\n\tvolume       = 10,\n\tpages        = {115--122}\n}\n@misc{wandb,\n\ttitle        = {Experiment Tracking with Weights and Biases},\n\tauthor       = {Biewald, Lukas},\n\tyear         = 2020,\n\turl          = {https://www.wandb.com/},\n\tnote         = {Software available from wandb.com}\n}\n@inproceedings{wang07stable,\n\ttitle        = {Stable Dual Dynamic Programming},\n\tauthor       = {Tao Wang and Daniel J. Lizotte and Michael H. 
Bowling and Dale Schuurmans},\n\tyear         = 2007,\n\tbooktitle    = {Advances in Neural Information Processing Systems 20 (NIPS-07)},\n\tpages        = {1569--1576}\n}\n@inproceedings{wang09crf,\n\ttitle        = {Max-Margin Hidden Conditional Random Fields for Human Action Recognition},\n\tauthor       = {Yang Wang and Greg Mori},\n\tyear         = 2009,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@inproceedings{wang16_ijcai,\n\ttitle        = {Nonparametric Risk and Stability Analysis for Multi-Task Learning Problems},\n\tauthor       = {Xuezhi Wang and Junier Oliva and Jeff Schneider and Barnabas Poczos},\n\tyear         = 2016,\n\tbooktitle    = {IJCAI}\n}\n@inproceedings{wang2003evaluation,\n\ttitle        = {\n\t\tAn evaluation of a cost metric for selecting transitions between\n\n\t\tmotion segments\n\t},\n\tauthor       = {Wang, Jing and Bodenheimer, Bobby},\n\tyear         = 2003,\n\tbooktitle    = {\n\t\tProceedings of the 2003 ACM SIGGRAPH/Eurographics symposium on Computer\n\n\t\tanimation\n\t},\n\tlocation     = {San Diego, California},\n\tpublisher    = {Eurographics Association},\n\taddress      = {Aire-la-Ville, Switzerland, Switzerland},\n\tseries       = {SCA '03},\n\tpages        = {232--238},\n\tisbn         = {1-58113-659-5},\n\tacmid        = 846309,\n\tnumpages     = 7\n}\n@inproceedings{wang2004computing,\n\ttitle        = {Computing the duration of motion transitions: an empirical approach},\n\tauthor       = {Wang, Jing and Bodenheimer, Bobby},\n\tyear         = 2004,\n\tbooktitle    = {\n\t\tProceedings of the 2004 ACM SIGGRAPH/Eurographics symposium on Computer\n\n\t\tanimation\n\t},\n\tlocation     = {Grenoble, France},\n\tpublisher    = {Eurographics Association},\n\taddress      = {Aire-la-Ville, Switzerland, Switzerland},\n\tseries       = {SCA '04},\n\tpages        = {335--344},\n\tdoi          = {http://dx.doi.org/10.1145/1028523.1028568},\n\tisbn         = {3-905673-14-2},\n\tacmid        = 1028568,\n\tnumpages 
    = 10\n}\n@inproceedings{wang2006discriminative,\n\ttitle        = {Discriminative models for spoken language understanding},\n\tauthor       = {Ye-Yi Wang and Alex Acero},\n\tyear         = 2006,\n\tbooktitle    = {InterSpeech}\n}\n@inproceedings{wang2007dual,\n\ttitle        = {Dual representations for dynamic programming and reinforcement learning},\n\tauthor       = {Wang, Tao and Bowling, Michael and Schuurmans, Dale},\n\tyear         = 2007,\n\tbooktitle    = {Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on},\n\tpages        = {44--51},\n\torganization = {IEEE}\n}\n@inproceedings{wang2007qa,\n\ttitle        = {What is the Jeopardy Model? A Quasi-Synchronous Grammar for {QA}},\n\tauthor       = {Mengqiu Wang and Noah A. Smith and Teruko Mitamura},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{wang2008gaussian,\n\ttitle        = {Gaussian Process Dynamical Models for Human Motion},\n\tauthor       = {Wang, J. M. and Fleet, D. J. and Hertzmann, A.},\n\tyear         = 2008,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tbooktitle    = {Pattern Analysis and Machine Intelligence, IEEE Transactions on},\n\tvolume       = 30,\n\tnumber       = 2,\n\tpages        = {283--298},\n\tdoi          = {10.1109/TPAMI.2007.1167},\n\tabstract     = {\n\t\tWe introduce Gaussian process dynamical models (GPDMs) for nonlinear\n\n\t\ttime series analysis, with applications to learning models of human\n\n\t\tpose and motion from high-dimensional motion capture data. A GPDM\n\n\t\tis a latent variable model. It comprises a low-dimensional latent\n\n\t\tspace with associated dynamics, as well as a map from the latent\n\n\t\tspace to an observation space. We marginalize out the model parameters\n\n\t\tin closed form by using Gaussian process priors for both the dynamical\n\n\t\tand the observation mappings. 
This results in a nonparametric model\n\n\t\tfor dynamical systems that accounts for uncertainty in the model.\n\n\t\tWe demonstrate the approach and compare four learning algorithms\n\n\t\ton human motion capture data, in which each pose is 50-dimensional.\n\n\t\tDespite the use of small data sets, the GPDM learns an effective\n\n\t\trepresentation of the nonlinear dynamics in these spaces.\n\t},\n\tciteulike-article-id = 3504557,\n\tkeywords     = {discriminative, gaussian, motion, process},\n\towner        = {leili},\n\tposted-at    = {2008-11-11 22:28:16},\n\tpriority     = 2,\n\ttimestamp    = {2011.07.28}\n}\n@article{wang2008sample,\n\ttitle        = {Sample average approximation of expected value constrained stochastic programs},\n\tauthor       = {Wang, Wei and Ahmed, Shabbir},\n\tyear         = 2008,\n\tjournal      = {Operations Research Letters},\n\tpublisher    = {Elsevier},\n\tvolume       = 36,\n\tnumber       = 5,\n\tpages        = {515--519}\n}\n@inproceedings{wang2009character,\n\ttitle        = {Character-level analysis of semi-structured documents for set expansion},\n\tauthor       = {Richard C Wang and William W Cohen},\n\tyear         = 2009,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1503--1512}\n}\n@inproceedings{wang2010paraphrasing,\n\ttitle        = {Probabilistic Tree-Edit Models with Structured Latent Variables for Textual Entailment and Question Answering},\n\tauthor       = {Mengqiu Wang and Christopher D. 
Manning},\n\tyear         = 2010,\n\tbooktitle    = {The International Conference on Computational Linguistics},\n\tpages        = {1164--1172}\n}\n@article{wang2011semantic,\n\ttitle        = {Semantic Frame-Based Spoken Language Understanding},\n\tauthor       = {Ye-Yi Wang and Li Deng and Alex Acero},\n\tyear         = 2011,\n\tjournal      = {Spoken Language Understanding: Systems for Extracting Semantic Information from Speech},\n\tpages        = {41--91}\n}\n@inproceedings{wang2012baselines,\n\ttitle        = {Baselines and Bigrams: Simple, Good Sentiment and Topic Classification},\n\tauthor       = {Sida Wang and Christopher D. Manning},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{wang2012scalable,\n\ttitle        = {A Scalable {CUR} Matrix Decomposition Algorithm: Lower Time Complexity and Tighter Bound},\n\tauthor       = {Wang, Shusen and Zhang, Zhihua},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {647--655}\n}\n@article{wang2013bregman,\n\ttitle        = {Bregman Alternating Direction Method of Multipliers},\n\tauthor       = {Wang, Huahua and Banerjee, Arindam},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1306.3203}\n}\n@inproceedings{wang2013fast,\n\ttitle        = {Fast dropout training},\n\tauthor       = {Sida I. Wang and Christopher Manning},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {118--126}\n}\n@article{wang2013improving,\n\ttitle        = {Improving {CUR} matrix decomposition and the {Nystr{\\\"o}m} approximation via adaptive sampling},\n\tauthor       = {Wang, Shusen and Zhang, Zhihua},\n\tyear         = 2013,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR. 
org},\n\tvolume       = 14,\n\tnumber       = 1,\n\tpages        = {2729--2769}\n}\n@inproceedings{wang2013learning,\n\ttitle        = {Learning to detect patterns of crime},\n\tauthor       = {Tong Wang and Cynthia Rudin and Daniel Wagner and Rich Sevieri},\n\tyear         = 2013,\n\tbooktitle    = {European Conference on Machine Learning (ECML)},\n\tpages        = {515--530}\n}\n@inproceedings{wang2013noising,\n\ttitle        = {Feature Noising for Log-linear Structured Prediction},\n\tauthor       = {Sida I. Wang and Mengqiu Wang and Stefan Wager and Percy Liang and Chris Manning},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{wang2013nonnegative,\n\ttitle        = {Nonnegative matrix factorization: A comprehensive review},\n\tauthor       = {Wang, Yu-Xiong and Zhang, Yu-Jin},\n\tyear         = 2013,\n\tjournal      = {Knowledge and Data Engineering, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 25,\n\tnumber       = 6,\n\tpages        = {1336--1353}\n}\n@inproceedings{wang2013simple,\n\ttitle        = {A simple and generic belief tracking mechanism for the dialog state tracking challenge: On the believability of observed information},\n\tauthor       = {Zhuoran Wang and Oliver Lemon},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the SIGDIAL 2013 Conference},\n\tpages        = {423--432}\n}\n@article{wang2013smoothing,\n\ttitle        = {Smoothing splines with varying smoothing parameter},\n\tauthor       = {Wang, Xiao and Du, Pang and Shen, Jinglai},\n\tyear         = 2013,\n\tjournal      = {Biometrika},\n\tpublisher    = {Oxford University Press},\n\tvolume       = 100,\n\tnumber       = 4,\n\tpages        = {955--970}\n}\n@article{wang2014efficient,\n\ttitle        = {Efficient Algorithms and Error Analysis for the Modified {Nystr{\\\"o}m} Method},\n\tauthor       = {Wang, Shusen and Zhang, Zhihua},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint 
arXiv:1404.0138}\n}\n@inproceedings{wang2014flexible,\n\ttitle        = {Flexible transfer learning under support and model shift},\n\tauthor       = {Wang, Xuezhi and Schneider, Jeff},\n\tyear         = 2014,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {1898--1906}\n}\n@inproceedings{wang2014iqp,\n\ttitle        = {Relaxations for inference in restricted {B}oltzmann machines},\n\tauthor       = {Sida I. Wang and Roy Frostig and Percy Liang and Chris Manning},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Learning Representations Workshop (ICLR)}\n}\n@techreport{wang2014qa,\n\ttitle        = {An Overview of Microsoft Deep {QA} System on Stanford WebQuestions Benchmark},\n\tauthor       = {Zhenghao Wang and Shengquan Yan and Huaming Wang and Xuedong Huang},\n\tyear         = 2014,\n\tinstitution  = {Microsoft Research}\n}\n@inproceedings{wang2014unsupervised,\n\ttitle        = {Unsupervised learning of disease progression models},\n\tauthor       = {Xiang Wang and David Sontag and Fei Wang},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Knowledge Discovery and Data Mining (KDD)},\n\tpages        = {85--94}\n}\n@article{wang2015adjusting,\n\ttitle        = {Adjusting Leverage Scores by Row Weighting: A Practical Approach to Coherent Matrix Completion},\n\tauthor       = {Wang, Shusen and Zhang, Tong and Zhang, Zhihua},\n\tyear         = 2014,\n\tjournal      = {arXiv:1412.7938}\n}\n@article{wang2015dualitygap,\n\ttitle        = {Vanishing Price of Anarchy in Large Coordinative Nonconvex Optimization},\n\tauthor       = {Mengdi Wang},\n\tyear         = 2015,\n\tjournal      = {Submitted; Optimization Online 2015/07/5021}\n}\n@article{wang2015large,\n\ttitle        = {Large-Scale Approximate Kernel Canonical Correlation Analysis},\n\tauthor       = {Wang, Weiran and Livescu, Karen},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint},\n\tvolume       = 
{abs/1511.04773}\n}\n@inproceedings{wang2015machine,\n\ttitle        = {Machine comprehension with syntax, frames, and semantics},\n\tauthor       = {Hai Wang and Mohit Bansal and Kevin Gimpel and David McAllester},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{wang2015overnight,\n\ttitle        = {Building a Semantic Parser Overnight},\n\tauthor       = {Yushi Wang and Jonathan Berant and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{wang2015petpeeves,\n\ttitle        = {That’s So Annoying!!!: A Lexical and Frame-Semantic Embedding Based Data Augmentation Approach to Automatic Categorization of Annoying Behaviors using \\#petpeeve Tweets},\n\tauthor       = {William Y. Wang and Diyi Yang},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{wang2015polynomial,\n\ttitle        = {Estimating Mixture Models via Mixture of Polynomials},\n\tauthor       = {Sida I. 
Wang and Arun Chaganty and Percy Liang},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{wang2015provably,\n\ttitle        = {Provably Correct Active Sampling Algorithms for Matrix Column Subset Selection with Missing Data},\n\tauthor       = {Wang, Yining and Singh, Aarti},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1505.04343}\n}\n@inproceedings{wang2015transition,\n\ttitle        = {A transition-based algorithm for {AMR} parsing},\n\tauthor       = {Chuan Wang and Nianwen Xue and Sameer Pradhan},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@phdthesis{wang2016combating,\n\ttitle        = {Combating Attacks and Abuse in Large Online Communities},\n\tauthor       = {Gang Wang},\n\tyear         = 2016,\n\tschool       = {University of California Santa Barbara}\n}\n@inproceedings{wang2016dueling,\n\ttitle        = {Dueling network architectures for deep reinforcement learning},\n\tauthor       = {Ziyu Wang and Tom Schaul and Matteo Hessel and Hado Van Hasselt and Marc Lanctot and Nando De Freitas},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{wang2016games,\n\ttitle        = {Learning Language Games through Interaction},\n\tauthor       = {Sida I. 
Wang and Percy Liang and Chris Manning},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{wang2016learning,\n\ttitle        = {Learning to reinforcement learn},\n\tauthor       = {Jane X Wang and Zeb Kurth-Nelson and Dhruva Tirumala and Hubert Soyer and Joel Z Leibo and Remi Munos and Charles Blundell and Dharshan Kumaran and Matt Botvinick},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.05763}\n}\n@article{wang2016likelihood,\n\ttitle        = {Likelihood robust optimization for data-driven problems},\n\tauthor       = {Zizhuo Wang and Peter W Glynn and Yinyu Ye},\n\tyear         = 2016,\n\tjournal      = {Computational Management Science},\n\tvolume       = 13,\n\tnumber       = 2,\n\tpages        = {241--261}\n}\n@article{wang2016multi,\n\ttitle        = {Multi-Perspective Context Matching for Machine Comprehension},\n\tauthor       = {Zhiguo Wang and Haitao Mi and Wael Hamza and Radu Florian},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1612.04211}\n}\n@book{wang2017cooperative,\n\ttitle        = {Cooperative control of multi-agent systems: Theory and applications},\n\tauthor       = {Wang, Yue and Garcia, Eloy and Zhang, Fumin and Casbeer, David},\n\tyear         = 2017,\n\tpublisher    = {John Wiley \\& Sons}\n}\n@inproceedings{wang2017gated,\n\ttitle        = {Gated self-matching networks for reading comprehension and question answering},\n\tauthor       = {Wenhui Wang and Nan Yang and Furu Wei and Baobao Chang and Ming Zhou},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{wang2017learning,\n\ttitle        = {Learning to model the tail},\n\tauthor       = {Wang, Yu-Xiong and Ramanan, Deva and Hebert, Martial},\n\tyear         = 2017,\n\tbooktitle    = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n\tpages        = 
{7032--7042}\n}\n@inproceedings{wang2017machine,\n\ttitle        = {Machine Comprehension Using Match-{LSTM} and Answer Pointer},\n\tauthor       = {Shuohang Wang and Jing Jiang},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{wang2017naturalizing,\n\ttitle        = {Naturalizing a Programming Language via Interactive Learning},\n\tauthor       = {Sida I. Wang and Sam Ginn and Percy Liang and Christopher D. Manning},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{wang2017program,\n\ttitle        = {Program synthesis using abstraction refinement},\n\tauthor       = {Xinyu Wang and Isil Dillig and Rishabh Singh},\n\tyear         = 2017,\n\tbooktitle    = {Principles of Programming Languages (POPL)}\n}\n@article{wang2017randomized,\n\ttitle        = {Randomized Linear Programming Solves the Discounted {M}arkov Decision Problem In Nearly-Linear Running Time},\n\tauthor       = {Wang, Mengdi},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1704.01869},\n\tdate-added   = {2017-05-19 05:10:31 +0000},\n\tdate-modified = {2017-05-19 05:10:31 +0000}\n}\n@article{wang2017sketching,\n\ttitle        = {Sketching Meets Random Projection in the Dual: A Provable Recovery Algorithm for Big and High-dimensional Data},\n\tauthor       = {Wang, Jialei and Lee, Jason D and Mahdavi, Mehrdad and Kolar, Mladen and Srebro, Nathan},\n\tyear         = 2017,\n\tjournal      = {Electronic Journal of Statistics}\n}\n@inproceedings{wang2017sql,\n\ttitle        = {Synthesizing Highly Expressive {SQL} Queries from Input-Output Examples},\n\tauthor       = {Chenglong Wang and Alvin Cheung and Rastislav Bodik},\n\tyear         = 2017,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@inproceedings{wang2017torontocity,\n\ttitle        = {TorontoCity: Seeing the World with a Million Eyes},\n\tauthor       = {Shenlong 
Wang and Min Bai and Gellert Mattyus and Hang Chu and Wenjie Luo and Bin Yang and Justin Liang and Joel Cheverie and Sanja Fidler and Raquel Urtasun},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@article{wang2018approximate,\n\ttitle        = {Approximate Leave-One-Out for High-Dimensional Non-Differentiable Learning Problems},\n\tauthor       = {Shuaiwen Wang and Wenda Zhou and Arian Maleki and Haihao Lu and Vahab Mirrokni},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.02716}\n}\n@inproceedings{wang2018learning,\n\ttitle        = {Learning Deep Hidden Nonlinear Dynamics from Aggregate Data},\n\tauthor       = {Yisen Wang and Bo Dai and Lingkai Kong and Xingjun Ma and Sarah Monazam Erfani and James Bailey and Shu-Tao Xia and Le Song and Hongyuan Zha},\n\tyear         = 2018,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)}\n}\n@inproceedings{wang2018multi,\n\ttitle        = {Multi-Granularity Hierarchical Attention Fusion Networks for Reading Comprehension and Question Answering},\n\tauthor       = {Wei Wang and Ming Yan and Chen Wu},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{wang2018questions,\n\ttitle        = {Learning to Ask Questions in Open-domain Conversational Systems with Typed Decoders},\n\tauthor       = {Yansen Wang and Chenyi Liu and Minlie Huang and Liqiang Nie},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.04843}\n}\n@inproceedings{wang2018r3,\n\ttitle        = {{R3}: Reinforced Ranker-Reader for Open-Domain Question Answering},\n\tauthor       = {Shuohang Wang and Mo Yu and Xiaoxiao Guo and Zhiguo Wang and Tim Klinger and Wei Zhang and Shiyu Chang and Gerald Tesauro and Bowen Zhou and Jing Jiang},\n\tyear         = 2018,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{wang2018robust,\n\ttitle        = {Robust 
text-to-{SQL} generation with execution-guided decoding},\n\tauthor       = {Chenglong Wang and Kedar Tatwawadi and Marc Brockschmidt and Po-Sen Huang and Yi Mao and Oleksandr Polozov and Rishabh Singh},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1807.03100}\n}\n@inproceedings{wang2018stochastic,\n\ttitle        = {Stochastic Zeroth-order Optimization in High Dimensions},\n\tauthor       = {Yining Wang and Simon Du and Sivaraman Balakrishnan and Aarti Singh},\n\tyear         = 2018,\n\tmonth        = {09--11 Apr},\n\tbooktitle    = {Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 84,\n\tpages        = {1356--1365},\n\turl          = {http://proceedings.mlr.press/v84/wang18e.html},\n\teditor       = {Amos Storkey and Fernando Perez-Cruz},\n\tpdf          = {http://proceedings.mlr.press/v84/wang18e/wang18e.pdf},\n\tabstract     = {We consider the problem of optimizing a high-dimensional convex function using stochastic zeroth-order queries. Under sparsity assumptions on the gradients or function values, we present two algorithms: a successive component/feature selection algorithm and a noisy mirror descent algorithm using Lasso gradient estimates, and show that both algorithms have convergence rates that depend only logarithmically on the ambient dimension of the problem. 
Empirical results confirm our theoretical findings and show that the algorithms we design outperform classical zeroth-order optimization methods in the high-dimensional setting.}\n}\n@inproceedings{wang2019balanced,\n\ttitle        = {Balanced datasets are not enough: Estimating and mitigating gender bias in deep image representations},\n\tauthor       = {Tianlu Wang and Jieyu Zhao and Mark Yatskar and Kai-Wei Chang and Vicente Ordonez},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)},\n\tpages        = {5310--5319}\n}\n@article{wang2019bert,\n\ttitle        = {{BERT} has a mouth, and it must speak: {BERT} as a {M}arkov random field language model},\n\tauthor       = {Alex Wang and Kyunghyun Cho},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.04094}\n}\n@article{wang2019exploring,\n\ttitle        = {Exploring model-based planning with policy networks},\n\tauthor       = {Wang, Tingwu and Ba, Jimmy},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.08649}\n}\n@inproceedings{wang2019glue,\n\ttitle        = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},\n\tauthor       = {Alex Wang and Amanpreet Singh and Julian Michael and Felix Hill and Omer Levy and Samuel R Bowman},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{wang2019learning,\n\ttitle        = {Learning robust global representations by penalizing local predictive power},\n\tauthor       = {Haohan Wang and Songwei Ge and Zachary Lipton and Eric P Xing},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{wang2019neural,\n\ttitle        = {Neural Policy Gradient Methods: Global Optimality and Rates of Convergence},\n\tauthor       = {Wang, Lingxiao and Cai, Qi and Yang, Zhuoran and Wang, Zhaoran},\n\tyear         = 2019,\n\tbooktitle    
= {International Conference on Learning Representations}\n}\n@article{wang2019optimism,\n\ttitle        = {Optimism in Reinforcement Learning with Generalized Linear Function Approximation},\n\tauthor       = {Wang, Yining and Wang, Ruosong and Du, Simon S and Krishnamurthy, Akshay},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.04136}\n}\n@inproceedings{wang2019rcm,\n\ttitle        = {Reinforced Cross-Modal Matching and Self-Supervised Imitation Learning for Vision-Language Navigation},\n\tauthor       = {Xin Eric Wang and Qiuyuan Huang and Asli Celikyilmaz and Jianfeng Gao and Dinghan Shen and Yuan-Fang Wang and William Yang Wang and Lei Zhang},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{wang2019repairing,\n\ttitle        = {Repairing without Retraining: Avoiding Disparate Impact with Counterfactual Distributions},\n\tauthor       = {Hao Wang and Berk Ustun and Flavio P Calmon},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.10501}\n}\n@inproceedings{wang2019superglue,\n\ttitle        = {{SuperGLUE}: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n\tauthor       = {Alex Wang and Yada Pruksachatkun and Nikita Nangia and Amanpreet Singh and Julian Michael and Felix Hill and Omer Levy and Samuel R. Bowman},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{wang2020beyond,\n\ttitle        = {Beyond Lazy Training for Over-parameterized Tensor Decomposition},\n\tauthor       = {Wang, Xiang and Wu, Chenwei and Lee, Jason D and Ma, Tengyu and Ge, Rong},\n\tyear         = 2020,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{wang2020dual,\n\ttitle        = {DualSMC: Tunneling Differentiable Filtering and Planning under Continuous POMDPs},\n\tauthor       = {Wang, Yunbo and Liu, Bo and Wu, Jiajun and Zhu, Yuke and Du, Simon S. 
and Fei-Fei, Li and Tenenbaum, Joshua B.},\n\tyear         = 2020,\n\tbooktitle    = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},\n\tpublisher    = {International Joint Conferences on Artificial Intelligence Organization},\n\tpages        = {4190--4198},\n\teditor       = {Christian Bessiere}\n}\n@article{wang2020long,\n\ttitle        = {Long-tailed recognition by routing diverse distribution-aware experts},\n\tauthor       = {Wang, Xudong and Lian, Long and Miao, Zhongqi and Liu, Ziwei and Yu, Stella X},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.01809},\n\tbooktitle    = {Advances in Neural Information Processing Systems}\n}\n@article{wang2020nearly,\n\ttitle        = {Nearly Dimension-Independent Sparse Linear Bandit over Small Action Spaces via Best Subset Selection},\n\tauthor       = {Wang, Yining and Chen, Yi and Fang, Ethan X and Wang, Zhaoran and Li, Runze},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.02003}\n}\n@article{wang2020planning,\n\ttitle        = {Planning with General Objective Functions: Going Beyond Total Rewards},\n\tauthor       = {Wang, Ruosong and Zhong, Peilin and Du, Simon S and Salakhutdinov, Russ R and Yang, Lin},\n\tyear         = 2020,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tvolume       = 33\n}\n@article{wang2020provably,\n\ttitle        = {Provably Efficient Reinforcement Learning with General Value Function Approximation},\n\tauthor       = {Wang, Ruosong and Salakhutdinov, Ruslan and Yang, Lin F},\n\tyear         = 2020,\n\tjournal      = {Advances in Neural Information Processing Systems}\n}\n@article{wang2020reinforcement,\n\ttitle        = {Reinforcement learning with general value function approximation: Provably efficient approach via bounded eluder dimension},\n\tauthor       = {Wang, Ruosong and Salakhutdinov, Russ R and Yang, Lin},\n\tyear         = 2020,\n\tjournal      = 
{Advances in Neural Information Processing Systems},\n\tvolume       = 33\n}\n@inproceedings{wang2020reward,\n\ttitle        = {On Reward-Free Reinforcement Learning with Linear Function Approximation},\n\tauthor       = {Wang, Ruosong and Du, Simon S and Yang, Lin and Salakhutdinov, Russ R},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 33,\n\tpages        = {17816--17826}\n}\n@article{wang2020statistical,\n\ttitle        = {What are the Statistical Limits of Offline {RL} with Linear Function Approximation?},\n\tauthor       = {Wang, Ruosong and Foster, Dean P and Kakade, Sham M},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.11895}\n}\n@inproceedings{wang2020understanding,\n\ttitle        = {Understanding contrastive representation learning through alignment and uniformity on the hypersphere},\n\tauthor       = {Wang, Tongzhou and Isola, Phillip},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {9929--9939},\n\torganization = {PMLR}\n}\n@article{wang2020weakly,\n\ttitle        = {Weakly Supervised Deep Learning for Segmentation of Remote Sensing Imagery},\n\tauthor       = {Sherrie Wang and William Chen and Sang Michael Xie and George Azzari and David B. 
Lobell},\n\tyear         = 2020,\n\tjournal      = {Remote Sensing},\n\tvolume       = 12\n}\n@article{wang2021cdcl,\n\ttitle        = {Cross-domain Contrastive Learning for Unsupervised Domain Adaptation},\n\tauthor       = {Rui Wang and Zuxuan Wu and Zejia Weng and Jingjing Chen and Guo-Jun Qi and Yu-Gang Jiang},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@article{wang2021exponential,\n\ttitle        = {An Exponential Lower Bound for Linearly-Realizable MDPs with Constant Suboptimality Gap},\n\tauthor       = {Wang, Yuanhao and Wang, Ruosong and Kakade, Sham M},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.12690}\n}\n@inproceedings{wang2021longtailed,\n\ttitle        = {Long-tailed Recognition by Routing Diverse Distribution-Aware Experts},\n\tauthor       = {Xudong Wang and Long Lian and Zhongqi Miao and Ziwei Liu and Stella Yu},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{wang2021near,\n\ttitle        = {Near-Linear Time Local Polynomial Nonparametric Estimation with Box Kernels},\n\tauthor       = {Wang, Yining and Wu, Yi and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {INFORMS Journal on Computing},\n\tpublisher    = {INFORMS}\n}\n@inproceedings{wang2021optimism,\n\ttitle        = {Optimism in Reinforcement Learning with Generalized Linear Function Approximation},\n\tauthor       = {Yining Wang and Ruosong Wang and Simon Shaolei Du and Akshay Krishnamurthy},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=CBmJwzneppz}\n}\n@article{wanggeneralization,\n\ttitle        = {Generalization Bounds for Transfer Learning under Model Shift},\n\tauthor       = {Wang, Xuezhi and Schneider, Jeff}\n}\n@article{WangMMR2015,\n\ttitle        = {Faster Parallel Solver for Positive Linear Programs via Dynamically-Bucketed Selective Coordinate Descent},\n\tauthor       
= {Di Wang and Michael W. Mahoney and Nishanth Mohan and Satish Rao},\n\tyear         = 2015,\n\tmonth        = nov,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1511.06468}\n}\n@article{WangWGS2016-CCA,\n\ttitle        = {{Efficient Globally Convergent Stochastic Optimization for Canonical Correlation Analysis}},\n\tauthor       = {Weiran Wang and Jialei Wang and Dan Garber and Nathan Srebro},\n\tyear         = 2016,\n\tmonth        = apr,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1604.01870}\n}\n@article{wanwright09sharp,\n\ttitle        = {Sharp thresholds for noisy and high-dimensional recovery of sparsity using $\\ell_1$-constrained quadratic programming (Lasso)},\n\tauthor       = {M. J. Wainwright},\n\tyear         = 2009,\n\tjournal      = {IEEE Transactions on Information Theory},\n\tvolume       = 55,\n\tpages        = {2183--2202}\n}\n@article{warde2018unsupervised,\n\ttitle        = {Unsupervised control through non-parametric discriminative rewards},\n\tauthor       = {David Warde-Farley and Tom Van de Wiele and Tejas Kulkarni and Catalin Ionescu and Steven Hansen and Volodymyr Mnih},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.11359}\n}\n@article{warga1986higher,\n\ttitle        = {Higher order conditions with and without Lagrange multipliers},\n\tauthor       = {Warga, Jack},\n\tyear         = 1986,\n\tjournal      = {SIAM journal on control and optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 24,\n\tnumber       = 4,\n\tpages        = {715--730}\n}\n@article{warner1965randomized,\n\ttitle        = {Randomized response: A survey technique for eliminating evasive answer bias},\n\tauthor       = {Stanley L Warner},\n\tyear         = 1965,\n\tjournal      = {Journal of the American Statistical Association (JASA)},\n\tvolume       = 60,\n\tnumber       = 309,\n\tpages        = {63--69}\n}\n@article{warren82chat80,\n\ttitle        = {An Efficient Easily Adaptable System for Interpreting 
Natural Language Queries},\n\tauthor       = {D. Warren and F. Pereira},\n\tyear         = 1982,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 8,\n\tpages        = {110--122}\n}\n@article{warstadt2018cola,\n\ttitle        = {Neural Network Acceptability Judgments},\n\tauthor       = {Alex Warstadt and Amanpreet Singh and Samuel R. Bowman},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.12471}\n}\n@book{wasserman2006all,\n\ttitle        = {All of nonparametric statistics},\n\tauthor       = {Wasserman, Larry},\n\tyear         = 2006,\n\tpublisher    = {Springer Science \\& Business Media}\n}\n@article{watanabe2017question,\n\ttitle        = {Question Answering from Unstructured Text by Retrieval and Comprehension},\n\tauthor       = {Yusuke Watanabe and Bhuwan Dhingra and Ruslan Salakhutdinov},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.08885}\n}\n@article{watkins1989learning,\n\ttitle        = {Learning from delayed rewards},\n\tauthor       = {C. Watkins},\n\tyear         = 1989,\n\tjournal      = {King's College, Cambridge}\n}\n@article{watkins1992q,\n\ttitle        = {Q-learning},\n\tauthor       = {Watkins, Christopher JCH and Dayan, Peter},\n\tyear         = 1992,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 8,\n\tnumber       = {3-4},\n\tpages        = {279--292}\n}\n@inproceedings{watter2015embed,\n\ttitle        = {Embed to control: a locally linear latent dynamics model for control from raw images},\n\tauthor       = {Manuel Watter and Jost Springenberg and Joschka Boedecker and Martin Riedmiller},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2746--2754}\n}\n@inproceedings{waytowich2019narration,\n\ttitle        = {A Narration-based Reward Shaping Approach using Grounded Natural Language Commands},\n\tauthor       = {Nicholas Waytowich and Sean L. 
Barton and Vernon Lawhern and Garrett Warnell},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@misc{web19latent,\n\ttitle        = {Reinforcement Learning with Latent State Decoding Library},\n\tyear         = 2019,\n\tnote         = {{https://github.com/Microsoft/StateDecoding}}\n}\n@misc{web20ai,\n\ttitle        = {AI Goes To High School},\n\thowpublished = {\\url{forbes.com/sites/insights-intelai/2019/05/22/ai-goes-to-high-school/#40ad5d971d0c}}\n}\n@misc{web20cmu,\n\ttitle        = {{CMU} {ML} {Blog}},\n\tyear         = {{2018}},\n\tnote         = {{https://blog.ml.cmu.edu/}}\n}\n@misc{web20csta,\n\ttitle        = {Computer Science Teacher Association},\n\thowpublished = {\\url{https://www.csteachers.org/}}\n}\n@misc{web20iisme,\n\ttitle        = {IISME Community Website},\n\thowpublished = {\\url{http://community.iisme.org/ }}\n}\n@misc{web20offconvex,\n\ttitle        = {{Off the Convex Path}},\n\tyear         = {{2015}},\n\tnote         = {{http://www.offconvex.org/}}\n}\n@misc{web20oneworld,\n\ttitle        = {{One World Seminar Series on the Mathematics of Machine Learning}},\n\tyear         = 2020,\n\tnote         = {{https://sites.google.com/view/oneworldml/home}}\n}\n@misc{web21dltbook,\n\ttitle        = {Deep Learning Theory},\n\thowpublished = {\\url{https://www.cs.princeton.edu/courses/archive/fall19/cos597B/lecnotes/bookdraft.pdf}}\n}\n@misc{web21wrlt,\n\ttitle        = {Workshop on Reinforcement Learning Theory at {ICML} 2021},\n\thowpublished = {\\url{https://lyang36.github.io/icml2021\\_rltheory/}}\n}\n@misc{webai4k12,\n\ttitle        = {AI4K12 GitHub Homepage},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\\url{ https://github.com/touretzkyds/ai4k12/wiki }}\n}\n@misc{webaicurriculum,\n\ttitle        = {AI Curriculum Is Coming for K-12 At Last. 
What Will It Include?},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\\url{https://www.edsurge.com/news/2019-01-15-ai-curriculum-is-coming-for-k-12-at-last-what-will-it-include}}\n}\n@misc{webaielectricity,\n\ttitle        = {Artificial intelligence: the new electricity},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\\url{https://www.wipo.int/wipo_magazine/en/2019/03/article_0001.html}}\n}\n@misc{webapstudents,\n\ttitle        = {AP Students in College: An Analysis of Five-Year Academic Careers. Research Report No. 2007-4},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\\url{ https://eric.ed.gov/?id=ED561034 }}\n}\n@phdthesis{webber2010measurement,\n\ttitle        = {Measurement in Information Retrieval Evaluation},\n\tauthor       = {William Edward Webber},\n\tyear         = 2010,\n\tschool       = {University of Melbourne}\n}\n@article{webber2010rbo,\n\ttitle        = {A Similarity Measure for Indefinite Rankings},\n\tauthor       = {William Webber and Alistair Moffat and Justin Zobel},\n\tyear         = 2010,\n\tjournal      = {ACM Transactions on Information Systems (TOIS)}\n}\n@misc{webcomputer,\n\ttitle        = {Computer and Information Technology Occupations},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\\url{https://www.bls.gov/ooh/computer-and-information-technology/home.htm}}\n}\n@article{weber2017imagination,\n\ttitle        = {Imagination-Augmented Agents for Deep Reinforcement Learning},\n\tauthor       = {T. Weber and S. Racani{\\`e}re and D. P. Reichert and L. Buesing and A. Guez and D. J. Rezende and A. P. Badia and O. Vinyals and N. Heess and Y. 
Li and others},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.06203}\n}\n@misc{webidc4u,\n\ttitle        = {IDC4U: Data Science High School Course},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\url{ https://cognitiveclass.ai/partner-courses/data-science-high-school-vhs }}\n}\n@misc{webready,\n\ttitle        = {Read AI: Live Online AI Classes at Outschool!},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\url{https://www.readyai.org/}}\n}\n@misc{webskillshift,\n\ttitle        = {Skill shift: Automation and the future of the workforce},\n\tnote         = {Accessed: 2020-07-20},\n\thowpublished = {\url{ https://www.mckinsey.com/featured-insights/future-of-work/skill-shift-automation-and-the-future-of-the-workforce# }}\n}\n@article{Wedin1972,\n\ttitle        = {Perturbation bounds in connection with singular value decomposition},\n\tauthor       = {Wedin, Per-{\AA}ke},\n\tyear         = 1972,\n\tmonth        = mar,\n\tday          = {01},\n\tjournal      = {BIT Numerical Mathematics},\n\tpublisher    = {Springer},\n\tvolume       = 12,\n\tnumber       = 1,\n\tpages        = {99--111},\n\tdoi          = {10.1007/BF01932678},\n\tissn         = {1572-9125},\n\turl          = {https://doi.org/10.1007/BF01932678},\n\tabstract     = {Let $A$ be an $m$ {\texttimes} $n$-matrix which is slightly perturbed. In this paper we will derive an estimate of how much the invariant subspaces of $A^H A$ and $AA^H$ will then be affected. These bounds have the $\sin \Theta$ theorem for Hermitian linear operators in Davis and Kahan [1] as a special case. 
They are applicable to computational solution of overdetermined systems of linear equations and especially cover the rank deficient case when the matrix is replaced by one of lower rank.}\n}\n@article{wei1998generalized,\n\ttitle        = {Generalized leverage and its applications},\n\tauthor       = {Bo-Cheng Wei and Yue-Qing Hu and Wing-Kam Fung},\n\tyear         = 1998,\n\tjournal      = {Scandinavian Journal of Statistics},\n\tvolume       = 25,\n\tpages        = {25--37}\n}\n@inproceedings{wei2007dynamic,\n\ttitle        = {Dynamic mixture models for multiple time series},\n\tauthor       = {Wei, Xing and Sun, Jimeng and Wang, Xuerui},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\tProceedings of the 20th international joint conference on Artificial\n\t\tintelligence\n\t},\n\tlocation     = {Hyderabad, India},\n\tpublisher    = {Morgan Kaufmann Publishers Inc.},\n\taddress      = {San Francisco, CA, USA},\n\tpages        = {2909--2914},\n\tacmid        = 1625744,\n\tnumpages     = 6\n}\n@article{wei2008dynamics,\n\ttitle        = {Dynamics of learning near singularities in layered networks},\n\tauthor       = {Wei, Haikun and Zhang, Jun and Cousseau, Florent and Ozeki, Tomoko and Amari, Shun-ichi},\n\tyear         = 2008,\n\tjournal      = {Neural computation},\n\tpublisher    = {MIT Press},\n\tvolume       = 20,\n\tnumber       = 3,\n\tpages        = {813--843}\n}\n@inproceedings{wei2009non,\n\ttitle        = {A non-iterative compact model for carbon nanotube FETs incorporating source exhaustion effects},\n\tauthor       = {Wei, Lan and Frank, David J and Chang, Leland and Wong, H-SP},\n\tyear         = 2009,\n\tbooktitle    = {IEEE International Electron Devices Meeting}\n}\n@inproceedings{wei2015overview,\n\ttitle        = {Overview of the {BioCreative} {V} chemical disease relation (CDR) task},\n\tauthor       = {Chih-Hsuan Wei and Yifan Peng and Robert Leaman and Allan Peter Davis and Carolyn J Mattingly and Jiao Li and Thomas C Wiegers and Zhiyong 
Lu},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the Fifth BioCreative Challenge Evaluation Workshop},\n\tpages        = {154--166}\n}\n@inproceedings{wei2017online,\n\ttitle        = {Online Reinforcement Learning in Stochastic Games},\n\tauthor       = {Wei, Chen-Yu and Hong, Yi-Te and Lu, Chi-Jen},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {4994--5004}\n}\n@inproceedings{wei2019data,\n\ttitle        = {Data-dependent sample complexity of deep neural networks via lipschitz augmentation},\n\tauthor       = {Wei, Colin and Ma, Tengyu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1905.03684},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {9722--9733}\n}\n@article{wei2019improved,\n\ttitle        = {Improved sample complexities for deep networks and robust classification via an all-layer margin},\n\tauthor       = {Wei, Colin and Ma, Tengyu},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.04284}\n}\n@article{wei2019noise,\n\ttitle        = {How noise affects the Hessian spectrum in overparameterized neural networks},\n\tauthor       = {Wei, Mingwei and Schwab, David J},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.00195}\n}\n@inproceedings{wei2019regularization,\n\ttitle        = {Regularization matters: Generalization and optimization of neural nets vs their induced kernel},\n\tauthor       = {Wei, Colin and Lee, Jason D and Liu, Qiang and Ma, Tengyu},\n\tyear         = 2019,\n\tjournal      = {Neural Information Processing Systems (NeurIPS)},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {9709--9721}\n}\n@article{wei2020implicit,\n\ttitle        = {The Implicit and Explicit Regularization Effects of Dropout},\n\tauthor       = {Wei, Colin and Kakade, Sham and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint 
arXiv:2002.12915}\n}\n@inproceedings{wei2020improved,\n\ttitle        = {Improved Sample Complexities for Deep Networks and Robust Classification via an All-Layer Margin},\n\tauthor       = {Colin Wei and Tengyu Ma},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@misc{wei2020theoretical,\n\ttitle        = {Theoretical Analysis of Self-Training with Deep Networks on Unlabeled Data},\n\tauthor       = {Colin Wei and Kendrick Shen and Yining Chen and Tengyu Ma},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.03622},\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=rC8sJ4i6kaH},\n\teprint       = {2010.03622},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{wei2021finetuned,\n\ttitle        = {Finetuned Language Models Are Zero-Shot Learners},\n\tauthor       = {Jason Wei and Maarten Bosma and Vincent Y. Zhao and Kelvin Guu and Adams Wei Yu and Brian Lester and Nan Du and Andrew M. Dai and Quoc V. Le},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@article{wei2021pretrained,\n\ttitle        = {Why Do Pretrained Language Models Help in Downstream Tasks? An Analysis of Head and Prompt Tuning},\n\tauthor       = {Wei, Colin and Xie, Sang Michael and Ma, Tengyu},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.09226}\n}\n@article{wei2021why,\n\ttitle        = {Why Do Pretrained Language Models Help in Downstream Tasks? An Analysis of Head and Prompt Tuning},\n\tauthor       = {Colin Wei and Sang Michael Xie and Tengyu Ma},\n\tyear         = 2021,\n\tjournal      = {arXiv}\n}\n@inproceedings{weigelt2020programmingfuse,\n\ttitle        = {Programming in Natural Language with fuSE: Synthesizing Methods from Spoken Utterances Using Deep Natural Language Understanding},\n\tauthor       = {Sebastian Weigelt and Vanessa Steurer and Tobias Hey and W. 
Tichy},\n\tyear         = 2020,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{weinberger2015speech,\n\ttitle        = {Speech accent archive},\n\tauthor       = {Steven Weinberger},\n\tyear         = 2015,\n\tjournal      = {George Mason University}\n}\n@article{weinstein2013cancer,\n\ttitle        = {The cancer genome atlas pan-cancer analysis project},\n\tauthor       = {John N Weinstein and Eric A Collisson and Gordon B Mills and Kenna R Mills Shaw and Brad A Ozenberger and Kyle Ellrott and Ilya Shmulevich and Chris Sander and Joshua M Stuart and Cancer Genome Atlas Research Network and others},\n\tyear         = 2013,\n\tjournal      = {Nature genetics},\n\tvolume       = 45,\n\tnumber       = 10\n}\n@inproceedings{weisman2012learning,\n\ttitle        = {Learning Verb Inference Rules from Linguistically-motivated Evidence},\n\tauthor       = {Hila Weisman and Jonathan Berant and Idan Szpektor and Ido Dagan},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{weiss2010cascades,\n\ttitle        = {Structured Prediction Cascades},\n\tauthor       = {David Weiss and Ben Taskar},\n\tyear         = 2010,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@inproceedings{weiss2010sidestepping,\n\ttitle        = {Sidestepping intractable inference with structured ensemble cascades},\n\tauthor       = {David Weiss and Benjamin Sapp and Ben Taskar},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2415--2423}\n}\n@inproceedings{weiss2013adaptive,\n\ttitle        = {Learning adaptive value of information for structured prediction},\n\tauthor       = {David J Weiss and Ben Taskar},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {953--961}\n}\n@article{weiss2016survey,\n\ttitle        = {A survey of 
transfer learning},\n\tauthor       = {Karl Weiss and Taghi M Khoshgoftaar and DingDing Wang},\n\tyear         = 2016,\n\tjournal      = {Journal of Big Data},\n\tvolume       = 3\n}\n@article{weiss2018practical,\n\ttitle        = {On the practical computational power of finite precision RNNs for language recognition},\n\tauthor       = {Weiss, Gail and Goldberg, Yoav and Yahav, Eran},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.04908}\n}\n@inproceedings{weissenborn2017fastqa,\n\ttitle        = {Making Neural {QA} as Simple as Possible but not Simpler},\n\tauthor       = {Dirk Weissenborn and Georg Wiese and Laura Seiffe},\n\tyear         = 2017,\n\tbooktitle    = {Computational Natural Language Learning (CoNLL)}\n}\n@article{weissman2003inequalities,\n\ttitle        = {Inequalities for the L1 deviation of the empirical distribution},\n\tauthor       = {Weissman, Tsachy and Ordentlich, Erik and Seroussi, Gadiel and Verdu, Sergio and Weinberger, Marcelo J},\n\tyear         = 2003,\n\tjournal      = {Hewlett-Packard Labs, Tech. 
Rep}\n}\n@article{weisstein2003gershgorin,\n\ttitle        = {Gershgorin circle theorem},\n\tauthor       = {Weisstein, Eric W},\n\tyear         = 2003,\n\tpublisher    = {Wolfram Research, Inc.}\n}\n@misc{weisz2020exponential,\n\ttitle        = {Exponential Lower Bounds for Planning in MDPs With Linearly-Realizable Optimal Action-Value Functions},\n\tauthor       = {Gellert Weisz and Philip Amortila and Csaba Szepesvári},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.01374},\n\teprint       = {2010.01374},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@article{weizenbaum1966eliza,\n\ttitle        = {{ELIZA}--a computer program for the study of natural language communication between man and machine},\n\tauthor       = {Joseph Weizenbaum},\n\tyear         = 1966,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {36--45}\n}\n@article{welbl2017constructing,\n\ttitle        = {Constructing Datasets for Multi-hop Reading Comprehension Across Documents},\n\tauthor       = {Johannes Welbl and Pontus Stenetorp and Sebastian Riedel},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.06481}\n}\n@misc{welch1995introduction,\n\ttitle        = {An introduction to the {K}alman filter},\n\tauthor       = {Welch, Greg and Bishop, Gary},\n\tyear         = 1995\n}\n@misc{welch2001introduction,\n\ttitle        = {An Introduction to the Kalman Filter, SIGGRAPH 2001 Courses},\n\tauthor       = {Gregory Welch and Gary Bishop},\n\tyear         = 2001\n}\n@techreport{WelinderEtal2010,\n\ttitle        = {{Caltech-UCSD Birds 200}},\n\tauthor       = {P. Welinder and S. Branson and T. Mita and C. Wah and F. Schroff and S. Belongie and P. 
Perona},\n\tyear         = 2010,\n\tnumber       = {CNS-TR-2010-001},\n\tinstitution  = {California Institute of Technology}\n}\n@inproceedings{welling06flexible,\n\ttitle        = {Flexible Priors for Infinite Mixture Models},\n\tauthor       = {M. Welling},\n\tyear         = 2006,\n\tbooktitle    = {International Conference on Machine Learning Workshop on Nonparametric Bayesian Methods}\n}\n@inproceedings{welling2011bayesian,\n\ttitle        = {Bayesian learning via stochastic gradient Langevin dynamics},\n\tauthor       = {Welling, Max and Teh, Yee W},\n\tyear         = 2011,\n\tbooktitle    = {Proceedings of the 28th international conference on machine learning (ICML-11)},\n\tpages        = {681--688}\n}\n@book{welzl1991smallest,\n\ttitle        = {Smallest enclosing disks (balls and ellipsoids)},\n\tauthor       = {Welzl, Emo},\n\tyear         = 1991,\n\tpublisher    = {Springer}\n}\n@techreport{wen2009first,\n\ttitle        = {First-order methods for semidefinite programming},\n\tauthor       = {Zaiwen Wen},\n\tyear         = 2009,\n\tinstitution  = {Columbia University}\n}\n@inproceedings{wen2013efficient,\n\ttitle        = {Efficient exploration and value function generalization in deterministic systems},\n\tauthor       = {Wen, Zheng and Van Roy, Benjamin},\n\tyear         = 2013,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3021--3029}\n}\n@inproceedings{wen2014robust,\n\ttitle        = {Robust Learning under Uncertain Test Distributions: Relating Covariate Shift to Model Misspecification},\n\tauthor       = {Junfeng Wen and Chun-Nam Yu and Russell Greiner},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {631--639}\n}\n@inproceedings{wen2015semantically,\n\ttitle        = {Semantically conditioned {LSTM}-based natural language generation for spoken dialogue 
systems},\n\tauthor       = {Tsung-Hsien Wen and Milica Gasic and Nikola Mrksic and Pei-Hao Su and David Vandyke and Steve Young},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{wen2017efficient,\n\ttitle        = {Efficient reinforcement learning in deterministic systems with value function generalization},\n\tauthor       = {Wen, Zheng and Van Roy, Benjamin},\n\tyear         = 2017,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 42,\n\tnumber       = 3,\n\tpages        = {762--782}\n}\n@inproceedings{wen2017latent,\n\ttitle        = {Latent Intention Dialogue Models},\n\tauthor       = {Tsung-Hsien Wen and Yishu Miao and Phil Blunsom and Steve Young},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{wen2017network,\n\ttitle        = {A Network-based End-to-End Trainable Task-oriented Dialogue System},\n\tauthor       = {Tsung-Hsien Wen and Milica Gasic and Nikola Mrksic and Lina M Rojas-Barahona and Pei-Hao Su and Stefan Ultes and David Vandyke and Steve Young},\n\tyear         = 2017,\n\tbooktitle    = {European Association for Computational Linguistics (EACL)},\n\tpages        = {438--449}\n}\n@article{wen2019interplay,\n\ttitle        = {Interplay between optimization and generalization of stochastic gradient descent with covariance noise},\n\tauthor       = {Wen, Yeming and Luk, Kevin and Gazeau, Maxime and Zhang, Guodong and Chan, Harris and Ba, Jimmy},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.08234}\n}\n@article{Wendel1948note,\n\ttitle        = {Note on the gamma function},\n\tauthor       = {Wendel, J. 
G.},\n\tyear         = 1948,\n\tjournal      = {The American Mathematical Monthly},\n\tvolume       = 55,\n\tnumber       = 9,\n\tpages        = {563--564}\n}\n@article{weng2018towards,\n\ttitle        = {Towards Fast Computation of Certified Robustness for ReLU Networks},\n\tauthor       = {Tsui-Wei Weng and Huan Zhang and Hongge Chen and Zhao Song and Cho-Jui Hsieh and Duane Boning and Inderjit S Dhillon and Luca Daniel},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1804.09699}\n}\n@article{werbos1974beyond,\n\ttitle        = {Beyond regression: New tools for prediction and analysis in the behavioral sciences},\n\tauthor       = {Werbos, Paul},\n\tyear         = 1974\n}\n@inproceedings{werling2015onthejob,\n\ttitle        = {On-the-Job Learning with {B}ayesian Decision Theory},\n\tauthor       = {Keenon Werling and Arun Chaganty and Percy Liang and Chris Manning},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{werling2015robust,\n\ttitle        = {Robust subgraph generation improves abstract meaning representation parsing},\n\tauthor       = {Keenon Werling and Gabor Angeli and Christopher Manning},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{werts1995instructive,\n\ttitle        = {Instructive feedback: Review of parameters and effects},\n\tauthor       = {Margaret G Werts and Mark Wolery and Ariane Holcombe and David L Gast},\n\tyear         = 1995,\n\tjournal      = {Journal of Behavioral Education},\n\tvolume       = 5,\n\tnumber       = 1,\n\tpages        = {55--75}\n}\n@article{west2014exploiting,\n\ttitle        = {Exploiting social network structure for person-to-person sentiment analysis},\n\tauthor       = {Robert West and Hristo S Paskov and Jure Leskovec and Christopher Potts},\n\tyear         = 2014,\n\tjournal      = {Transactions of the Association for Computational Linguistics 
(TACL)},\n\tvolume       = 2,\n\tpages        = {297--310}\n}\n@inproceedings{west2014knowledge,\n\ttitle        = {Knowledge base completion via search-based question answering},\n\tauthor       = {Robert West and Evgeniy Gabrilovich and Kevin Murphy and Shaohua Sun and Rahul Gupta and Dekang Lin},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the 23rd international conference on World wide web}\n}\n@inproceedings{west2019bottlesum,\n\ttitle        = {BottleSum: Self-Supervised and Unsupervised Sentence Summarization using the Information Bottleneck Principle},\n\tauthor       = {Peter West and Ari Holtzman and Jan Buys and Yejin Choi},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@techreport{west95hyper,\n\ttitle        = {Hyperparameter estimation in {D}irichlet process mixture models},\n\tauthor       = {M. West},\n\tyear         = 1995,\n\tinstitution  = {Department of Statistics, Duke University (Duke University)}\n}\n@book{weste2010cmos,\n\ttitle        = {{CMOS} {VLSI} design: a circuits and systems perspective},\n\tauthor       = {Weste, Neil and Harris, David},\n\tyear         = 2010,\n\tpublisher    = {Addison-Wesley Publishing Company}\n}\n@article{weston2014memory,\n\ttitle        = {Memory networks},\n\tauthor       = {Weston, Jason and Chopra, Sumit and Bordes, Antoine},\n\tyear         = 2014,\n\tjournal      = {arXiv preprint arXiv:1410.3916}\n}\n@inproceedings{weston2015memory,\n\ttitle        = {Memory Networks},\n\tauthor       = {Jason Weston and Sumit Chopra and Antoine Bordes},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{weston2015towards,\n\ttitle        = {Towards {AI}-Complete Question Answering: A Set of Prerequisite Toy Tasks},\n\tauthor       = {Jason Weston and Antoine Bordes and Sumit Chopra and Tomas Mikolov},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint 
arXiv:1502.05698}\n}\n@inproceedings{weston2016dialog,\n\ttitle        = {Dialog-based language learning},\n\tauthor       = {Jason E Weston},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {829--837}\n}\n@inproceedings{Weyer99,\n\ttitle        = {Finite Sample Properties of System Identification Methods},\n\tauthor       = {Erik Weyer and M. C. Campi},\n\tyear         = 1999,\n\tbooktitle    = {Proceedings of the 38th Conference on Decision and Control},\n\tdate-added   = {2016-04-02 18:44:53 +0000},\n\tdate-modified = {2016-04-02 18:45:38 +0000}\n}\n@article{Weyl1912,\n\ttitle        = {Das asymptotische Verteilungsgesetz der Eigenwerte linearer partieller Differentialgleichungen (mit einer Anwendung auf die Theorie der Hohlraumstrahlung)},\n\tauthor       = {Weyl, H.},\n\tyear         = 1912,\n\tjournal      = {Mathematische Annalen},\n\tvolume       = 71,\n\tpages        = {441--479},\n\turl          = {http://eudml.org/doc/158545}\n}\n@article{white06hlds,\n\ttitle        = {Efficient Realization of Coordinate Structures in Combinatory Categorial Grammar},\n\tauthor       = {Michael White},\n\tyear         = 2006,\n\tjournal      = {Research on Language and Computation},\n\tvolume       = 4,\n\tpages        = {39--75}\n}\n@inproceedings{white1994arpa,\n\ttitle        = {The {ARPA} {MT} evaluation methodologies: evaluation, lessons, and future approaches},\n\tauthor       = {John White and Theresa O'Connell and Francis O'Mara},\n\tyear         = 1994,\n\tbooktitle    = {First Conference of the Association for Machine Translation in the Americas}\n}\n@article{white1999chatting,\n\ttitle        = {Chatting a singer up the pop charts},\n\tauthor       = {Erin White},\n\tyear         = 1999,\n\tjournal      = {Wall Street Journal}\n}\n@article{white82mis,\n\ttitle        = {Maximum likelihood estimation of misspecified models},\n\tauthor       = {Halbert White},\n\tyear         = 
1982,\n\tjournal      = {Econometrica},\n\tvolume       = 50\n}\n@inproceedings{whitehead2014complexity,\n\ttitle        = {Complexity and cooperation in {Q}-learning},\n\tauthor       = {Whitehead, Steven D},\n\tyear         = 2014,\n\tbooktitle    = {Proceedings of the Eighth International Workshop on Machine Learning},\n\tpages        = {363--367}\n}\n@article{whitney1934analytic,\n\ttitle        = {Analytic extensions of differentiable functions defined in closed sets},\n\tauthor       = {Whitney, Hassler},\n\tyear         = 1934,\n\tjournal      = {Transactions of the American Mathematical Society},\n\tpublisher    = {JSTOR},\n\tvolume       = 36,\n\tnumber       = 1,\n\tpages        = {63--89}\n}\n@article{whittle1988restless,\n\ttitle        = {Restless bandits: Activity allocation in a changing world},\n\tauthor       = {Whittle, Peter},\n\tyear         = 1988,\n\tjournal      = {Journal of applied probability},\n\tpublisher    = {JSTOR},\n\tpages        = {287--298}\n}\n@article{wibisono2016variational,\n\ttitle        = {A variational perspective on accelerated methods in optimization},\n\tauthor       = {Wibisono, Andre and Wilson, Ashia C and Jordan, Michael I},\n\tyear         = 2016,\n\tjournal      = {proceedings of the National Academy of Sciences},\n\tpublisher    = {National Acad Sciences},\n\tvolume       = 113,\n\tnumber       = 47,\n\tpages        = {E7351--E7358}\n}\n@inproceedings{wick2011query,\n\ttitle        = {Query-Aware {MCMC}},\n\tauthor       = {Michael L Wick and Andrew McCallum},\n\tyear         = 2011,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2564--2572}\n}\n@article{widmann2019calibration,\n\ttitle        = {Calibration tests in multi-class classification: A unifying framework},\n\tauthor       = {Widmann, David and Lindsten, Fredrik and Zachariah, Dave},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.11385},\n\tbooktitle    = {Advances in Neural 
Information Processing Systems (NeurIPS)}\n}\n@article{wieland2007constructive,\n\ttitle        = {Constructive safety using control barrier functions},\n\tauthor       = {Wieland, Peter and Allg{\\\"o}wer, Frank},\n\tyear         = 2007,\n\tjournal      = {IFAC Proceedings Volumes},\n\tpublisher    = {Elsevier},\n\tvolume       = 40,\n\tnumber       = 12,\n\tpages        = {462--467}\n}\n@article{wiering2012reinforcement,\n\ttitle        = {Reinforcement learning},\n\tauthor       = {Wiering, Marco and Van Otterlo, Martijn},\n\tyear         = 2012,\n\tjournal      = {Adaptation, Learning, and Optimization},\n\tvolume       = 12\n}\n@inproceedings{wieting2018paranmt,\n\ttitle        = {{P}ara{NMT}-50{M}: Pushing the Limits of Paraphrastic Sentence Embeddings with Millions of Machine Translations},\n\tauthor       = {John Wieting and Kevin Gimpel},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@book{wightman1998lsac,\n\ttitle        = {{LSAC} national longitudinal bar passage study},\n\tauthor       = {Linda F Wightman and Henry Ramsey},\n\tyear         = 1998,\n\tpublisher    = {Law School Admission Council}\n}\n@article{wigner1955characteristic,\n\ttitle        = {Characteristic vectors of bordered matrices with infinite dimensions},\n\tauthor       = {Wigner, Eugene P},\n\tyear         = 1955,\n\tjournal      = {The Annals of Mathematics},\n\tpublisher    = {JSTOR},\n\tvolume       = 62,\n\tnumber       = 3,\n\tpages        = {548--564}\n}\n@misc{wiki-hermite,\n\ttitle        = {Hermite polynomials --- Wikipedia{,} The Free Encyclopedia},\n\tauthor       = {Wikipedia},\n\tyear         = 2017,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Hermite_polynomials&oldid=796842411},\n\tnote         = {[Online; accessed 1-September-2017 ]}\n}\n@misc{wiki:fixedpoint,\n\ttitle        = {Schauder fixed point theorem --- Wikipedia{,} The Free Encyclopedia},\n\tauthor       = {Wikipedia},\n\tyear         = 
2016,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Schauder_fixed_point_theorem&oldid=722238234},\n\tnote         = {[Online; accessed 26-May-2016]}\n}\n@misc{wiki:incompleteGamma,\n\ttitle        = {Incomplete gamma function --- Wikipedia{,} The Free Encyclopedia},\n\tauthor       = {Wikipedia},\n\tyear         = 2016,\n\turl          = {\\url{https://en.wikipedia.org/w/index.php?title=Incomplete_gamma_function&oldid=730854137}},\n\tnote         = {[Online; accessed 13-September-2016]}\n}\n@misc{wiki:JL,\n\ttitle        = {Johnson–Lindenstrauss lemma --- Wikipedia{,} The Free Encyclopedia},\n\tauthor       = {Wikipedia},\n\tyear         = 2016,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Johnson%E2%80%93Lindenstrauss_lemma&oldid=743553642}\n}\n@misc{wiki:power_series,\n\ttitle        = {Formal power series --- Wikipedia{,} The Free Encyclopedia},\n\tauthor       = {Wikipedia},\n\tyear         = 2017,\n\turl          = {https://en.wikipedia.org/w/index.php?title=Formal_power_series&oldid=797671381},\n\tnote         = {[Online; accessed 20-September-2017 ]}\n}\n@article{williams1991function,\n\ttitle        = {Function optimization using connectionist reinforcement learning algorithms},\n\tauthor       = {Ronald J Williams and Jing Peng},\n\tyear         = 1991,\n\tjournal      = {Connection Science},\n\tvolume       = 3,\n\tnumber       = 3,\n\tpages        = {241--268}\n}\n@article{williams1992simple,\n\ttitle        = {Simple statistical gradient-following algorithms for connectionist reinforcement learning},\n\tauthor       = {Williams, Ronald J},\n\tyear         = 1992,\n\tjournal      = {Machine learning},\n\tpublisher    = {Springer},\n\tvolume       = 8,\n\tnumber       = {3-4},\n\tpages        = {229--256}\n}\n@inproceedings{williams2001using,\n\ttitle        = {Using the {Nystr{\\\"o}m} method to speed up kernel machines},\n\tauthor       = {Williams, Christopher and Seeger, Matthias},\n\tyear         = 2001,\n\tbooktitle 
   = {Advances in Neural Information Processing Systems},\n\tnumber       = {},\n\tpages        = {682--688}\n}\n@article{williams2007partially,\n\ttitle        = {Partially observable {M}arkov decision processes for spoken dialog systems},\n\tauthor       = {Jason D Williams and Steve Young},\n\tyear         = 2007,\n\tjournal      = {Computer Speech \\& Language},\n\tvolume       = 21,\n\tnumber       = 2,\n\tpages        = {393--422}\n}\n@inproceedings{williams2012omega,\n\ttitle        = {Multiplying matrices faster than {C}oppersmith-{W}inograd},\n\tauthor       = {Virginia Vassilevska Williams},\n\tyear         = 2012,\n\tbooktitle    = {Symposium on Theory of Computing (STOC)}\n}\n@inproceedings{williams2013dialog,\n\ttitle        = {The dialog state tracking challenge},\n\tauthor       = {Jason Williams and Antoine Raux and Deepak Ramachandran and Alan Black},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the SIGDIAL 2013 Conference},\n\tpages        = {404--413}\n}\n@article{williams2015model,\n\ttitle        = {Model predictive path integral control using covariance variable importance sampling},\n\tauthor       = {Williams, Grady and Aldrich, Andrew and Theodorou, Evangelos},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1509.01149}\n}\n@inproceedings{williams2016axis,\n\ttitle        = {Axis: Generating explanations at scale with learnersourcing and machine learning},\n\tauthor       = {Joseph Jay Williams and Juho Kim and Anna Rafferty and Samuel Maldonado and Krzysztof Z Gajos and Walter S Lasecki and Neil Heffernan},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the Third (2016) ACM Conference on Learning@Scale},\n\tpages        = {379--388}\n}\n@article{williams2016dstc,\n\ttitle        = {The Dialog State Tracking Challenge Series: A Review},\n\tauthor       = {Jason D. 
Williams and Antoine Raux and Matthew Henderson},\n\tyear         = 2016,\n\tjournal      = {Dialogue and Discourse},\n\tvolume       = 7\n}\n@inproceedings{williams2017dialog,\n\ttitle        = {Hybrid Code Networks: Practical and Efficient End-to-End Dialog Control with Supervised and Reinforcement Learning},\n\tauthor       = {Jason D. Williams and Kavosh Asadi and Geoffrey Zweig},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{williams2018broad,\n\ttitle        = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},\n\tauthor       = {Adina Williams and Nikita Nangia and Samuel Bowman},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {1112--1122}\n}\n@article{wilson1996relative,\n\ttitle        = {Relative end-effector control using cartesian position based visual servoing},\n\tauthor       = {William J Wilson and CC Williams Hulls and Graham S Bell},\n\tyear         = 1996,\n\tjournal      = {IEEE Transactions on Robotics (T-RO)},\n\tvolume       = 12,\n\tpages        = {684--696}\n}\n@article{wilson2016lyapunov,\n\ttitle        = {A {L}yapunov analysis of momentum methods in optimization},\n\tauthor       = {Wilson, Ashia C and Recht, Benjamin and Jordan, Michael I},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.02635}\n}\n@inproceedings{wilson2017marginal,\n\ttitle        = {The marginal value of adaptive gradient methods in machine learning},\n\tauthor       = {Wilson, Ashia C and Roelofs, Rebecca and Stern, Mitchell and Srebro, Nati and Recht, Benjamin},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {4148--4158}\n}\n@article{winder66partition,\n\ttitle        = {Partitions of {N}-Space by Hyperplanes},\n\tauthor       = {R. O. 
Winder},\n\tyear         = 1966,\n\tjournal      = {SIAM Journal on Applied Mathematics},\n\tvolume       = 14,\n\tnumber       = 4,\n\tpages        = {811--818}\n}\n@book{winograd1972language,\n\ttitle        = {Understanding Natural Language},\n\tauthor       = {Terry Winograd},\n\tyear         = 1972,\n\tpublisher    = {Academic Press}\n}\n@incollection{winograd1991thinking,\n\ttitle        = {Thinking machines: Can there be? Are we?},\n\tauthor       = {Terry Winograd},\n\tyear         = 1991,\n\tbooktitle    = {The Boundaries of Humanity: Humans, Animals, Machines},\n\tpages        = {198--223}\n}\n@inproceedings{winstein2013tcp,\n\ttitle        = {{TCP} ex Machina: Computer-Generated Congestion Control},\n\tauthor       = {Keith Winstein and Hari Balakrishnan},\n\tyear         = 2013,\n\tbooktitle    = {SIGCOMM}\n}\n@inproceedings{wiseman2016beam,\n\ttitle        = {Sequence-to-Sequence Learning as Beam-Search Optimization},\n\tauthor       = {Sam Wiseman and Alexander M. Rush},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{witten1987arithmetic,\n\ttitle        = {Arithmetic coding for data compression},\n\tauthor       = {Ian H. Witten and Radford M. Neal and John G. Cleary},\n\tyear         = 1987,\n\tjournal      = {Communications of the ACM},\n\tvolume       = 30,\n\tpages        = {520--540}\n}\n@article{witten2009penalized,\n\ttitle        = {A penalized matrix decomposition, with applications to sparse principal components and canonical correlation analysis},\n\tauthor       = {Witten, Daniela M and Tibshirani, Robert and Hastie, Trevor},\n\tyear         = 2009,\n\tjournal      = {Biostatistics},\n\tpublisher    = {Biometrika Trust},\n\tpages        = {kxp008}\n}\n@book{wittgenstein1953philosophical,\n\ttitle        = {Philosophical Investigations},\n\tauthor       = {L. 
Wittgenstein},\n\tyear         = 1953,\n\tpublisher    = {Blackwell, Oxford}\n}\n@article{WJ,\n\ttitle        = {Graphical models, exponential families, and variational inference},\n\tauthor       = {M. Wainwright and M. Jordan},\n\tyear         = 2008,\n\tjournal      = {Foundations and Trends in Machine Learning},\n\tpages        = {1--305}\n}\n@article{wojnowicz2016sketching,\n\ttitle        = {``{I}nfluence Sketching'': Finding Influential Samples In Large-Scale Regressions},\n\tauthor       = {Mike Wojnowicz and Ben Cruz and Xuan Zhao and Brian Wallace and Matt Wolff and Jay Luan and Caleb Crable},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.05923}\n}\n@article{wolf2019transformers,\n\ttitle        = {{HuggingFace}'s Transformers: State-of-the-art Natural Language Processing},\n\tauthor       = {Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.03771}\n}\n@inproceedings{wolff88syntax,\n\ttitle        = {Learning syntax and meanings through optimization and distributional analysis},\n\tauthor       = {J. G. 
Wolff},\n\tyear         = 1988,\n\tbooktitle    = {Categories and processes in language acquisition},\n\tpages        = {179--215}\n}\n@misc{wolfram:incompleteGamma,\n\ttitle        = {Incomplete Gamma Function --- From MathWorld--A Wolfram Web Resource},\n\tauthor       = {Weisstein, Eric W.},\n\tyear         = 2016,\n\turl          = {\\url{http://mathworld.wolfram.com/IncompleteGammaFunction.html}}\n}\n@misc{wolfram2009alpha,\n\ttitle        = {Wolfram|Alpha},\n\tauthor       = {Wolfram Alpha LLC},\n\tyear         = 2009,\n\thowpublished = {\\url{http://www.wolframalpha.com/}}\n}\n@article{wolfson2020break,\n\ttitle        = {Break It Down: A Question Understanding Benchmark},\n\tauthor       = {Tomer Wolfson and Mor Geva and Ankit Gupta and Matt Gardner and Yoav Goldberg and Daniel Deutch and Jonathan Berant},\n\tyear         = 2020,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 8\n}\n@inproceedings{wong06mt,\n\ttitle        = {Learning for semantic parsing with statistical machine translation},\n\tauthor       = {Yuk Wah Wong and Raymond J. Mooney},\n\tyear         = 2006,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)},\n\tpages        = {439--446}\n}\n@inproceedings{wong07generation,\n\ttitle        = {Generation by Inverting a Semantic Parser That Uses Statistical Machine Translation},\n\tauthor       = {Yuk Wah Wong and Raymond J. Mooney},\n\tyear         = 2007,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {172--179}\n}\n@inproceedings{wong07synchronous,\n\ttitle        = {Learning Synchronous Grammars for Semantic Parsing with Lambda Calculus},\n\tauthor       = {Yuk Wah Wong and Raymond J. 
Mooney},\n\tyear         = 2007,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {960--967}\n}\n@article{wong1980efficient,\n\ttitle        = {An efficient method for weighted sampling without replacement},\n\tauthor       = {Wong, Chak-Kuen and Easton, Malcolm C.},\n\tyear         = 1980,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 9,\n\tnumber       = 1,\n\tpages        = {111--113}\n}\n@inproceedings{wong2009scalable,\n\ttitle        = {Scalable attribute-value extraction from semi-structured text},\n\tauthor       = {Yuk Wah Wong and Dominic Widdows and Tom Lokovic and Kamal Nigam},\n\tyear         = 2009,\n\tbooktitle    = {IEEE International Conference on Data Mining Workshops},\n\tpages        = {302--307}\n}\n@inproceedings{wong2018provable,\n\ttitle        = {Provable defenses against adversarial examples via the convex outer adversarial polytope},\n\tauthor       = {Eric Wong and J. Zico Kolter},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{wong2018scaling,\n\ttitle        = {Scaling provable adversarial defenses},\n\tauthor       = {Eric Wong and Frank Schmidt and Jan Hendrik Metzen and J Zico Kolter},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{wong2020learningpert,\n\ttitle        = {Learning perturbation sets for robust machine learning},\n\tauthor       = {Eric Wong and J. 
Zico Kolter},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{wongpiromsarn2010receding,\n\ttitle        = {Receding horizon control for temporal logic specifications},\n\tauthor       = {Tichakorn Wongpiromsarn and Ufuk Topcu and Richard M Murray},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Hybrid Systems: Computation and Control},\n\tpages        = {101--110}\n}\n@article{wood1994simulation,\n\ttitle        = {Simulation of the von Mises Fisher distribution},\n\tauthor       = {Andrew T. Wood},\n\tyear         = 1994,\n\tjournal      = {Communications in statistics-simulation and computation},\n\tpages        = {157--164}\n}\n@inproceedings{wood2009stochastic,\n\ttitle        = {A stochastic memoizer for sequence data},\n\tauthor       = {Frank Wood and Cédric Archambeau and Jan Gasthaus and Lancelot James and Yee Whye Teh},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1129--1136}\n}\n@article{woodruff2014sketching,\n\ttitle        = {Sketching as a tool for numerical linear algebra},\n\tauthor       = {Woodruff, David P},\n\tyear         = 2014,\n\tjournal      = {Foundations and Trends{\\textregistered} in Theoretical Computer Science},\n\tpublisher    = {Now Publishers, Inc.},\n\tvolume       = 10,\n\tnumber       = {1--2},\n\tpages        = {1--157}\n}\n@techreport{woods72lunar,\n\ttitle        = {The Lunar Sciences Natural Language Information System: Final Report},\n\tauthor       = {W. A. Woods and R. M. Kaplan and B. N. 
Webber},\n\tyear         = 1972,\n\tinstitution  = {BBN Report 2378, Bolt Beranek and Newman Inc.}\n}\n@inproceedings{woodworth2016tight,\n\ttitle        = {Tight complexity bounds for optimizing composite objectives},\n\tauthor       = {Woodworth, Blake E and Srebro, Nati},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {3639--3647}\n}\n@inproceedings{woodworth2017,\n\ttitle        = {Learning Non-Discriminatory Predictors},\n\tauthor       = {Blake Woodworth and Suriya Gunasekar and Mesrob I. Ohannessian and Nathan Srebro},\n\tyear         = 2017,\n\tbooktitle    = {Conference on Learning Theory (COLT)},\n\tpages        = {1920--1953}\n}\n@article{woodworth2019kernel,\n\ttitle        = {Kernel and Deep Regimes in Overparametrized Models},\n\tauthor       = {Woodworth, Blake and Gunasekar, Suriya and Lee, Jason and Soudry, Daniel and Srebro, Nathan},\n\tyear         = 2020,\n\tjournal      = {Conference on Learning Theory (COLT)}\n}\n@article{woodworth2020kernel,\n\ttitle        = {Kernel and rich regimes in overparametrized models},\n\tauthor       = {Woodworth, Blake and Gunasekar, Suriya and Lee, Jason D and Moroshko, Edward and Savarese, Pedro and Golan, Itay and Soudry, Daniel and Srebro, Nathan},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.09277}\n}\n@inproceedings{WoodworthSrebro2016,\n\ttitle        = {{Tight Complexity Bounds for Optimizing Composite Objectives}},\n\tauthor       = {Blake Woodworth and Nati Srebro},\n\tyear         = 2016,\n\tbooktitle    = {NIPS}\n}\n@inproceedings{word2vec,\n\ttitle        = {Distributed Representations of Words and Phrases and their Compositionality},\n\tauthor       = {Tomas Mikolov and Ilya Sutskever and Kai Chen and Gregory S. 
Corrado and Jeffrey Dean},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS), 2013},\n\turl          = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality},\n\tcrossref     = {DBLP:conf/nips/2013},\n\ttimestamp    = {Thu, 07 May 2015 20:02:01 +0200},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/nips/MikolovSCCD13},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@inproceedings{worrall2017harmonic,\n\ttitle        = {Harmonic networks: Deep translation and rotation equivariance},\n\tauthor       = {Daniel E Worrall and Stephan J Garbin and Daniyar Turmukhambetov and Gabriel J Brostow},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {5028--5037}\n}\n@article{wright2012sparse,\n\ttitle        = {Exact Recovery of Sparsely-Used Dictionaries},\n\tauthor       = {John Wright},\n\tyear         = 2012,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 1,\n\tpages        = {1--35}\n}\n@inproceedings{wu2010open,\n\ttitle        = {Open information extraction using {W}ikipedia},\n\tauthor       = {Fei Wu and Daniel S Weld},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {118--127}\n}\n@inproceedings{wu2016ask,\n\ttitle        = {Ask me anything: Free-form visual question answering based on knowledge from external sources},\n\tauthor       = {Qi Wu and Peng Wang and Chunhua Shen and Anthony Dick and Anton van den Hengel},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4622--4630}\n}\n@inproceedings{wu2016collaborative,\n\ttitle        = {Collaborative denoising auto-encoders for top-n recommender systems},\n\tauthor       = {Wu, Yao and DuBois, Christopher and Zheng, Alice X and Ester, Martin},\n\tyear         = 2016,\n\tbooktitle    = 
{Proceedings of the Ninth ACM International Conference on Web Search and Data Mining},\n\tpages        = {153--162},\n\torganization = {ACM}\n}\n@article{wu2016google,\n\ttitle        = {Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},\n\tauthor       = {Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V Le and Mohammad Norouzi and Wolfgang Macherey and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and others},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.08144}\n}\n@inproceedings{wu2017framework,\n\ttitle        = {Framework for control and deep reinforcement learning in traffic},\n\tauthor       = {Wu, Cathy and Parvate, Kanaad and Kheterpal, Nishant and Dickstein, Leah and Mehta, Ankur and Vinitsky, Eugene and Bayen, Alexandre M},\n\tyear         = 2017,\n\tbooktitle    = {2017 IEEE 20th International Conference on Intelligent Transportation Systems (ITSC)},\n\tpages        = {1--8},\n\torganization = {IEEE}\n}\n@article{wu2017visual,\n\ttitle        = {Visual question answering: A survey of methods and datasets},\n\tauthor       = {Qi Wu and Damien Teney and Peng Wang and Chunhua Shen and Anthony Dick and Anton van den Hengel},\n\tyear         = 2017,\n\tjournal      = {Computer Vision and Image Understanding}\n}\n@article{wu2018building,\n\ttitle        = {Building Generalizable Agents with a Realistic and Rich 3D Environment},\n\tauthor       = {Wu, Yi and Wu, Yuxin and Gkioxari, Georgia and Tian, Yuandong},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.02209}\n}\n@inproceedings{wu2018discrete,\n\ttitle        = {Discrete-Continuous Mixtures in Probabilistic Programming: Generalized Semantics and Inference Algorithms},\n\tauthor       = {Wu, Yi and Srivastava, Siddharth and Hay, Nicholas and Du, Simon and Russell, Stuart},\n\tyear         = 2018,\n\tmonth        = {10--15 Jul},\n\tbooktitle    = {Proceedings of the 35th International Conference 
on Machine Learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 80,\n\tpages        = {5343--5352},\n\turl          = {http://proceedings.mlr.press/v80/wu18f.html},\n\teditor       = {Dy, Jennifer and Krause, Andreas},\n\tpdf          = {http://proceedings.mlr.press/v80/wu18f/wu18f.pdf},\n\tabstract     = {Despite the recent successes of probabilistic programming languages (PPLs) in AI applications, PPLs offer only limited support for random variables whose distributions combine discrete and continuous elements. We develop the notion of measure-theoretic Bayesian networks (MTBNs) and use it to provide more general semantics for PPLs with arbitrarily many random variables defined over arbitrary measure spaces. We develop two new general sampling algorithms that are provably correct under the MTBN framework: the lexicographic likelihood weighting (LLW) for general MTBNs and the lexicographic particle filter (LPF), a specialized algorithm for state-space models. 
We further integrate MTBNs into a widely used PPL system, BLOG, and verify the effectiveness of the new inference algorithms through representative examples.}\n}\n@inproceedings{wu2018fonduer,\n\ttitle        = {Fonduer: Knowledge Base Construction from Richly Formatted Data},\n\tauthor       = {Sen Wu and Luke Hsiao and Xiao Cheng and Braden Hancock and Theodoros Rekatsinas and Philip Levis and Christopher R\\'{e}},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of SIGMOD 2018}\n}\n@article{wu2018NoSpurious,\n\ttitle        = {No Spurious Local Minima in a Two Node Neural Network},\n\tauthor       = {Wu, Chenwei and Luo, Jiajun and Lee, Jason D},\n\tyear         = 2018,\n\tjournal      = {International Conference on Learning Representations (ICLR) Workshop Track}\n}\n@article{wu2018response,\n\ttitle        = {Response Generation by Context-aware Prototype Editing},\n\tauthor       = {Yu Wu and Furu Wei and Shaohan Huang and Zhoujun Li and Ming Zhou},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.07042}\n}\n@inproceedings{wu2018unsupervised,\n\ttitle        = {Unsupervised feature learning via non-parametric instance discrimination},\n\tauthor       = {Wu, Zhirong and Xiong, Yuanjun and Yu, Stella X and Lin, Dahua},\n\tyear         = 2018,\n\tbooktitle    = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},\n\tpages        = {3733--3742}\n}\n@article{wu2019behavior,\n\ttitle        = {Behavior regularized offline reinforcement learning},\n\tauthor       = {Wu, Yifan and Tucker, George and Nachum, Ofir},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.11361}\n}\n@inproceedings{wu2019domain,\n\ttitle        = {Domain Adaptation with Asymmetrically-Relaxed Distribution Alignment},\n\tauthor       = {Yifan Wu and Ezra Winston and Divyansh Kaushik and Zachary Lipton},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = 
{6872--6881}\n}\n@article{wu2019global,\n\ttitle        = {Global convergence of adaptive gradient methods for an over-parameterized neural network},\n\tauthor       = {Wu, Xiaoxia and Du, Simon S and Ward, Rachel},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1902.07111}\n}\n@article{wu2019implicit,\n\ttitle        = {Implicit Regularization of Normalization Methods},\n\tauthor       = {Wu, Xiaoxia and Dobriban, Edgar and Ren, Tongzheng and Wu, Shanshan and Li, Zhiyuan and Gunasekar, Suriya and Ward, Rachel and Liu, Qiang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1911.07956}\n}\n@inproceedings{wu2019zero,\n\ttitle        = {Zero shot learning for code education: Rubric sampling with deep learning inference},\n\tauthor       = {Mike Wu and Milan Mosse and Noah Goodman and Chris Piech},\n\tyear         = 2019,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 33,\n\tpages        = {782--790}\n}\n@inproceedings{wu2020multitask,\n\ttitle        = {Understanding and Improving Information Transfer in Multi-Task Learning},\n\tauthor       = {Sen Wu and Hongyang R. 
Zhang and Christopher Ré},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{wu2020steepest,\n\ttitle        = {Steepest Descent Neural Architecture Optimization: Escaping Local Optimum with Signed Neural Splitting},\n\tauthor       = {Wu, Lemeng and Ye, Mao and Lei, Qi and Lee, Jason D and Liu, Qiang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2003.10392}\n}\n@article{wu2020variational,\n\ttitle        = {Variational Item Response Theory: Fast, Accurate, and Expressive},\n\tauthor       = {Mike Wu and Richard L Davis and Benjamin W Domingue and Chris Piech and Noah Goodman},\n\tyear         = 2020,\n\tjournal      = {International Conference on Educational Data Mining}\n}\n@article{wu2021greedy,\n\ttitle        = {Greedy Hierarchical Variational Autoencoders for Large-Scale Video Prediction},\n\tauthor       = {Bohan Wu and Suraj Nair and Roberto Martín-Martín and Li Fei-Fei and Chelsea Finn},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.04174}\n}\n@article{wu97itg,\n\ttitle        = {Stochastic inversion transduction grammars and bilingual parsing of parallel corpora},\n\tauthor       = {Dekai Wu},\n\tyear         = 1997,\n\tjournal      = {Computational Linguistics},\n\tvolume       = 23,\n\tpages        = {377--404}\n}\n@inproceedings{wulenzsaxena2014hierarchical,\n\ttitle        = {Hierarchical Semantic Labeling for Task-Relevant {RGB-D} Perception},\n\tauthor       = {C. Wu and I. Lenz and A. 
Saxena},\n\tyear         = 2014,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{wulfmeier2018incremental,\n\ttitle        = {Incremental Adversarial Domain Adaptation for Continually Changing Environments},\n\tauthor       = {Wulfmeier, Markus and Bewley, Alex and Posner, Ingmar},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)}\n}\n@inproceedings{wzcshbdd18,\n\ttitle        = {Towards Fast Computation of Certified Robustness for {R}e{LU} Networks},\n\tauthor       = {Weng, Tsui-Wei and Zhang, Huan and Chen, Hongge and Song, Zhao and Hsieh, Cho-Jui and Boning, Duane and Dhillon, Inderjit S and Daniel, Luca},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {arXiv preprint arXiv:1804.09699}\n}\n@inproceedings{xbssp18,\n\ttitle        = {Dynamical Isometry and a Mean Field Theory of {CNN}s: How to Train 10,000-Layer Vanilla Convolutional Neural Networks},\n\tauthor       = {Xiao, Lechao and Bahri, Yasaman and Sohl-Dickstein, Jascha and Schoenholz, Samuel S. 
and Pennington, Jeffrey},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{Xiao2010,\n\ttitle        = {{Dual averaging method for regularized stochastic learning and online optimization}},\n\tauthor       = {Xiao, Lin},\n\tyear         = 2010,\n\tjournal      = {The Journal of Machine Learning Research},\n\tvolume       = 11,\n\tpages        = {2543--2596},\n\tannote       = {Contains the so-called \"dual averaging\" step.},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Xiao - 2010 - Dual averaging method for regularized stochastic learning and online optimization.pdf:pdf},\n\tmendeley-groups = {Optimization/Stochastic Online Optimization}\n}\n@article{xiao2010rda,\n\ttitle        = {Dual Averaging Methods for Regularized Stochastic Learning and Online Optimization},\n\tauthor       = {Lin Xiao},\n\tyear         = 2010,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 11,\n\tpages        = {2543--2596}\n}\n@inproceedings{xiao2012adversarial,\n\ttitle        = {Adversarial Label Flips Attack on Support Vector Machines},\n\tauthor       = {Han Xiao and Huang Xiao and Claudia Eckert},\n\tyear         = 2012,\n\tbooktitle    = {European Conference on Artificial Intelligence}\n}\n@article{xiao2014proximal,\n\ttitle        = {A proximal stochastic gradient method with progressive variance reduction},\n\tauthor       = {Xiao, Lin and Zhang, Tong},\n\tyear         = 2014,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 24,\n\tnumber       = 4,\n\tpages        = {2057--2075},\n\tdoi          = {10.1137/140961791},\n\tissn         = {1052-6234},\n\tabstract     = {We consider the problem of minimizing the sum of two convex functions: one is the average of a large number of smooth component functions, and the other is a general convex function that admits a simple proximal mapping. 
We assume the whole objective function is strongly convex. Such problems often arise in machine learning, known as regularized empirical risk minimization. We propose and analyze a new proximal stochastic gradient method, which uses a multi-stage scheme to progressively reduce the variance of the stochastic gradient. While each iteration of this algorithm has similar cost as the classical stochastic gradient method (or incremental gradient method), we show that the expected objective value converges to the optimum at a geometric rate. The overall complexity of this method is much lower than both the proximal full gradient method and the standard proximal stochastic gradient method. 1},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {arXiv:1403.4699v1},\n\teprint       = {arXiv:1403.4699v1},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Xiao, Zhang - 2014 - A Proximal Stochastic Gradient Method with Progressive Variance Reduction(2).pdf:pdf},\n\tmendeley-groups = {Optimization/[with Yuan Yang],Optimization/Variance Reduction}\n}\n@article{xiao2015contamination,\n\ttitle        = {Support vector machines under adversarial label contamination},\n\tauthor       = {Huang Xiao and Battista Biggio and Blaine Nelson and Han Xiao and Claudia Eckert and Fabio Roli},\n\tyear         = 2015,\n\tjournal      = {Neurocomputing},\n\tvolume       = 160,\n\tpages        = {53--62}\n}\n@inproceedings{xiao2015lasso,\n\ttitle        = {Is Feature Selection Secure against Training Data Poisoning?},\n\tauthor       = {Huang Xiao and Battista Biggio and Gavin Brown and Giorgio Fumera and Claudia Eckert and Fabio Roli},\n\tyear         = 2015,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{xiao2016sequence,\n\ttitle        = {Sequence-based Structured Prediction for Semantic Parsing},\n\tauthor       = {Chunyang Xiao and Marc Dymetman and Claire Gardent},\n\tyear         = 
2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{xiao2020noise,\n\ttitle        = {Noise or Signal: The Role of Image Backgrounds in Object Recognition},\n\tauthor       = {Kai Xiao and Logan Engstrom and Andrew Ilyas and Aleksander Madry},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.09994}\n}\n@inproceedings{Xiaodi2012,\n\ttitle        = {Parallel Approximation of Min-max Problems with Applications to Classical and Quantum Zero-Sum Games},\n\tauthor       = {Gutoski, G. and Xiaodi Wu},\n\tyear         = 2012,\n\tmonth        = jun,\n\tbooktitle    = {Computational Complexity (CCC), 2012 IEEE 27th Annual Conference on},\n\tpages        = {21--31},\n\tdoi          = {10.1109/CCC.2012.12},\n\tissn         = {1093-0159},\n\tkeywords     = {approximation theory;computational complexity;game theory;mathematical programming;matrix multiplication;minimax techniques;parallel algorithms;quantum theory;theorem proving;DQIP;PSPACE;QRG(2);SQG;competing-provers complexity class;direct polynomial-space simulation;matrix multiplicative weights update method;min-max problems;multimessage quantum interactive proofs;near-optimal strategies;parallel algorithm;parallel approximation scheme;semidefinite matrices;semidefinite programs;transcript-like consistency condition;two player classical zero-sum games;two player quantum zero-sum games;Approximation methods;Bismuth;Complexity theory;Game theory;Games;Parallel algorithms;Registers;interactive proofs with competing provers;parallel approximation algorithms;semidefinite programs;zero-sum games}\n}\n@article{XiaoZhang2013-homotopy,\n\ttitle        = {A proximal-gradient homotopy method for the sparse least-squares problem},\n\tauthor       = {Xiao, Lin and Zhang, Tong},\n\tyear         = 2013,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 23,\n\tnumber       = 2,\n\tpages        = 
{1062--1091}\n}\n@inproceedings{Xie2006,\n\ttitle        = {{Efficient algorithm for approximating maximum inscribed sphere in high dimensional polytope}},\n\tauthor       = {Xie, Yulai and Snoeyink, Jack and Xu, Jinhui},\n\tyear         = 2006,\n\tbooktitle    = {Proceedings of the 22nd annual symposium on computational geometry - SCG '06},\n\tdoi          = {10.1145/1137856.1137861},\n\tisbn         = 1595933409,\n\tmendeley-groups = {Algorithms/Computational Geometry}\n}\n@inproceedings{xie2016transfer,\n\ttitle        = {Transfer Learning from Deep Features for Remote Sensing and Poverty Mapping},\n\tauthor       = {Michael Xie and Neal Jean and Marshall Burke and David Lobell and Stefano Ermon},\n\tyear         = 2016,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{xie2017diverse,\n\ttitle        = {Diverse Neural Network Learns True Target Functions},\n\tauthor       = {Xie, Bo and Liang, Yingyu and Song, Le},\n\tyear         = 2017,\n\tbooktitle    = {Artificial Intelligence and Statistics},\n\tpages        = {1216--1224}\n}\n@article{xie2019unsupervised,\n\ttitle        = {Unsupervised data augmentation},\n\tauthor       = {Qizhe Xie and Zihang Dai and Eduard Hovy and Minh-Thang Luong and Quoc V Le},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.12848}\n}\n@inproceedings{xie2020adversarial,\n\ttitle        = {Adversarial examples improve image recognition},\n\tauthor       = {Cihang Xie and Mingxing Tan and Boqing Gong and Jiang Wang and Alan L Yuille and Quoc V Le},\n\tyear         = 2020,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {819--828}\n}\n@article{xie2020diffusion,\n\ttitle        = {A Diffusion Theory for Deep Learning Dynamics: Stochastic Gradient Descent Escapes From Sharp Minima Exponentially Fast},\n\tauthor       = {Xie, Zeke and Sato, Issei and Sugiyama, Masashi},\n\tyear         = 2020,\n\tjournal      = {arXiv 
preprint arXiv:2002.03495}\n}\n@inproceedings{xie2020lili,\n\ttitle        = {Learning Latent Representations to Influence Multi-Agent Interaction},\n\tauthor       = {Annie Xie and Dylan P. Losey and Ryan Tolsma and Chelsea Finn and Dorsa Sadigh},\n\tyear         = 2020,\n\tbooktitle    = {Conference on Robot Learning (CORL)}\n}\n@article{xie2020risk,\n\ttitle        = {Risk variance penalization: From distributional robustness to causality},\n\tauthor       = {Xie, Chuanlong and Chen, Fei and Liu, Yue and Li, Zhenguo},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.07544}\n}\n@article{xie2020selftraining,\n\ttitle        = {Self-training with Noisy Student improves ImageNet classification},\n\tauthor       = {Qizhe Xie and Minh-Thang Luong and Eduard Hovy and Quoc V. Le},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{xie2020unsupervised,\n\ttitle        = {Unsupervised data augmentation for consistency training},\n\tauthor       = {Qizhe Xie and Zihang Dai and Eduard Hovy and Minh-Thang Luong and Quoc V Le},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{xie2021composed,\n\ttitle        = {Composed Fine-Tuning: Freezing Pre-Trained Denoising Autoencoders for Improved Generalization},\n\tauthor       = {Sang Michael Xie and Tengyu Ma and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{xie2021innout,\n\ttitle        = {In-{N}-Out: Pre-Training and Self-Training using Auxiliary Information for Out-of-Distribution Robustness},\n\tauthor       = {Sang Michael Xie and Ananya Kumar and Robert Jones and Fereshte Khani and Tengyu Ma and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{XieLiangSong2015-kernalPCA,\n\ttitle        = {Scale up nonlinear component analysis with doubly 
stochastic gradients},\n\tauthor       = {Xie, Bo and Liang, Yingyu and Song, Le},\n\tyear         = 2015,\n\tbooktitle    = {NIPS},\n\tpages        = {2341--2349}\n}\n@inproceedings{xing04haplotype,\n\ttitle        = {{B}ayesian Haplotype Inference via the {D}irichlet Process},\n\tauthor       = {E. P. Xing and R. Sharan and M. I. Jordan},\n\tyear         = 2004,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {111--118}\n}\n@inproceedings{xing08haplotype,\n\ttitle        = {Beam Sampling for the Infinite Hidden {M}arkov Model},\n\tauthor       = {Jurgen Van Gael and Yunus Saatci and Yee Whye Teh and Zoubin Ghahramani},\n\tyear         = 2008,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1088--1095}\n}\n@inproceedings{xing2002generalized,\n\ttitle        = {A generalized mean field algorithm for variational inference in exponential families},\n\tauthor       = {Eric P Xing and Michael I Jordan and Stuart Russell},\n\tyear         = 2002,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {583--591}\n}\n@article{xing2018walk,\n\ttitle        = {A walk with sgd},\n\tauthor       = {Xing, Chen and Arpit, Devansh and Tsirigotis, Christos and Bengio, Yoshua},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1802.08770}\n}\n@inproceedings{xiong2014latent,\n\ttitle        = {Latent Domains Modeling for Visual Domain Adaptation},\n\tauthor       = {Caiming Xiong and Scott McCloskey and Shao-Hang Hsieh and Jason J. 
Corso},\n\tyear         = 2014,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{xiong2016achieving,\n\ttitle        = {Achieving human parity in conversational speech recognition},\n\tauthor       = {Wayne Xiong and Jasha Droppo and Xuedong Huang and Frank Seide and Mike Seltzer and Andreas Stolcke and Dong Yu and Geoffrey Zweig},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{xiong2017dynamic,\n\ttitle        = {Dynamic Coattention Networks For Question Answering},\n\tauthor       = {Caiming Xiong and Victor Zhong and Richard Socher},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{xiong2021randomized,\n\ttitle        = {Randomized Exploration is Near-Optimal for Tabular {MDP}},\n\tauthor       = {Xiong, Zhihan and Shen, Ruoqi and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.09703}\n}\n@inproceedings{XLG,\n\ttitle        = {Document clustering based on non-negative matrix factorization},\n\tauthor       = {Xu, Wei and Liu, Xin and Gong, Yihong},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the 26th annual international ACM SIGIR conference on Research and development in informaion retrieval},\n\tlocation     = {Toronto, Canada},\n\tpublisher    = {ACM},\n\taddress      = {New York, NY, USA},\n\tseries       = {SIGIR '03},\n\tpages        = {267--273},\n\tdoi          = {10.1145/860435.860485},\n\tisbn         = {1-58113-646-3},\n\turl          = {http://doi.acm.org/10.1145/860435.860485},\n\tnumpages     = 7,\n\tacmid        = 860485,\n\tkeywords     = {document clustering, non-negative matrix factorization}\n}\n@inproceedings{xu2006maximum,\n\ttitle        = {A maximum entropy framework that integrates word dependencies and grammatical relations for reading comprehension},\n\tauthor       = {Kui Xu and Helen Meng and Fuliang Weng},\n\tyear         = 2006,\n\tbooktitle    = 
{North American Association for Computational Linguistics (NAACL)},\n\tpages        = {185--188}\n}\n@article{xu2007convergence,\n\ttitle        = {Convergence analysis of sample average approximation methods for a class of stochastic mathematical programs with equality constraints},\n\tauthor       = {Xu, Huifu and Meng, Fanwen},\n\tyear         = 2007,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 32,\n\tnumber       = 3,\n\tpages        = {648--668}\n}\n@article{xu2008satzilla,\n\ttitle        = {{SAT}zilla: portfolio-based algorithm selection for {SAT}},\n\tauthor       = {Lin Xu and Frank Hutter and Holger H. Hoos and Kevin Leyton-Brown},\n\tyear         = 2008,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 32,\n\tpages        = {565--606}\n}\n@article{xu2010principal,\n\ttitle        = {Principal component analysis with contaminated data: The high dimensional case},\n\tauthor       = {Huan Xu and Constantine Caramanis and Shie Mannor},\n\tyear         = 2010,\n\tjournal      = {arXiv}\n}\n@article{xu2012alternating,\n\ttitle        = {An alternating direction algorithm for matrix completion with nonnegative factors},\n\tauthor       = {Xu, Yangyang and Yin, Wotao and Wen, Zaiwen and Zhang, Yin},\n\tyear         = 2012,\n\tjournal      = {Frontiers of Mathematics in China},\n\tpublisher    = {Springer},\n\tvolume       = 7,\n\tnumber       = 2,\n\tpages        = {365--384}\n}\n@article{xu2015attend,\n\ttitle        = {Show, Attend and Tell: Neural Image Caption Generation with Visual Attention},\n\tauthor       = {Kelvin Xu and Jimmy Lei Ba and Ryan Kiros and Kyunghyun Cho and Aaron Courville and Ruslan Salakhutdinov and Richard S. 
Zemel and Yoshua Bengio},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1502.03044}\n}\n@misc{xu2015empirical,\n\ttitle        = {Empirical Evaluation of Rectified Activations in Convolutional Network},\n\tauthor       = {Bing Xu and Naiyan Wang and Tianqi Chen and Mu Li},\n\tyear         = 2015,\n\teprint       = {1505.00853},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{xu2016global,\n\ttitle        = {Global analysis of expectation maximization for mixtures of two gaussians},\n\tauthor       = {Xu, Ji and Hsu, Daniel J and Maleki, Arian},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2676--2684}\n}\n@inproceedings{xu2016question,\n\ttitle        = {Question answering on freebase via relation extraction and textual evidence},\n\tauthor       = {Kun Xu and Siva Reddy and Yansong Feng and Songfang Huang and Dongyan Zhao},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{xu2017feature,\n\ttitle        = {Feature Squeezing: Detecting Adversarial Examples in Deep Neural Networks},\n\tauthor       = {Weilin Xu and David Evans and Yanjun Qi},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{xu2017sqlnet,\n\ttitle        = {Sqlnet: Generating structured queries from natural language without reinforcement learning},\n\tauthor       = {Xiaojun Xu and Chang Liu and Dawn Song},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.04436}\n}\n@article{xu2018minimal,\n\ttitle        = {The minimal measurement number for low-rank matrix recovery},\n\tauthor       = {Xu, Zhiqiang},\n\tyear         = 2018,\n\tjournal      = {Applied and Computational Harmonic Analysis},\n\tpublisher    = {Elsevier},\n\tvolume       = 44,\n\tnumber       = 2,\n\tpages        = {497--508}\n}\n@inproceedings{xu2018powerful,\n\ttitle        = {How Powerful are Graph Neural Networks?},\n\tauthor       = 
{Keyulu Xu and Weihua Hu and Jure Leskovec and Stefanie Jegelka},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{xu2018spherical,\n\ttitle        = {Spherical Latent Spaces for Stable Variational Autoencoders},\n\tauthor       = {Jiacheng Xu and Greg Durrett},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{xu2019joint,\n\ttitle        = {Joint inference of reward machines and policies for reinforcement learning},\n\tauthor       = {Xu, Zhe and Gavran, Ivan and Ahmad, Yousef and Majumdar, Rupak and Neider, Daniel and Topcu, Ufuk and Wu, Bo},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1909.05912}\n}\n@inproceedings{xu2020adversarial,\n\ttitle        = {Adversarial domain adaptation with domain mixup},\n\tauthor       = {Minghao Xu and Jian Zhang and Bingbing Ni and Teng Li and Chengjie Wang and Qi Tian and Wenjun Zhang},\n\tyear         = 2020,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tvolume       = 34,\n\tnumber       = 4,\n\tpages        = {6502--6509}\n}\n@inproceedings{xu2020autoqa,\n\ttitle        = {Auto{QA}: From databases to {QA} semantic parsers with only synthetic training data},\n\tauthor       = {Silei Xu and Sina J Semnani and Giovanni Campagna and Monica S Lam},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{xu2020neural,\n\ttitle        = {How neural networks extrapolate: From feedforward to graph neural networks},\n\tauthor       = {Xu, Keyulu and Li, Jingling and Zhang, Mozhi and Du, Simon S and Kawarabayashi, Ken-ichi and Jegelka, Stefanie},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.11848}\n}\n@inproceedings{xu2020what,\n\ttitle        = {What Can Neural Networks Reason About?},\n\tauthor       = {Keyulu Xu and Jingling Li and Mozhi Zhang and Simon 
S. Du and Ken-ichi Kawarabayashi and Stefanie Jegelka},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=rJxbJeHFPS}\n}\n@article{xu2021fine,\n\ttitle        = {Fine-Grained Gap-Dependent Bounds for Tabular MDPs via Adaptive Multi-Step Bootstrap},\n\tauthor       = {Xu, Haike and Ma, Tengyu and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.04692}\n}\n@inproceedings{xu2021how,\n\ttitle        = {How Neural Networks Extrapolate: From Feedforward to Graph Neural Networks},\n\tauthor       = {Keyulu Xu and Mozhi Zhang and Jingling Li and Simon Shaolei Du and Ken-Ichi Kawarabayashi and Stefanie Jegelka},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=UH-cmocLJC}\n}\n@inproceedings{xu2021understanding,\n\ttitle        = {Understanding the role of importance weighting for deep learning},\n\tauthor       = {Da Xu and Yuting Ye and Chuanwei Ruan},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{xue07multitask,\n\ttitle        = {Multi-task learning for classification with {D}irichlet process priors},\n\tauthor       = {Y. Xue and X. Liao and L. Carin and B. 
Krishnapuram},\n\tyear         = 2007,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 8,\n\tpages        = {35--63}\n}\n@article{yadlowsky2019calibration,\n\ttitle        = {A Calibration Metric for Risk Scores with Survival Data},\n\tauthor       = {Steve Yadlowsky and Sanjay Basu and Lu Tian},\n\tyear         = 2019,\n\tjournal      = {Machine Learning for Healthcare}\n}\n@inproceedings{yadollahpour2013discriminative,\n\ttitle        = {Discriminative re-ranking of diverse segmentations},\n\tauthor       = {Payman Yadollahpour and Dhruv Batra and Gregory Shakhnarovich},\n\tyear         = 2013,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {1923--1930}\n}\n@inproceedings{yaeger1996effective,\n\ttitle        = {Effective Training of a Neural Network Character Classifier for Word Recognition},\n\tauthor       = {Larry Yaeger and Richard Lyon and Brandyn Webb},\n\tyear         = 1996,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {807--813}\n}\n@inproceedings{yaghmazadeh2016hierarchy,\n\ttitle        = {Synthesizing Transformations on Hierarchically Structured Data},\n\tauthor       = {Navid Yaghmazadeh and Christian Klinger and Isil Dillig and Swarat Chaudhuri},\n\tyear         = 2016,\n\tbooktitle    = {Programming Language Design and Implementation (PLDI)}\n}\n@inproceedings{yaghmazadeh2017sqlizer,\n\ttitle        = {SQLizer: Query Synthesis from Natural Language},\n\tauthor       = {Navid Yaghmazadeh and Yuepeng Wang and Isil Dillig and Thomas Dillig},\n\tyear         = 2017,\n\tbooktitle    = {Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA)}\n}\n@article{yaghoobzadeh2019increasing,\n\ttitle        = {Increasing Robustness to Spurious Correlations using Forgettable Examples},\n\tauthor       = {Yadollah Yaghoobzadeh and Soroush Mehri and Remi Tachet and Timothy J Hazen and Alessandro Sordoni},\n\tyear         
= 2019,\n\tjournal      = {arXiv preprint arXiv:1911.03861}\n}\n@article{yahia2000rough,\n\ttitle        = {Rough neural expert systems},\n\tauthor       = {M.E. Yahia and R. Mahmood and N. Sulaiman and F. Ahmad},\n\tyear         = 2000,\n\tjournal      = {Expert Systems with Applications},\n\tvolume       = 18,\n\tpages        = {87--99}\n}\n@inproceedings{yahya2012natural,\n\ttitle        = {Natural language questions for the web of data},\n\tauthor       = {Mohamed Yahya and Klaus Berberich and Shady Elbassuoni and Maya Ramanath and Volker Tresp and Gerhard Weikum},\n\tyear         = 2012,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {379--390}\n}\n@article{yaida2018fluctuation,\n\ttitle        = {Fluctuation-dissipation relations for stochastic gradient descent},\n\tauthor       = {Yaida, Sho},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.00004}\n}\n@inproceedings{yakout2012infogather,\n\ttitle        = {Infogather: entity augmentation and attribute discovery by holistic matching with web tables},\n\tauthor       = {Mohamed Yakout and Kris Ganjam and Kaushik Chakrabarti and Surajit Chaudhuri},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Management of Data (SIGMOD)}\n}\n@inproceedings{yan2011active,\n\ttitle        = {Active learning from crowds},\n\tauthor       = {Yan Yan and Glenn M Fung and R{\\'o}mer Rosales and Jennifer G Dy},\n\tyear         = 2011,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1161--1168}\n}\n@inproceedings{yan2015deep,\n\ttitle        = {Deep correlation for matching images and text},\n\tauthor       = {Fei Yan and Krystian Mikolajczyk},\n\tyear         = 2015,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3441--3450}\n}\n@inproceedings{yan2016learning,\n\ttitle        = {Learning to respond with deep 
neural networks for retrieval-based human-computer conversation system},\n\tauthor       = {Rui Yan and Yiping Song and Hua Wu},\n\tyear         = 2016,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {55--64}\n}\n@article{yang2008estimating,\n\ttitle        = {Estimating Location Using Wi-Fi},\n\tauthor       = {Qiang Yang and Sinno Jialin Pan and Vincent Wenchen Zheng},\n\tyear         = 2008,\n\tjournal      = {IEEE Intelligent Systems},\n\tvolume       = 23,\n\tnumber       = 1,\n\tpages        = {8--13}\n}\n@inproceedings{yang2008image,\n\ttitle        = {Image super-resolution as sparse representation of raw image patches},\n\tauthor       = {Yang, Jianchao and Wright, John and Huang, Thomas and Ma, Yi},\n\tyear         = 2008,\n\tbooktitle    = {Computer Vision and Pattern Recognition, 2008. CVPR 2008. IEEE Conference on},\n\tpages        = {1--8},\n\torganization = {IEEE},\n\towner        = {gewor_000},\n\ttimestamp    = {2013.11.10}\n}\n@inproceedings{yang2009dual,\n\ttitle        = {Dual temporal difference learning},\n\tauthor       = {Yang, Min and Li, Yuxi and Schuurmans, Dale},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {631--638}\n}\n@article{yang2010landuse,\n\ttitle        = {Bag-of-visual-words and spatial extensions for land-use classification},\n\tauthor       = {Yi Yang and Shawn Newsam},\n\tyear         = 2010,\n\tjournal      = {Geographic Information Systems}\n}\n@inproceedings{yang2014joint,\n\ttitle        = {Joint Relational Embeddings for Knowledge-based Question Answering},\n\tauthor       = {Min-Chul Yang and Nan Duan and Ming Zhou and Hae-Chang Rim},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{yang2015embeddings,\n\ttitle        = {Embedding Entities and Relations for Learning and Inference in Knowledge Bases},\n\tauthor       = 
{Bishan Yang and Wen-tau Yih and Xiaodong He and Jianfeng Gao and Li Deng},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1412.6575}\n}\n@article{yang2015explicit,\n\ttitle        = {An Explicit Sampling Dependent Spectral Error Bound for Column Subset Selection},\n\tauthor       = {Yang, Tianbao and Zhang, Lijun and Jin, Rong and Zhu, Shenghuo},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1505.00526}\n}\n@inproceedings{yang2015smart,\n\ttitle        = {{S-MART}: Novel Tree-based Structured Learning Algorithms Applied to Tweet Entity Linking},\n\tauthor       = {Yi Yang and Ming-Wei Chang},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yang2015wikiqa,\n\ttitle        = {{W}iki{QA}: A Challenge Dataset for Open-Domain Question Answering},\n\tauthor       = {Yi Yang and Wen-tau Yih and Christopher Meek},\n\tyear         = 2015,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {2013--2018}\n}\n@inproceedings{yang2016stacked,\n\ttitle        = {Stacked attention networks for image question answering},\n\tauthor       = {Zichao Yang and Xiaodong He and Jianfeng Gao and Li Deng and Alex Smola},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{yang2017generative,\n\ttitle        = {Generative Poisoning Attack Method Against Neural Networks},\n\tauthor       = {Chaofei Yang and Qing Wu and Hai Li and Yiran Chen},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@article{yang2018benchmark,\n\ttitle        = {A benchmark and comparison of active learning for logistic regression},\n\tauthor       = {Yazhou Yang and Marco Loog},\n\tyear         = 2018,\n\tjournal      = {Pattern Recognition},\n\tvolume       = 83\n}\n@inproceedings{yang2018breaking,\n\ttitle        = {Breaking the beam search curse: A study of (re-) scoring methods and stopping criteria for neural 
machine translation},\n\tauthor       = {Yilin Yang and Liang Huang and Mingbo Ma},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yang2018hotpotqa,\n\ttitle        = {{HotpotQA}: A dataset for diverse, explainable multi-hop question answering},\n\tauthor       = {Zhilin Yang and Peng Qi and Saizheng Zhang and Yoshua Bengio and William W Cohen and Ruslan Salakhutdinov and Christopher D Manning},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yang2018mastering,\n\ttitle        = {Mastering the Dungeon: Grounded Language Learning by Mechanical Turker Descent},\n\tauthor       = {Z. Yang and Saizheng Zhang and Jack Urbanek and Will Feng and Alexander H. Miller and Arthur Szlam and Douwe Kiela and J. Weston},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{yang2018nmtadv,\n\ttitle        = {Improving Neural Machine Translation with Conditional Sequence Generative Adversarial Nets},\n\tauthor       = {Zhen Yang and Wei Chen and Feng Wang and Bo Xu},\n\tyear         = 2018,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)},\n\tpages        = {1346--1355}\n}\n@inproceedings{yang2019invariance,\n\ttitle        = {Invariance-inducing regularization using worst-case transformations suffices to boost accuracy and spatial robustness},\n\tauthor       = {Fanny Yang and Zuowen Wang and Christina Heinze-Deml},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{yang2019norml,\n\ttitle        = {Norml: No-reward meta learning},\n\tauthor       = {Yuxiang Yang and Ken Caluwaerts and Atil Iscen and Jie Tan and Chelsea Finn},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the 18th International Conference on Autonomous 
Agents and MultiAgent Systems},\n\tpages        = {323--331}\n}\n@inproceedings{yang2019reinforcement,\n\ttitle        = {Reinforcement learning in feature space: Matrix bandit, kernels, and regret bound},\n\tauthor       = {Yang, Lin F and Wang, Mengdi},\n\tyear         = 2020,\n\tjournal      = {International Conference on Machine Learning},\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{yang2019sample,\n\ttitle        = {Sample-optimal parametric {Q}-learning using linearly additive features},\n\tauthor       = {Yang, Lin and Wang, Mengdi},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {6995--7004}\n}\n@inproceedings{yang2019xlnet,\n\ttitle        = {{XLN}et: Generalized Autoregressive Pretraining for Language Understanding},\n\tauthor       = {Zhilin Yang and Zihang Dai and Yiming Yang and Jaime Carbonell and Ruslan Salakhutdinov and Quoc V. Le},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{yang2020accelerating,\n\ttitle        = {Accelerating safe reinforcement learning with constraint-mismatched policies},\n\tauthor       = {Yang, Tsung-Yen and Rosca, Justinian and Narasimhan, Karthik and Ramadge, Peter J},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.11645}\n}\n@article{yang2020bridging,\n\ttitle        = {Bridging exploration and general function approximation in reinforcement learning: Provably efficient kernel and neural value iterations},\n\tauthor       = {Yang, Zhuoran and Jin, Chi and Wang, Zhaoran and Wang, Mengdi and Jordan, Michael I},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2011.04622}\n}\n@article{yang2020generating,\n\ttitle        = {Generating Strategic Dialogue for Negotiation with Theory of Mind},\n\tauthor       = {Runzhe Yang and Jingxiao Chen and Karthik Narasimhan},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint 
arXiv:2010.09954}\n}\n@inproceedings{yang2020improving,\n\ttitle        = {Improving Molecular Design by Stochastic Iterative Target Augmentation},\n\tauthor       = {Kevin Yang and Wengong Jin and Kyle Swanson and Regina Barzilay and Tommi Jaakkola},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{yang2020kernel,\n\ttitle        = {Provably Efficient Reinforcement Learning with Kernel and Neural Function Approximations},\n\tauthor       = {Yang, Zhuoran and Jin, Chi and Wang, Zhaoran and Wang, Mengdi and Jordan, Michael},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {13903--13916},\n\turl          = {https://proceedings.neurips.cc/paper/2020/file/9fa04f87c9138de23e92582b4ce549ec-Paper.pdf},\n\teditor       = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}\n}\n@article{yang2020provable,\n\ttitle        = {Provable Benefits of Representation Learning in Linear Bandits},\n\tauthor       = {Yang, Jiaqi and Hu, Wei and Lee, Jason D and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {International Conference on Learning Representations (ICLR)}\n}\n@article{yang2020q,\n\ttitle        = {{$Q$}-learning with Logarithmic Regret},\n\tauthor       = {Yang, Kunhe and Yang, Lin F and Du, Simon S},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.09118},\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {1576--1584},\n\torganization = {PMLR}\n}\n@inproceedings{yang2020randomized,\n\ttitle        = {Randomized smoothing of all shapes and sizes},\n\tauthor       = {Greg Yang and Tony Duan and J Edward Hu and Hadi Salman and Ilya Razenshteyn and Jerry Li},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning 
(ICML)}\n}\n@inproceedings{yang2020reinforcement,\n\ttitle        = {Reinforcement learning in feature space: Matrix bandit, kernels, and regret bound},\n\tauthor       = {Yang, Lin and Wang, Mengdi},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {10746--10756},\n\torganization = {PMLR}\n}\n@inproceedings{yang2020rethinking,\n\ttitle        = {Rethinking the Value of Labels for Improving Class-Imbalanced Learning},\n\tauthor       = {Yang, Yuzhe and Xu, Zhi},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2002.11328},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {19290--19301}\n}\n@article{yang2021believe,\n\ttitle        = {Believe What You See: Implicit Constraint Approach for Offline Multi-Agent Reinforcement Learning},\n\tauthor       = {Yang, Yiqin and Ma, Xiaoteng and Li, Chenghao and Zheng, Zewu and Zhang, Qiyuan and Huang, Gao and Yang, Jun and Zhao, Qianchuan},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.03400}\n}\n@inproceedings{yang2021delving,\n\ttitle        = {Delving into Deep Imbalanced Regression},\n\tauthor       = {Yang, Yuzhe and Zha, Kaiwen and Chen, Yingcong and Wang, Hao and Katabi, Dina},\n\tyear         = 2021,\n\tmonth        = {18--24 Jul},\n\tbooktitle    = {Proceedings of the 38th International Conference on Machine Learning},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 139,\n\tpages        = {11842--11851}\n}\n@inproceedings{yang2021impact,\n\ttitle        = {Impact of representation learning in linear bandits},\n\tauthor       = {Yang, Jiaqi and Hu, Wei and Lee, Jason D and Du, Simon S},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations}\n}\n@article{yannakakis1991expressing,\n\ttitle        = {Expressing combinatorial optimization problems by 
linear programs},\n\tauthor       = {Yannakakis, Mihalis},\n\tyear         = 1991,\n\tjournal      = {Journal of Computer and System Sciences},\n\tpublisher    = {Elsevier}\n}\n@inproceedings{yao1977probabilistic,\n\ttitle        = {Probabilistic computations: Toward a unified measure of complexity},\n\tauthor       = {Yao, Andrew Chi-Chin},\n\tyear         = 1977,\n\tbooktitle    = {Foundations of Computer Science (FOCS), 1977 IEEE 18th Annual Symposium on},\n\tpages        = {222--227},\n\torganization = {IEEE}\n}\n@article{yao1994near,\n\ttitle        = {Near-optimal time-space tradeoff for element distinctness},\n\tauthor       = {Yao, Andrew Chi-Chih},\n\tyear         = 1994,\n\tjournal      = {SIAM Journal on Computing},\n\tpublisher    = {SIAM},\n\tvolume       = 23,\n\tnumber       = 5,\n\tpages        = {966--975}\n}\n@inproceedings{yao2009efficient,\n\ttitle        = {Efficient Methods for Topic Model Inference on Streaming Document Collections},\n\tauthor       = {Limin Yao and David Mimno and Andrew McCallum},\n\tyear         = 2009,\n\tbooktitle    = {KDD}\n}\n@article{yao2012semantics,\n\ttitle        = {Semantics-based Question Generation and Implementation},\n\tauthor       = {Xuchen Yao and Gosse Bouma and Yi Zhang},\n\tyear         = 2012,\n\tjournal      = {Dialogue and Discourse},\n\tvolume       = 3,\n\tpages        = {11--42}\n}\n@inproceedings{yao2014freebase,\n\ttitle        = {{F}reebase {QA}: Information Extraction or Semantic Parsing},\n\tauthor       = {Xuchen Yao and Jonathan Berant and Benjamin Van-Durme},\n\tyear         = 2014,\n\tbooktitle    = {Workshop on Semantic parsing}\n}\n@inproceedings{yao2014ie,\n\ttitle        = {Information extraction over structured data: Question answering with {F}reebase},\n\tauthor       = {Xuchen Yao and Benjamin Van-Durme},\n\tyear         = 2014,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yao2014pseudo,\n\ttitle        = {Pseudo-MDPs and factored linear 
action models},\n\tauthor       = {Yao, Hengshuai and Szepesv{\\'a}ri, Csaba and Pires, Bernardo Avila and Zhang, Xinhua},\n\tyear         = 2014,\n\tbooktitle    = {IEEE Symposium on Adaptive Dynamic Programming and Reinforcement Learning}\n}\n@inproceedings{yao2017parity,\n\ttitle        = {Beyond Parity: Fairness Objectives for Collaborative Filtering},\n\tauthor       = {Sirui Yao and Bert Huang},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{yao2019model,\n\ttitle        = {Model-based Interactive Semantic Parsing: A Unified Framework and A Text-to-{SQL} Case Study},\n\tauthor       = {Ziyu Yao and Yu Su and Huan Sun and Wen-tau Yih},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yarats2018hierarchical,\n\ttitle        = {Hierarchical Text Generation and Planning for Strategic Dialogue},\n\tauthor       = {Denis Yarats and Mike Lewis},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{yarotsky2021universal,\n\ttitle        = {Universal approximations of invariant maps by neural networks},\n\tauthor       = {Yarotsky, Dmitry},\n\tyear         = 2021,\n\tjournal      = {Constructive Approximation},\n\tpublisher    = {Springer},\n\tpages        = {1--68}\n}\n@inproceedings{yarowsky95unsupervised,\n\ttitle        = {Unsupervised word sense disambiguation rivaling supervised methods},\n\tauthor       = {David Yarowsky},\n\tyear         = 1995,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yasunaga2017graph,\n\ttitle        = {Graph-based neural multi-document summarization},\n\tauthor       = {Michihiro Yasunaga and Rui Zhang and Kshitijh Meelu and Ayush Pareek and Krishnan Srinivasan and Dragomir Radev},\n\tyear         = 2017,\n\tbooktitle    = {Computational Natural Language Learning 
(CoNLL)}\n}\n@inproceedings{yasunaga2020repair,\n\ttitle        = {Graph-based, Self-Supervised Program Repair from Diagnostic Feedback},\n\tauthor       = {Michihiro Yasunaga and Percy Liang},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{yasunaga2021break,\n\ttitle        = {{Break-It-Fix-It}: Unsupervised Learning for Program Repair},\n\tauthor       = {Michihiro Yasunaga and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@inproceedings{yasunaga2021language,\n\ttitle        = {{LM-Critic}: Language Models for Unsupervised Grammatical Error Correction},\n\tauthor       = {Michihiro Yasunaga and Jure Leskovec and Percy Liang},\n\tyear         = 2021,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yasunaga2021qagnn,\n\ttitle        = {{QA-GNN}: Reasoning with Language Models and Knowledge Graphs for Question Answering},\n\tauthor       = {Michihiro Yasunaga and Hongyu Ren and Antoine Bosselut and Percy Liang and Jure Leskovec},\n\tyear         = 2021,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{yates2009unsupervised,\n\ttitle        = {Unsupervised methods for determining object and relation synonyms on the web},\n\tauthor       = {Alexander Yates and Oren Etzioni},\n\tyear         = 2009,\n\tjournal      = {Journal of Artificial Intelligence Research (JAIR)},\n\tvolume       = 34,\n\tnumber       = 1\n}\n@article{ye2005new,\n\ttitle        = {A new complexity result on solving the {M}arkov decision problem},\n\tauthor       = {Ye, Yinyu},\n\tyear         = 2005,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 30,\n\tnumber       = 3,\n\tpages        = {733--749},\n\tdate-added   = {2017-05-19 05:09:58 +0000},\n\tdate-modified = {2017-05-19 05:09:58 
+0000}\n}\n@article{ye2011simplex,\n\ttitle        = {The simplex and policy-iteration methods are strongly polynomial for the {M}arkov decision problem with a fixed discount rate},\n\tauthor       = {Ye, Yinyu},\n\tyear         = 2011,\n\tjournal      = {Mathematics of Operations Research},\n\tpublisher    = {INFORMS},\n\tvolume       = 36,\n\tnumber       = 4,\n\tpages        = {593--603},\n\tdate-added   = {2017-05-19 05:07:20 +0000},\n\tdate-modified = {2017-05-19 05:07:20 +0000}\n}\n@inproceedings{ye2019unsupervised,\n\ttitle        = {Unsupervised embedding learning via invariant and spreading instance feature},\n\tauthor       = {Ye, Mang and Zhang, Xu and Yuen, Pong C and Chang, Shih-Fu},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n\tpages        = {6210--6219}\n}\n@inproceedings{yeh2009sikuli,\n\ttitle        = {Sikuli: using {GUI} screenshots for search and automation},\n\tauthor       = {Tom Yeh and Tsung-Hsiang Chang and Rob Miller},\n\tyear         = 2009,\n\tbooktitle    = {User Interface Software and Technology (UIST)}\n}\n@article{yeh2020poverty,\n\ttitle        = {Using publicly available satellite imagery and deep learning to understand economic well-being in Africa},\n\tauthor       = {Christopher Yeh and Anthony Perez and Anne Driscoll and George Azzari and Zhongyi Tang and David Lobell and Stefano Ermon and Marshall Burke},\n\tyear         = 2020,\n\tjournal      = {Nature Communications},\n\tvolume       = 11\n}\n@misc{yelp2017yelp,\n\ttitle        = {Yelp {D}ataset {C}hallenge, {R}ound 8},\n\tauthor       = {Yelp},\n\tyear         = 2017,\n\thowpublished = {\\url{https://www.yelp.com/dataset_challenge}}\n}\n@article{yeredor2002non,\n\ttitle        = {Non-orthogonal joint diagonalization in the least-squares sense with application in blind source separation},\n\tauthor       = {Arie Yeredor},\n\tyear         = 2002,\n\tjournal      = {IEEE Transactions on 
Signal Processing},\n\tvolume       = 50,\n\tnumber       = 7,\n\tpages        = {1545--1553}\n}\n@article{yeredor2004approximate,\n\ttitle        = {Approximate Joint Diagonalization Using a Natural Gradient Approach},\n\tauthor       = {Arie Yeredor and Andreas Ziehe and Klaus-Robert Müller},\n\tyear         = 2004,\n\tjournal      = {Independent Component Analysis and Blind Signal Separation},\n\tvolume       = 1,\n\tpages        = {86--96}\n}\n@inproceedings{yessenalina2010automatically,\n\ttitle        = {Automatically generating annotator rationales to improve sentiment classification},\n\tauthor       = {Ainur Yessenalina and Yejin Choi and Claire Cardie},\n\tyear         = 2010,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {336--341}\n}\n@inproceedings{yi1998efficient,\n\ttitle        = {Efficient Retrieval of Similar Time Sequences Under Time Warping},\n\tauthor       = {Byoung-Kee Yi and H. V. Jagadish and Christos Faloutsos},\n\tyear         = 1998,\n\tbooktitle    = {Proceeding of 14th International Conference on Data Engineering},\n\tpages        = {201--208},\n\tdoi          = {10.1109/ICDE.1998.655778},\n\tkeywords     = {\n\t\tEuclidean distance;FastMap;decelerations;dissimilarity metric;fast\n\n\t\tlinear test;fast similarity searching;field tested dissimilarity\n\n\t\tmetric;indexing viewpoint;large time sequence databases;local accelerations;sequence\n\n\t\tlength;sequential scanning;similar time sequence retrieval;synthetic\n\n\t\tdatasets;time warping;query processing;signal processing;temporal\n\n\t\tdatabases;\n\t},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{yi2000online,\n\ttitle        = {Online Data Mining for Co-Evolving Time Sequences},\n\tauthor       = {\n\t\tByoung-Kee Yi and N.D. 
Sidiropoulos and Theodore Johnson and H.V.\n\n\t\tJagadish and Christos Faloutsos and Alexandros Biliris\n\t},\n\tyear         = 2000,\n\tbooktitle    = {Proceedings of the 16th International Conference on Data Engineering},\n\tlocation     = {San Diego, CA},\n\tpublisher    = {IEEE Computer Society},\n\taddress      = {Washington, DC, USA},\n\tpages        = {13--22},\n\tisbn         = {0-7695-0506-6},\n\tacmid        = 847379,\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@inproceedings{yi2016fast,\n\ttitle        = {Fast Algorithms for Robust {P}{C}{A} via Gradient Descent},\n\tauthor       = {Yi, Xinyang and Park, Dohyung and Chen, Yudong and Caramanis, Constantine},\n\tyear         = 2016,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {4152--4160}\n}\n@inproceedings{yi2018neural,\n\ttitle        = {Neural-symbolic vqa: Disentangling reasoning from vision and language understanding},\n\tauthor       = {Kexin Yi and Jiajun Wu and Chuang Gan and Antonio Torralba and Pushmeet Kohli and Josh Tenenbaum},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1031--1042}\n}\n@inproceedings{yih2013enhanced,\n\ttitle        = {Question Answering Using Enhanced Lexical Semantic Models},\n\tauthor       = {Wen-tau Yih and Ming-Wei Chang and Christopher Meek and Andrzej Pastusiak},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yih2015stagg,\n\ttitle        = {Semantic Parsing via Staged Query Graph Generation: Question Answering with Knowledge Base},\n\tauthor       = {Wen-tau Yih and Ming-Wei Chang and Xiaodong He and Jianfeng Gao},\n\tyear         = 2015,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yih2016value,\n\ttitle        = {The Value of Semantic Parse Labeling for Knowledge Base Question Answering},\n\tauthor       = {Wen-tau Yih 
and Matthew Richardson and Chris Meek and Ming-Wei Chang and Jina Suh},\n\tyear         = 2016,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{yildirim2008two,\n\ttitle        = {Two algorithms for the minimum enclosing ball problem},\n\tauthor       = {Yildirim, E. Alper},\n\tyear         = 2008,\n\tjournal      = {SIAM Journal on Optimization},\n\tpublisher    = {SIAM},\n\tvolume       = 19,\n\tnumber       = 3,\n\tpages        = {1368--1391}\n}\n@inproceedings{yilmaz2008simple,\n\ttitle        = {A simple and efficient sampling method for estimating {AP} and {NDCG}},\n\tauthor       = {Emine Yilmaz and Evangelos Kanoulas and Javed A Aslam},\n\tyear         = 2008,\n\tbooktitle    = {ACM Special Interest Group on Information Retreival (SIGIR)},\n\tpages        = {603--610}\n}\n@book{yin2003stochastic,\n\ttitle        = {Stochastic {A}pproximation and {R}ecursive {A}lgorithms and {A}pplications},\n\tauthor       = {Yin, G George and Kushner, Harold J},\n\tyear         = 2003,\n\tpublisher    = {Springer},\n\tvolume       = 35\n}\n@inproceedings{yin2015convolutional,\n\ttitle        = {Convolutional neural network for paraphrase identification},\n\tauthor       = {Wenpeng Yin and Hinrich Sch{\\\"u}tze},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{yin2015enquirer,\n\ttitle        = {Neural Enquirer: Learning to Query Tables},\n\tauthor       = {Pengcheng Yin and Zhengdong Lu and Hang Li and Ben Kao},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1512.00965}\n}\n@article{yin2016abcnn,\n\ttitle        = {{ABCNN}: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs},\n\tauthor       = {Wenpeng Yin and Hinrich Sch{\\\"u}tze and Bing Xiang and Bowen Zhou},\n\tyear         = 2016,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 
4\n}\n@article{yin2016neural,\n\ttitle        = {Neural Enquirer: Learning to Query Tables with Natural Language},\n\tauthor       = {Pengcheng Yin and Zhengdong Lu and Hang Li and Ben Kao},\n\tyear         = 2016,\n\tjournal      = {arXiv}\n}\n@inproceedings{yin2017syntactic,\n\ttitle        = {A Syntactic Neural Model for General-Purpose Code Generation},\n\tauthor       = {Pengcheng Yin and Graham Neubig},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {440--450}\n}\n@inproceedings{yin2019fourier,\n\ttitle        = {A fourier perspective on model robustness in computer vision},\n\tauthor       = {Dong Yin and Raphael Gontijo Lopes and Jonathon Shlens and Ekin D Cubuk and Justin Gilmer},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{yin2019rademacher,\n\ttitle        = {Rademacher Complexity for Adversarially Robust Generalization},\n\tauthor       = {Dong Yin and Ramchandran Kannan and Peter Bartlett},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {7085--7094}\n}\n@inproceedings{YinTatPaper,\n\ttitle        = {Uniform Sampling for Matrix Approximation},\n\tauthor       = {Michael B. 
Cohen and Yin Tat Lee and Cameron Musco and Christopher Musco and Richard Peng and Aaron Sidford},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 2015 Conference on Innovations in Theoretical Computer Science, {ITCS} 2015, Rehovot, Israel, January 11-13, 2015},\n\tpages        = {181--190},\n\tdoi          = {10.1145/2688073.2688113},\n\turl          = {http://doi.acm.org/10.1145/2688073.2688113},\n\tcrossref     = {DBLP:conf/innovations/2015},\n\ttimestamp    = {Sun, 25 Jan 2015 11:31:05 +0100},\n\tbiburl       = {http://dblp.uni-trier.de/rec/bib/conf/innovations/CohenLMMPS15},\n\tbibsource    = {dblp computer science bibliography, http://dblp.org}\n}\n@article{yogatama2019learning,\n\ttitle        = {Learning and Evaluating General Linguistic Intelligence},\n\tauthor       = {Dani Yogatama and Cyprien de Masson d'Autume and Jerome Connor and Tomas Kocisky and Mike Chrzanowski and Lingpeng Kong and Angeliki Lazaridou and Wang Ling and Lei Yu and Chris Dyer and others},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.11373}\n}\n@inproceedings{yoon2019compare,\n\ttitle        = {A Compare-Aggregate Model with Latent Clustering for Answer Selection},\n\tauthor       = {Seunghyun Yoon and Franck Dernoncourt and Doo Soon Kim and Trung Bui and Kyomin Jung},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Information and Knowledge Management (CIKM)}\n}\n@inproceedings{yossef2011context,\n\ttitle        = {Context-sensitive query auto-completion},\n\tauthor       = {Ziv Bar-Yossef and Naama Kraus},\n\tyear         = 2011,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {107--116}\n}\n@inproceedings{you2017crop,\n\ttitle        = {Deep {gaussian} process for crop yield prediction based on remote sensing data},\n\tauthor       = {Jiaxuan You and Xiaocheng Li and Melvin Low and David Lobell and Stefano Ermon},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence 
(AAAI)}\n}\n@inproceedings{you2017deep,\n\ttitle        = {Deep Lattice Networks and Partial Monotonic Functions},\n\tauthor       = {Seungil You and David Ding and Kevin Canini and Jan Pfeifer and Maya Gupta},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {2985--2993}\n}\n@article{you2017large,\n\ttitle        = {Large batch training of convolutional networks},\n\tauthor       = {You, Yang and Gitman, Igor and Ginsburg, Boris},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1708.03888}\n}\n@article{you2017provable,\n\ttitle        = {Provable Self-Representation Based Outlier Detection in a Union of Subspaces},\n\tauthor       = {Chong You and Daniel P. Robinson and Ren{\\'e} Vidal},\n\tyear         = 2017,\n\tjournal      = {arXiv}\n}\n@inproceedings{you2018moleculegraph,\n\ttitle        = {Graph Convolutional Policy Network for Goal-Directed Molecular Graph Generation},\n\tauthor       = {J. You and B. Liu and Z. Ying and V. Pande and J. Leskovec},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{Young14,\n\ttitle        = {Nearly Linear-Time Approximation Schemes for Mixed Packing/Covering and Facility-Location Linear Programs},\n\tauthor       = {Neal E. 
Young},\n\tyear         = 2014,\n\tmonth        = jul,\n\tjournal      = {ArXiv e-prints},\n\tvolume       = {abs/1407.3015},\n\turl          = {http://arxiv.org/abs/1407.3015}\n}\n@article{young2000probabilistic,\n\ttitle        = {Probabilistic methods in spoken-dialogue systems},\n\tauthor       = {Steve J Young},\n\tyear         = 2000,\n\tjournal      = {Philosophical Transactions of the Royal Society of London A: Mathematical, Physical and Engineering Sciences},\n\tvolume       = 358,\n\tnumber       = 1769,\n\tpages        = {1389--1402}\n}\n@inproceedings{Young2001,\n\ttitle        = {{Sequential and parallel algorithms for mixed packing and covering}},\n\tauthor       = {Young, Neal E.},\n\tyear         = 2001,\n\tbooktitle    = {42nd Annual IEEE Symposium on Foundations of Computer Science (FOCS'01)},\n\tpublisher    = {IEEE Comput. Soc},\n\tpages        = {538--546},\n\tdoi          = {10.1109/SFCS.2001.959930},\n\tisbn         = {0-7695-1116-3},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/Documents/Mendeley Desktop/Young - 2001 - Sequential and parallel algorithms for mixed packing and covering.pdf:pdf},\n\tmendeley-groups = {Algorithms/Multiplicative Weight/LP}\n}\n@inproceedings{young2013pomdp,\n\ttitle        = {{POMDP}-based statistical spoken dialog systems: A review},\n\tauthor       = {Steve Young and Milica Ga{\\v{s}}i{\\'c} and Blaise Thomson and Jason D Williams},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the IEEE},\n\tnumber       = 5,\n\tpages        = {1160--1179}\n}\n@article{young2014image,\n\ttitle        = {From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions},\n\tauthor       = {P. Young and A. Lai and M. Hodosh and J. 
Hockenmaier},\n\tyear         = 2014,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 2,\n\tpages        = {67--78}\n}\n@article{younger1967recognition,\n\ttitle        = {Recognition and parsing of context-free languages in time $n^3$},\n\tauthor       = {Daniel H Younger},\n\tyear         = 1967,\n\tjournal      = {Information and control},\n\tvolume       = 10,\n\tnumber       = 2,\n\tpages        = {189--208}\n}\n@inproceedings{younger2001meta,\n\ttitle        = {Meta-learning with backpropagation},\n\tauthor       = {A Steven Younger and Sepp Hochreiter and Peter R Conwell},\n\tyear         = 2001,\n\tbooktitle    = {IJCNN'01. International Joint Conference on Neural Networks. Proceedings (Cat. No. 01CH37222)},\n\tvolume       = 3\n}\n@inproceedings{ys17,\n\ttitle        = {Mean Field Residual Networks: On the Edge of Chaos},\n\tauthor       = {Yang, Greg and Schoenholz, Samuel},\n\tyear         = 2017,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)},\n\tpages        = {7103--7114}\n}\n@article{ys18,\n\ttitle        = {Deep Mean Field Theory: Layerwise Variance and Width Variation as Methods to Control Gradient Explosion},\n\tauthor       = {Yang, Greg and Schoenholz, Sam S.},\n\tyear         = 2018,\n\tjournal      = {ICLR open review},\n\turl          = {https://openreview.net/forum?id=rJGY8GbR-}\n}\n@inproceedings{yu04grounding,\n\ttitle        = {On the integration of grounding language and learning objects},\n\tauthor       = {C. Yu and D. H. Ballard},\n\tyear         = 2004,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {488--493}\n}\n@article{yu07unified,\n\ttitle        = {A Unified Model of Early Word Learning: Integrating Statistical and Social Cues},\n\tauthor       = {C. Yu and D. H. 
Ballard},\n\tyear         = 2007,\n\tjournal      = {Neurocomputing},\n\tvolume       = 70,\n\tnumber       = 13,\n\tpages        = {2149--2165}\n}\n@article{yu2005haptics,\n\ttitle        = {A novel multimodal interface for improving visually impaired people's web accessibility},\n\tauthor       = {Wai Yu and Ravi Kuber and Emma Murphy and Philip Strain and Graham McAllister},\n\tyear         = 2005,\n\tjournal      = {Virtual Reality},\n\tvolume       = 9\n}\n@article{yu2011calibration,\n\ttitle        = {Calibration of Confidence Measures in Speech Recognition},\n\tauthor       = {Dong Yu and Jinyu Li and Li Deng},\n\tyear         = 2011,\n\tjournal      = {Trans. Audio, Speech and Lang. Proc.},\n\tvolume       = 19,\n\tnumber       = 8,\n\tpages        = {2461--2473}\n}\n@article{yu2012analysis,\n\ttitle        = {Analysis of kernel mean matching under covariate shift},\n\tauthor       = {Yu, Yaoliang and Szepesv{\\'a}ri, Csaba},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1206.4650}\n}\n@inproceedings{yu2013grounded,\n\ttitle        = {Grounded Language Learning from Video Described with Sentences},\n\tauthor       = {H. Yu and J. M. 
Siskind},\n\tyear         = 2013,\n\tbooktitle    = {Association for Computational Linguistics (ACL)},\n\tpages        = {53--63}\n}\n@inproceedings{yu2013max,\n\ttitle        = {Max-Violation {P}erceptron and Forced Decoding for Scalable {MT} Training},\n\tauthor       = {Heng Yu and Liang Huang and Haitao Mi and Kai Zhao},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)},\n\tpages        = {1112--1123}\n}\n@article{yu2016dcr,\n\ttitle        = {End-to-End Answer Chunk Extraction and Ranking for Reading Comprehension},\n\tauthor       = {Yang Yu and Wei Zhang and Kazi Hasan and Mo Yu and Bing Xiang and Bowen Zhou},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1610.09996}\n}\n@article{yu2017compositional,\n\ttitle        = {A Deep Compositional Framework for Human-like Language Acquisition in Virtual Environment},\n\tauthor       = {Haonan Yu and Haichao Zhang and Wei Xu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.09831}\n}\n@inproceedings{yu2017joint,\n\ttitle        = {A Joint Speaker-Listener-Reinforcer Model for Referring Expressions},\n\tauthor       = {Licheng Yu and Hao Tan and Mohit Bansal and Tamara L. Berg},\n\tyear         = 2017,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{yu2017seq,\n\ttitle        = {SeqGAN: Sequence Generative Adversarial Nets with Policy Gradient},\n\tauthor       = {Lantao Yu and Weinan Zhang and Jun Wang and Yong Yu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1609.05473}\n}\n@inproceedings{yu2017skim,\n\ttitle        = {Learning to Skim Text},\n\tauthor       = {Adams Wei Yu and Hongrae Lee and Quoc V. 
Le},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yu2018neural,\n\ttitle        = {A Neural Approach to Pun Generation},\n\tauthor       = {Zhiwei Yu and Jiwei Tan and Xiaojun Wan},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{yu2018qanet,\n\ttitle        = {{QANet}: Combining Local Convolution with Global Self-Attention for Reading Comprehension},\n\tauthor       = {Adams Wei Yu and David Dohan and Minh-Thang Luong and Rui Zhao and Kai Chen and Mohammad Norouzi and Quoc V. Le},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{yu2018spider,\n\ttitle        = {Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-{SQL} Task},\n\tauthor       = {Tao Yu and Rui Zhang and Kai Yang and Michihiro Yasunaga and Dongxu Wang and Zifan Li and James Ma and Irene Li and Qingning Yao and Shanelle Roman and Zilin Zhang and Dragomir R. Radev},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yu2018syntaxsqlnet,\n\ttitle        = {{SyntaxSQLNet}: Syntax Tree Networks for Complex and Cross-Domain Text-to-{SQL} Task},\n\tauthor       = {Tao Yu and Michihiro Yasunaga and Kai Yang and Rui Zhang and Dongxu Wang and Zifan Li and Dragomir R. Radev},\n\tyear         = 2018,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{yu2019cosql,\n\ttitle        = {CoSQL: A Conversational Text-to-{SQL} Challenge Towards Cross-Domain Natural Language Interfaces to Databases},\n\tauthor       = {Tao Yu and Rui Zhang and He Yang Er and Suyi Li and Eric Xue and Bo Pang and Xi Victoria Lin and Yi Chern Tan and Tianze Shi and Zihan Li and Youxuan Jiang and Michihiro Yasunaga and Sungrok Shim and Tao Chen and Alexander R. 
Fabbri and Zifan Li and Luyao Chen and Yuwen Zhang and Shreya Dixit and Vincent Zhang and Caiming Xiong and Richard Socher and Walter S. Lasecki and Dragomir R. Radev},\n\tyear         = 2019,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{yu2019meta,\n\ttitle        = {Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning},\n\tauthor       = {Tianhe Yu and Deirdre Quillen and Zhanpeng He and Ryan Julian and Karol Hausman and Chelsea Finn and Sergey Levine},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.10897}\n}\n@inproceedings{yu2019unsupervised,\n\ttitle        = {Unsupervised Visuomotor Control through Distributional Planning Networks},\n\tauthor       = {Tianhe Yu and Gleb Shevchuk and Dorsa Sadigh and Chelsea Finn},\n\tyear         = 2019,\n\tbooktitle    = {Robotics: Science and Systems (RSS)}\n}\n@inproceedings{yu2020bdd,\n\ttitle        = {BDD100K: A Diverse Driving Dataset for Heterogeneous Multitask Learning},\n\tauthor       = {Fisher Yu and Haofeng Chen and Xin Wang and Wenqi Xian and Yingying Chen and Fangchen Liu and Vashisht Madhavan and Trevor Darrell},\n\tyear         = 2020,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{yu2020mopo,\n\ttitle        = {MOPO: Model-based Offline Policy Optimization},\n\tauthor       = {Yu, Tianhe and Thomas, Garrett and Yu, Lantao and Ermon, Stefano and Zou, James Y and Levine, Sergey and Finn, Chelsea and Ma, Tengyu},\n\tyear         = 2020,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tvolume       = 33,\n\tpages        = {14129--14142}\n}\n@article{yu2021surprising,\n\ttitle        = {The surprising effectiveness of mappo in cooperative, multi-agent games},\n\tauthor       = {Yu, Chao and Velu, Akash and Vinitsky, Eugene and Wang, Yu and Bayen, Alexandre and Wu, Yi},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint 
arXiv:2103.01955}\n}\n@inproceedings{yuan2015lightlda,\n\ttitle        = {LightLDA: Big Topic Models on Modest Compute Clusters},\n\tauthor       = {Jinhui Yuan and Fei Gao and Qirong Ho and Wei Dai and Jinliang Wei and Xun Zheng and Eric P. Xing and Tie-Yan Liu and Wei-Ying Ma},\n\tyear         = 2015,\n\tbooktitle    = {World Wide Web (WWW)}\n}\n@article{yuan2017machine,\n\ttitle        = {Machine Comprehension by Text-to-Text Neural Question Generation},\n\tauthor       = {Xingdi Yuan and Tong Wang and Caglar Gulcehre and Alessandro Sordoni and Philip Bachman and Sandeep Subramanian and Saizheng Zhang and Adam Trischler},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.02012}\n}\n@inproceedings{yue2007support,\n\ttitle        = {A support vector method for optimizing average precision},\n\tauthor       = {Yisong Yue and Thomas Finley and Filip Radlinski and Thorsten Joachims},\n\tyear         = 2007,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval (SIGIR)}\n}\n@inproceedings{yue2019domain,\n\ttitle        = {Domain randomization and pyramid consistency: Simulation-to-real generalization without accessing target domain data},\n\tauthor       = {Yue, Xiangyu and Zhang, Yang and Zhao, Sicheng and Sangiovanni-Vincentelli, Alberto and Keutzer, Kurt and Gong, Boqing},\n\tyear         = 2019,\n\tbooktitle    = {Proceedings of the IEEE/CVF International Conference on Computer Vision},\n\tpages        = {2100--2110}\n}\n@article{yun2017global,\n\ttitle        = {Global optimality conditions for deep neural networks},\n\tauthor       = {Yun, Chulhee and Sra, Suvrit and Jadbabaie, Ali},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1707.02444}\n}\n@article{yun2018critical,\n\ttitle        = {A Critical View of Global Optimality in Deep Learning},\n\tauthor       = {Yun, Chulhee and Sra, Suvrit and Jadbabaie, Ali},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint 
arXiv:1802.03487}\n}\n@article{yun2018small,\n\ttitle        = {Small ReLU networks are powerful memorizers: a tight analysis of memorization capacity},\n\tauthor       = {Yun, Chulhee and Sra, Suvrit and Jadbabaie, Ali},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1810.07770}\n}\n@article{yun2019transformers,\n\ttitle        = {Are Transformers universal approximators of sequence-to-sequence functions?},\n\tauthor       = {Yun, Chulhee and Bhojanapalli, Srinadh and Rawat, Ankit Singh and Reddi, Sashank J and Kumar, Sanjiv},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1912.10077}\n}\n@inproceedings{Yun2020Are,\n\ttitle        = {Are Transformers universal approximators of sequence-to-sequence functions?},\n\tauthor       = {Chulhee Yun and Srinadh Bhojanapalli and Ankit Singh Rawat and Sashank Reddi and Sanjiv Kumar},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=ByxRM0Ntvr}\n}\n@techreport{yun2020evaluating,\n\ttitle        = {Evaluating the Robustness of Natural Language Reward Shaping Models to Spatial Relations},\n\tauthor       = {Anthony Yun},\n\tyear         = 2020,\n\tinstitution  = {The University of Texas at Austin}\n}\n@article{yurochkin2020sensei,\n\ttitle        = {SenSeI: Sensitive Set Invariance for Enforcing Individual Fairness},\n\tauthor       = {Mikhail Yurochkin and Yuekai Sun},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.14168}\n}\n@article{yy1241,\n\ttitle        = {An Alternative View: When Does {SGD} Escape Local Minima?},\n\tauthor       = {Robert Kleinberg and Yuanzhi Li and Yang Yuan},\n\tyear         = 2018,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1802.06175},\n\turl          = {http://arxiv.org/abs/1802.06175},\n\tarchiveprefix = {arXiv},\n\teprint       = {1802.06175},\n\ttimestamp    = {Mon, 13 Aug 2018 16:48:44 +0200},\n\tbiburl       = 
{https://dblp.org/rec/bib/journals/corr/abs-1802-06175},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n@article{zadeh1983computational,\n\ttitle        = {A computational approach to fuzzy quantifiers in natural languages},\n\tauthor       = {Lotfi A. Zadeh},\n\tyear         = 1983,\n\tjournal      = {Computers and Mathematics with Applications},\n\tvolume       = 9,\n\tnumber       = 1\n}\n@inproceedings{zadrozny2001calibrated,\n\ttitle        = {Obtaining calibrated probability estimates from decision trees and naive Bayesian classifiers},\n\tauthor       = {Bianca Zadrozny and Charles Elkan},\n\tyear         = 2001,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {609--616}\n}\n@inproceedings{zadrozny2001obtaining,\n\ttitle        = {Obtaining calibrated probability estimates from decision trees and naive Bayesian classifiers},\n\tauthor       = {Zadrozny, Bianca and Elkan, Charles},\n\tyear         = 2001,\n\tbooktitle    = {Icml},\n\tvolume       = 1,\n\tpages        = {609--616},\n\torganization = {Citeseer}\n}\n@inproceedings{zadrozny2002transforming,\n\ttitle        = {Transforming classifier scores into accurate multiclass probability estimates},\n\tauthor       = {Zadrozny, Bianca and Elkan, Charles},\n\tyear         = 2002,\n\tbooktitle    = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining},\n\tpages        = {694--699},\n\torganization = {ACM}\n}\n@inproceedings{zafar2017fairness,\n\ttitle        = {Fairness beyond disparate treatment \\& disparate impact: Learning classification without disparate mistreatment},\n\tauthor       = {Muhammad Bilal Zafar and Isabel Valera and Manuel Gomez Rodriguez and Krishna P Gummadi},\n\tyear         = 2017,\n\tbooktitle    = {World Wide Web (WWW)},\n\tpages        = {1171--1180}\n}\n@article{zagoruyko2016wide,\n\ttitle        = {Wide Residual Networks},\n\tauthor       = {Zagoruyko, Sergey and 
Komodakis, Nikos},\n\tyear         = 2016,\n\tjournal      = {NIN},\n\tbooktitle    = {British Machine Vision Conference},\n\tvolume       = 8,\n\tpages        = {35--67}\n}\n@inproceedings{zaharia2012resilient,\n\ttitle        = {Resilient distributed datasets: A fault-tolerant abstraction for in-memory cluster computing},\n\tauthor       = {Zaharia, Matei and Chowdhury, Mosharaf and Das, Tathagata and Dave, Ankur and Ma, Justin and McCauley, Murphy and Franklin, Michael J and Shenker, Scott and Stoica, Ion},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 9th USENIX conference on Networked Systems Design and Implementation},\n\tpages        = {2--2},\n\torganization = {USENIX Association}\n}\n@inproceedings{zaidan07annotator,\n\ttitle        = {Using \"Annotator Rationales\" to Improve Machine Learning for Text Categorization},\n\tauthor       = {Omar F. Zaidan and Jason Eisner and Christine D. Piatko},\n\tyear         = 2007,\n\tbooktitle    = {Human Language Technology and North American Association for Computational Linguistics (HLT/NAACL)}\n}\n@inproceedings{zaidan08annotator,\n\ttitle        = {Modeling Annotators: A Generative Approach to Learning from Annotator Rationales},\n\tauthor       = {Omar F. Zaidan and Jason Eisner},\n\tyear         = 2008,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zaidan2011crowdsourcing,\n\ttitle        = {Crowdsourcing Translation: Professional Quality from Non-Professionals},\n\tauthor       = {Omar F.  
Zaidan and Chris  Callison-Burch},\n\tyear         = 2011,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{zajicek1998blind,\n\ttitle        = {A Web Navigation Tool for the Blind},\n\tauthor       = {Mary Zajicek and Chris Powell and Chris Reeves},\n\tyear         = 1998,\n\tbooktitle    = {International ACM Conference on Assistive Technologies}\n}\n@inproceedings{zanette2019limiting,\n\ttitle        = {Limiting Extrapolation in Linear Approximate Value Iteration},\n\tauthor       = {Zanette, Andrea and Lazaric, Alessandro and Kochenderfer, Mykel J and Brunskill, Emma},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {5616--5625}\n}\n@inproceedings{zanette2019tighter,\n\ttitle        = {Tighter Problem-Dependent Regret Bounds in Reinforcement Learning without Domain Knowledge using Value Function Bounds},\n\tauthor       = {Zanette, Andrea and Brunskill, Emma},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {7304--7312}\n}\n@inproceedings{zanette2020learning,\n\ttitle        = {Learning Near Optimal Policies with Low Inherent {Bellman} Error},\n\tauthor       = {Zanette, Andrea and Lazaric, Alessandro and Kochenderfer, Mykel and Brunskill, Emma},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning}\n}\n@inproceedings{zarriass2017obtaining,\n\ttitle        = {Obtaining referential word meanings from visual and distributional information: Experiments on object naming},\n\tauthor       = {Sina Zarriai{\\ss} and David Schlangen},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{zavershynskyi2018naps,\n\ttitle        = {{NAPS}: Natural Program Synthesis Dataset},\n\tauthor       = {Maksym Zavershynskyi and Alexander Skidanov and Illia Polosukhin},\n\tyear         = 2018,\n\tbooktitle    = {Workshop on Neural 
Abstract Machines \\& Program Induction (NAMPI)}\n}\n@article{zbontar2021barlow,\n\ttitle        = {Barlow twins: Self-supervised learning via redundancy reduction},\n\tauthor       = {Zbontar, Jure and Jing, Li and Misra, Ishan and LeCun, Yann and Deny, St{\\'e}phane},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2103.03230}\n}\n@inproceedings{ZCZ09,\n\ttitle        = {Inverse Time Dependency in Convex Regularized Learning},\n\tauthor       = {Zhu, Zeyuan Allen and Chen, Weizhu and Zhu, Chenguang and Wang, Gang and Wang, Haixun and Chen, Zheng},\n\tyear         = 2009,\n\tseries       = {ICDM}\n}\n@inproceedings{zech2018radio,\n\ttitle        = {Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs: A cross-sectional study},\n\tauthor       = {John R. Zech and Marcus A. Badgeley and Manway Liu and Anthony B. Costa and Joseph J. Titano and Eric Karl Oermann},\n\tyear         = 2018,\n\tbooktitle    = {PLOS Medicine}\n}\n@inproceedings{zeichner2012crowdsourcing,\n\ttitle        = {Crowdsourcing Inference-rule Evaluation},\n\tauthor       = {Naomi Zeichner and Jonathan Berant and Ido Dagan},\n\tyear         = 2012,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{zeiler2012adadelta,\n\ttitle        = {ADADELTA: an adaptive learning rate method},\n\tauthor       = {Zeiler, Matthew D},\n\tyear         = 2012,\n\tjournal      = {arXiv preprint arXiv:1212.5701}\n}\n@article{zelenko2003kernel,\n\ttitle        = {Kernel methods for relation extraction},\n\tauthor       = {Dmitry Zelenko and Chinatsu Aone and Anthony Richardella},\n\tyear         = 2003,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 3,\n\tnumber       = {0},\n\tpages        = {1083--1106}\n}\n@inproceedings{zelle96geoquery,\n\ttitle        = {Learning to parse database queries using inductive logic programming},\n\tauthor       = {M. Zelle and R. J. 
Mooney},\n\tyear         = 1996,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)},\n\tpages        = {1050--1055}\n}\n@inproceedings{zellers2019neuralfakenews,\n\ttitle        = {Defending Against Neural Fake News},\n\tauthor       = {Rowan Zellers and Ari Holtzman and Hannah Rashkin and Yonatan Bisk and Ali Farhadi and Franziska Roesner and Yejin Choi},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {9054--9065}\n}\n@inproceedings{zellers2019recognition,\n\ttitle        = {From recognition to cognition: Visual commonsense reasoning},\n\tauthor       = {Rowan Zellers and Yonatan Bisk and Ali Farhadi and Yejin Choi},\n\tyear         = 2019,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {6720--6731}\n}\n@inproceedings{zemel2013,\n\ttitle        = {Learning Fair Representations},\n\tauthor       = {Richard Zemel and Yu Wu and Kevin Swersky and Toniann Pitassi and Cynthia Dwork},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {325--333}\n}\n@inproceedings{zemel2013learning,\n\ttitle        = {Learning fair representations},\n\tauthor       = {Zemel, Rich and Wu, Yu and Swersky, Kevin and Pitassi, Toni and Dwork, Cynthia},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {325--333}\n}\n@inproceedings{zeng2014relation,\n\ttitle        = {Relation classification via convolutional deep neural network},\n\tauthor       = {Daojian Zeng and Kang Liu and Siwei Lai and Guangyou Zhou and Jun Zhao},\n\tyear         = 2014,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{zeng2020safety,\n\ttitle        = {Safety-critical model predictive control with discrete-time control barrier function},\n\tauthor       = {Zeng, Jun and Zhang, Bike and Sreenath, 
Koushil},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.11718}\n}\n@inproceedings{zermelo1913anwendung,\n\ttitle        = {{\\\"U}ber eine Anwendung der Mengenlehre auf die Theorie des Schachspiels},\n\tauthor       = {Ernst Zermelo},\n\tyear         = 1913,\n\tbooktitle    = {Proceedings of the fifth international congress of mathematicians},\n\tvolume       = 2,\n\tpages        = {501--504}\n}\n@inproceedings{zeroinit2018,\n\ttitle        = {Residual Learning Without Normalization via Better Initialization},\n\tauthor       = {Hongyi Zhang and Yann N. Dauphin and Tengyu Ma},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Learning Representations},\n\turl          = {https://openreview.net/forum?id=H1gsz30cKX}\n}\n@inproceedings{zettlemoyer05ccg,\n\ttitle        = {Learning to Map Sentences to Logical Form: Structured Classification with Probabilistic Categorial Grammars},\n\tauthor       = {Luke S. Zettlemoyer and Michael Collins},\n\tyear         = 2005,\n\tbooktitle    = {Uncertainty in Artificial Intelligence (UAI)},\n\tpages        = {658--666}\n}\n@inproceedings{zettlemoyer07relaxed,\n\ttitle        = {Online Learning of Relaxed {CCG} Grammars for Parsing to Logical Form},\n\tauthor       = {Luke S. Zettlemoyer and Michael Collins},\n\tyear         = 2007,\n\tbooktitle    = {Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP/CoNLL)},\n\tpages        = {678--687}\n}\n@inproceedings{zettlemoyer09context,\n\ttitle        = {Learning Context-dependent Mappings from Sentences to Logical Form},\n\tauthor       = {Luke S. Zettlemoyer and Michael Collins},\n\tyear         = 2009,\n\tbooktitle    = {Association for Computational Linguistics and International Joint Conference on Natural Language Processing (ACL-IJCNLP)}\n}\n@article{ZG01,\n\ttitle        = {Rank-one approximation to high order tensors},\n\tauthor       = {T. Zhang and G. 
Golub},\n\tyear         = 2001,\n\tjournal      = {SIAM Journal on Matrix Analysis and Applications},\n\tvolume       = 23,\n\tpages        = {534--550}\n}\n@inproceedings{zha2001spectral,\n\ttitle        = {Spectral relaxation for {k}-means clustering},\n\tauthor       = {Hongyuan Zha and Xiaofeng He and Chris Ding and Horst Simon and Ming Gu},\n\tyear         = 2001,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)},\n\tpages        = {1057--1064}\n}\n@inproceedings{zhai2017chunking,\n\ttitle        = {Neural Models for Sequence Chunking},\n\tauthor       = {Feifei Zhai and Saloni Potdar and Bing Xiang and Bowen Zhou},\n\tyear         = 2017,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{zhai2019adversarially,\n\ttitle        = {Adversarially Robust Generalization Just Requires More Unlabeled Data},\n\tauthor       = {Runtian Zhai and Tianle Cai and Di He and Chen Dan and Kun He and John Hopcroft and Liwei Wang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.00555}\n}\n@article{zhai2020largescale,\n\ttitle        = {A Large-scale Study of Representation Learning with the Visual Task Adaptation Benchmark},\n\tauthor       = {Xiaohua Zhai and Joan Puigcerver and Alexander Kolesnikov and Pierre Ruyssen and Carlos Riquelme and Mario Lucic and Josip Djolonga and Andre Susano Pinto and Maxim Neumann and Alexey Dosovitskiy and Lucas Beyer and Olivier Bachem and Michael Tschannen and Marcin Michalski and Olivier Bousquet and Sylvain Gelly and Neil Houlsby},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@article{zhang02covering,\n\ttitle        = {Covering number bounds of certain regularized linear function classes},\n\tauthor       = {Tong Zhang},\n\tyear         = 2002,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 2,\n\tpages        = {527--550}\n}\n@inproceedings{zhang12smoothing,\n\ttitle        = {Ontological 
Smoothing for Relation Extraction with Minimal Supervision},\n\tauthor       = {Congle Zhang and Raphael Hoffmann and Daniel S. Weld},\n\tyear         = 2012,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@inproceedings{zhang2001jordan,\n\ttitle        = {The Jordan canonical form of a real random matrix},\n\tauthor       = {Z. N. Zhang},\n\tyear         = 2001,\n\tbooktitle    = {Numer. Math. J. Chinese Univ.},\n\tissue        = 23,\n\towner        = {leili},\n\tpage         = {363-367},\n\ttimestamp    = {2011.07.28}\n}\n@article{zhang2002covering,\n\ttitle        = {Covering number bounds of certain regularized linear function classes},\n\tauthor       = {Zhang, Tong},\n\tyear         = 2002,\n\tmonth        = mar,\n\tjournal      = {Journal of Machine Learning Research},\n\tpublisher    = {JMLR.org},\n\tvolume       = 2,\n\tnumber       = {Mar},\n\tpages        = {527--550},\n\tdoi          = {10.1162/153244302760200713},\n\tissn         = {1532-4435},\n\turl          = {https://doi.org/10.1162/153244302760200713},\n\tissue_date   = {3/1/2002},\n\tabstract     = {Recently, sample complexity bounds have been derived for problems involving linear functions such as neural networks and support vector machines. In many of these theoretical studies, the concept of covering numbers played an important role. It is thus useful to study covering numbers for linear function classes. In this paper, we investigate two closely related methods to derive upper bounds on these covering numbers. The first method, already employed in some earlier studies, relies on the so-called Maurey's lemma; the second method uses techniques from the mistake bound framework in online learning. 
We compare results from these two methods, as well as their consequences in some learning formulations.},\n\tnumpages     = 24,\n\tkeywords     = {mistake bounds, learning sample complexity, sparse approximation, covering numbers}\n}\n@inproceedings{zhang2004solving,\n\ttitle        = {Solving large scale linear prediction problems using stochastic gradient descent algorithms},\n\tauthor       = {Zhang, Tong},\n\tyear         = 2004,\n\tbooktitle    = {Proceedings of the 21st International Conference on Machine Learning},\n\tseries       = {ICML 2004}\n}\n@article{zhang2006from,\n\ttitle        = {From $\\epsilon$-entropy to {KL}-entropy: Analysis of minimum information complexity density estimation},\n\tauthor       = {Zhang, Tong},\n\tyear         = 2006,\n\tjournal      = {The Annals of Statistics},\n\tpublisher    = {Institute of Mathematical Statistics}\n}\n@inproceedings{zhang2006learning,\n\ttitle        = {Learning from Incomplete Ratings Using Non-negative Matrix Factorization.},\n\tauthor       = {Zhang, Sheng and Wang, Weihong and Ford, James and Makedon, Fillia},\n\tyear         = 2006,\n\tbooktitle    = {SDM},\n\torganization = {SIAM}\n}\n@article{zhang2008bibliographical,\n\ttitle        = {Bibliographical review on reconfigurable fault-tolerant control systems},\n\tauthor       = {Youmin Zhang and Jin Jiang},\n\tyear         = 2008,\n\tjournal      = {Annual reviews in control},\n\tvolume       = 32,\n\tnumber       = 2,\n\tpages        = {229--252}\n}\n@inproceedings{zhang2010chart,\n\ttitle        = {Chart pruning for fast lexicalised-grammar parsing},\n\tauthor       = {Yue Zhang and Byung-Gyu Ahn and Stephen Clark and Curt Van Wyk and James R. 
Curran and Laura Rimell},\n\tyear         = 2010,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)}\n}\n@article{zhang2010clustered,\n\ttitle        = {Clustered {Nystr{\\\"o}m} method for large scale manifold learning and dimension reduction},\n\tauthor       = {Zhang, Kai and Kwok, James T},\n\tyear         = 2010,\n\tjournal      = {Neural Networks, IEEE Transactions on},\n\tpublisher    = {IEEE},\n\tvolume       = 21,\n\tnumber       = 10,\n\tpages        = {1576--1587}\n}\n@inproceedings{zhang2012communication,\n\ttitle        = {Communication-efficient algorithms for statistical optimization},\n\tauthor       = {Zhang, Yuchen and Wainwright, Martin J and Duchi, John C},\n\tyear         = 2012,\n\tbooktitle    = {Advances in Neural Information Processing Systems(NIPS)}\n}\n@inproceedings{zhang2012scaling,\n\ttitle        = {Scaling up kernel svm on limited resources: A low-rank linearization approach},\n\tauthor       = {Zhang, Kai and Lan, Liang and Wang, Zhuang and Moerchen, Fabian},\n\tyear         = 2012,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {1425--1434}\n}\n@inproceedings{zhang2013automatic,\n\ttitle        = {Automatic Extraction of Top-k Lists from the Web},\n\tauthor       = {Zhixian Zhang and Kenny Q Zhu and Haixun Wang and Hongsong Li},\n\tyear         = 2013,\n\tbooktitle    = {International Conference on Data Engineering}\n}\n@inproceedings{zhang2013coordinating,\n\ttitle        = {Coordinating Multi-agent Reinforcement Learning with Limited Communication},\n\tauthor       = {Chongjie Zhang and Victor Lesser},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 2013 International Conference on Autonomous Agents and Multi-agent Systems},\n\tpages        = {1101--1108}\n}\n@article{zhang2013denoising,\n\ttitle        = {Denoising deep neural networks based voice activity detection},\n\tauthor       = {Xiao-Lei Zhang and Ji Wu},\n\tyear      
   = 2013,\n\tjournal      = {arXiv}\n}\n@inproceedings{zhang2013domain,\n\ttitle        = {Domain adaptation under target and conditional shift},\n\tauthor       = {Zhang, Kun and Muandet, Krikamol and Wang, Zhikun and others},\n\tyear         = 2013,\n\tbooktitle    = {Proceedings of the 30th International Conference on Machine Learning (ICML-13)},\n\tpages        = {819--827}\n}\n@article{zhang2013gradient,\n\ttitle        = {Gradient methods for convex minimization: better rates under weaker conditions},\n\tauthor       = {Zhang, Hui and Yin, Wotao},\n\tyear         = 2013,\n\tjournal      = {arXiv preprint arXiv:1303.4645}\n}\n@inproceedings{zhang2013information,\n\ttitle        = {Information-theoretic lower bounds for distributed statistical estimation with communication constraints},\n\tauthor       = {Zhang, Yuchen and Duchi, John and Jordan, Michael I and Wainwright, Martin J},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NIPS)}\n}\n@inproceedings{zhang2013online,\n\ttitle        = {Online learning for inexact hypergraph search},\n\tauthor       = {Hao Zhang and Liang Huang and Kai Zhao and Ryan McDonald},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhang2013parallelparaphrase,\n\ttitle        = {Harvesting Parallel News Streams to Generate Paraphrases of Event Relations},\n\tauthor       = {Congle Zhang and Daniel S. 
Weld},\n\tyear         = 2013,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhang2014chinese,\n\ttitle        = {Chinese Poetry Generation with Recurrent Neural Networks},\n\tauthor       = {Xingxing Zhang and Mirella Lapata},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{zhang2014face,\n\ttitle        = {Context-assisted face clustering framework with human-in-the-loop},\n\tauthor       = {Liyan Zhang and Dmitri V. Kalashnikov and Sharad Mehrotra},\n\tyear         = 2014,\n\tjournal      = {International Journal of Multimedia Information Retrieval},\n\tvolume       = 3,\n\tnumber       = 2,\n\tpages        = {69--88}\n}\n@inproceedings{zhang2014greed,\n\ttitle        = {Greed is Good if Randomized: New Inference for Dependency Parsing},\n\tauthor       = {Yuan Zhang and Tao Lei and Regina Barzilay and Tommi Jaakkola},\n\tyear         = 2014,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhang2014lower,\n\ttitle        = {Lower bounds on the performance of polynomial-time algorithms for sparse linear regression},\n\tauthor       = {Yuchen Zhang and Martin J. Wainwright and Michael I. 
Jordan},\n\tyear         = 2014,\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@inproceedings{zhang2015character,\n\ttitle        = {Character-level Convolutional Networks for Text Classification},\n\tauthor       = {Xiang Zhang and Junbo Zhao and Yann LeCun},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@phdthesis{zhang2015deepdive,\n\ttitle        = {{DeepDive}: a data management system for automatic knowledge base construction},\n\tauthor       = {Ce Zhang},\n\tyear         = 2015,\n\tschool       = {University of Wisconsin-Madison}\n}\n@article{zhang2015l1,\n\ttitle        = {$\\ell_1$-regularized Neural Networks are Improperly Learnable in Polynomial Time},\n\tauthor       = {Yuchen Zhang and Jason D Lee and Michael I Jordan},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1510.03528}\n}\n@article{zhang2015learning,\n\ttitle        = {Learning halfspaces and neural networks with random initialization},\n\tauthor       = {Zhang, Yuchen and Lee, Jason D and Wainwright, Martin J and Jordan, Michael I},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1511.07948}\n}\n@article{zhang2015parallelevents,\n\ttitle        = {Exploiting Parallel News Streams for Unsupervised Event Extraction},\n\tauthor       = {Congle Zhang and Stephen Soderland and Daniel S. 
Weld},\n\tyear         = 2015,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 3\n}\n@inproceedings{zhang2015stochastic,\n\ttitle        = {Stochastic primal-dual coordinate method for regularized empirical risk minimization},\n\tauthor       = {Zhang, Yuchen and Xiao, Lin},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning},\n\tvolume       = 951,\n\tpages        = 2015,\n\turl          = {http://arxiv.org/abs/1409.3257},\n\tabstract     = {We consider a generic convex optimization problem associated with regularized empirical risk minimization of linear predictors. The problem structure allows us to reformulate it as a convex-concave saddle point problem. We propose a stochastic primal-dual coordinate (SPDC) method, which alternates between maximizing over a randomly chosen dual variable and minimizing over the primal variable. An extrapolation step on the primal variable is performed to obtain accelerated convergence rate. We also develop a mini-batch version of the SPDC method which facilitates parallel computing, and an extension with weighted sampling probabilities on the dual variables, which has a better complexity than uniform sampling on unnormalized data. 
Both theoretically and empirically, we show that the SPDC method has comparable or better performance than several state-of-the-art optimization methods.},\n\tarchiveprefix = {arXiv},\n\tarxivid      = {1409.3257},\n\teprint       = {1409.3257},\n\tfile         = {:C$\\backslash$:/Users/Zeyuan/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/1ca7470da54bcc99493d1dac5f702ca0b9ea4d23.pdf:pdf},\n\tmendeley-groups = {Optimization/Saddle-Point,Optimization/[with Yuan Yang]}\n}\n@article{zhang2016convexified,\n\ttitle        = {Convexified convolutional neural networks},\n\tauthor       = {Zhang, Yuchen and Liang, Percy and Wainwright, Martin J},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1609.01000}\n}\n@article{zhang2016identifying,\n\ttitle        = {Identifying significant predictive bias in classifiers},\n\tauthor       = {Zhe Zhang and Daniel B Neill},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.08292}\n}\n@article{zhang2016l1,\n\ttitle        = {l1-regularized Neural Networks are Improperly Learnable in Polynomial Time},\n\tauthor       = {Zhang, Yuchen and {Jason D. 
Lee} and Jordan, Michael I},\n\tyear         = 2016,\n\tjournal      = {International Conference on Machine Learning (ICML)},\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {993--1001}\n}\n@article{zhang2016predicting,\n\ttitle        = {Predicting all-cause mortality from basic physiology in the {Framingham Heart Study}},\n\tauthor       = {William B Zhang and Zachary Pincus},\n\tyear         = 2016,\n\tjournal      = {Aging Cell},\n\tvolume       = 15,\n\tnumber       = 1,\n\tpages        = {39--48}\n}\n@article{zhang2016spectral,\n\ttitle        = {Spectral methods meet EM: A provably optimal algorithm for crowdsourcing},\n\tauthor       = {Yuchen Zhang and Xi Chen and Dengyong Zhou and Michael I Jordan},\n\tyear         = 2016,\n\tjournal      = {Journal of Machine Learning Research (JMLR)},\n\tvolume       = 17,\n\tnumber       = 102,\n\tpages        = {1--44}\n}\n@article{zhang2016understanding,\n\ttitle        = {Understanding deep learning requires rethinking generalization},\n\tauthor       = {Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.03530},\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{zhang2017aspect,\n\ttitle        = {Aspect-augmented Adversarial Networks for Domain Adaptation},\n\tauthor       = {Yuan Zhang and Regina Barzilay and Tommi Jaakkola},\n\tyear         = 2017,\n\tjournal      = {Transactions of the Association for Computational Linguistics (TACL)},\n\tvolume       = 5,\n\tpages        = {515--528}\n}\n@inproceedings{zhang2017convexified,\n\ttitle        = {Convexified Convolutional Neural Networks},\n\tauthor       = {Yuchen Zhang and Percy Liang and Martin J. 
Wainwright},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{zhang2017hitting,\n\ttitle        = {A hitting time analysis of stochastic gradient langevin dynamics},\n\tauthor       = {Zhang, Yuchen and Liang, Percy and Charikar, Moses},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1702.05575},\n\tbooktitle    = {Conference on Learning Theory (COLT)}\n}\n@article{zhang2017jnet,\n\ttitle        = {Exploring Question Understanding and Adaptation in Neural-Network-Based Question Answering},\n\tauthor       = {Junbei Zhang and Xiaodan Zhu and Qian Chen and Lirong Dai and Si Wei and Hui Jiang},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1703.04617}\n}\n@inproceedings{zhang2017learnability,\n\ttitle        = {On the Learnability of Fully-Connected Neural Networks},\n\tauthor       = {Zhang, Yuchen and Lee, Jason and Wainwright, Martin and Jordan, Michael},\n\tyear         = 2017,\n\tbooktitle    = {Artificial Intelligence and Statistics},\n\tpages        = {83--91}\n}\n@article{zhang2017listen,\n\ttitle        = {Listen, Interact and Talk: Learning to Speak via Interaction},\n\tauthor       = {Haichao Zhang and Haonan Yu and Wei Xu},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1705.09906}\n}\n@inproceedings{zhang2017macro,\n\ttitle        = {Macro Grammars and Holistic Triggering for Efficient Semantic Parsing},\n\tauthor       = {Yuchen Zhang and Panupong Pasupat and Percy Liang},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhang2017matching,\n\ttitle        = {Adversarial Feature Matching for Text Generation},\n\tauthor       = {Yizhe Zhang and Zhe Gan and Kai Fan and Zhi Chen and Ricardo Henao and Dinghan Shen and Lawrence Carin},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{zhang2017mixup,\n\ttitle        = 
{mixup: Beyond empirical risk minimization},\n\tauthor       = {Zhang, Hongyi and Cisse, Moustapha and Dauphin, Yann N and Lopez-Paz, David},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.09412}\n}\n@article{zhang2017stochastic,\n\ttitle        = {Stochastic Variance-reduced Gradient Descent for Low-rank Matrix Recovery from Linear Measurements},\n\tauthor       = {Zhang, Xiao and Wang, Lingxiao and Gu, Quanquan},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1701.00481}\n}\n@inproceedings{zhang2017tacred,\n\ttitle        = {Position-aware Attention and Supervised Data Improve Slot Filling},\n\tauthor       = {Yuhao Zhang and Victor Zhong and Danqi Chen and Gabor Angeli and Christopher D. Manning},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhang2017understanding,\n\ttitle        = {Understanding deep learning requires rethinking generalization},\n\tauthor       = {Chiyuan Zhang and Samy Bengio and Moritz Hardt and Benjamin Recht and Oriol Vinyals},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{zhang2018collaborative,\n\ttitle        = {Collaborative and adversarial network for unsupervised domain adaptation},\n\tauthor       = {Weichen Zhang and Wanli Ouyang and Wen Li and Dong Xu},\n\tyear         = 2018,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {3801--3809}\n}\n@inproceedings{zhang2018fast,\n\ttitle        = {Fast and Sample Efficient Inductive Matrix Completion via Multi-Phase Procrustes Flow},\n\tauthor       = {Zhang, Xiao and Du, Simon and Gu, Quanquan},\n\tyear         = 2018,\n\tmonth        = {10--15 Jul},\n\tbooktitle    = {Proceedings of the 35th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\tseries       = {Proceedings of Machine Learning Research},\n\tvolume       = 80,\n\tpages        = 
{5756--5765},\n\turl          = {http://proceedings.mlr.press/v80/zhang18b.html},\n\teditor       = {Dy, Jennifer and Krause, Andreas},\n\tpdf          = {http://proceedings.mlr.press/v80/zhang18b/zhang18b.pdf},\n\tabstract     = {We revisit the inductive matrix completion problem that aims to recover a rank-$r$ matrix with ambient dimension $d$ given $n$ features as the side prior information. The goal is to make use of the known $n$ features to reduce sample and computational complexities. We present and analyze a new gradient-based non-convex optimization algorithm that converges to the true underlying matrix at a linear rate with sample complexity only linearly depending on $n$ and logarithmically depending on $d$. To the best of our knowledge, all previous algorithms either have a quadratic dependency on the number of features in sample complexity or a sub-linear computational convergence rate. In addition, we provide experiments on both synthetic and real world data to demonstrate the effectiveness of our proposed algorithm.}\n}\n@inproceedings{zhang2018generalized,\n\ttitle        = {Generalized cross entropy loss for training deep neural networks with noisy labels},\n\tauthor       = {Zhilu Zhang and Mert R Sabuncu},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@article{zhang2018interactive,\n\ttitle        = {Interactive Language Acquisition with One-shot Visual Concept Learning through a Conversational Game},\n\tauthor       = {Haichao Zhang and Haonan Yu and Wei Xu},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1805.00462}\n}\n@article{zhang2018learning,\n\ttitle        = {Learning One-hidden-layer ReLU Networks via Gradient Descent},\n\tauthor       = {Zhang, Xiao and Yu, Yaodong and Wang, Lingxiao and Gu, Quanquan},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1806.07808}\n}\n@article{zhang2018personalizing,\n\ttitle        = {Personalizing Dialogue 
Agents: {I} have a dog, do you have pets too?},\n\tauthor       = {Saizheng Zhang and Emily Dinan and Jack Urbanek and Arthur Szlam and Douwe Kiela and Jason Weston},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1801.07243}\n}\n@article{zhang2018solar,\n\ttitle        = {SOLAR: Deep Structured Latent Representations for Model-Based Reinforcement Learning},\n\tauthor       = {M. Zhang and S. Vikram and L. Smith and P. Abbeel and M. J. Johnson and S. Levine},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1808.09105}\n}\n@article{zhang2018spectral,\n\ttitle        = {Spectral state compression of {Markov} processes},\n\tauthor       = {Zhang, Anru and Wang, Mengdi},\n\tyear         = 2018,\n\tjournal      = {arXiv:1802.02920}\n}\n@article{zhang2019bridging,\n\ttitle        = {Bridging theory and algorithm for domain adaptation},\n\tauthor       = {Zhang, Yuchen and Liu, Tianle and Long, Mingsheng and Jordan, Michael I},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1904.05801},\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {7404--7413}\n}\n@inproceedings{zhang2019discretization,\n\ttitle        = {Defending against Whitebox Adversarial Attacks via Randomized Discretization},\n\tauthor       = {Yuchen Zhang and Percy Liang},\n\tyear         = 2019,\n\tbooktitle    = {Artificial Intelligence and Statistics (AISTATS)}\n}\n@article{zhang2019molecular,\n\ttitle        = {Bayesian semi-supervised learning for uncertainty-calibrated prediction of molecular properties and active learning},\n\tauthor       = {Yao Zhang and Alpha A. 
Lee},\n\tyear         = 2019,\n\tjournal      = {CoRR},\n\tvolume       = {0}\n}\n@inproceedings{zhang2019paws,\n\ttitle        = {PAWS: Paraphrase Adversaries from Word Scrambling},\n\tauthor       = {Yuan Zhang and Jason Baldridge and Luheng He},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{zhang2019regret,\n\ttitle        = {Regret minimization for reinforcement learning by evaluating the optimal bias function},\n\tauthor       = {Zhang, Zihan and Ji, Xiangyang},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2823--2832}\n}\n@inproceedings{zhang2019selection,\n\ttitle        = {Selection Bias Explorations and Debias Methods for Natural Language Sentence Matching Datasets},\n\tauthor       = {Guanhua Zhang and Bing Bai and Jian Liang and Kun Bai and Shiyu Chang and Mo Yu and Conghui Zhu and Tiejun Zhao},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{zhang2019theoretically,\n\ttitle        = {Theoretically Principled Trade-off between Robustness and Accuracy},\n\tauthor       = {Zhang, Hongyang and Yu, Yaodong and Jiao, Jiantao and Xing, Eric and El Ghaoui, Laurent and Jordan, Michael},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {7472--7482}\n}\n@inproceedings{zhang2019whole,\n\ttitle        = {From whole slide imaging to microscopy: Deep microscopy adaptation network for histopathology cancer image classification},\n\tauthor       = {Yifan Zhang and Hanbo Chen and Ying Wei and Peilin Zhao and Jiezhang Cao and Xinjuan Fan and Xiaoying Lou and Hailing Liu and Jinlong Hou and Xiao Han and others},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Medical Image Computing and Computer-Assisted Intervention},\n\tpages        = 
{360--368}\n}\n@article{zhang2020adaptive,\n\ttitle        = {Adaptive Risk Minimization: A Meta-Learning Approach for Tackling Group Shift},\n\tauthor       = {Marvin Zhang and Henrik Marklund and Abhishek Gupta and Sergey Levine and Chelsea Finn},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2007.02931}\n}\n@article{zhang2020almost,\n\ttitle        = {Almost Optimal Model-Free Reinforcement Learning via Reference-Advantage Decomposition},\n\tauthor       = {Zhang, Zihan and Zhou, Yuan and Ji, Xiangyang},\n\tyear         = 2020,\n\tjournal      = {Advances in Neural Information Processing Systems},\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tvolume       = 33\n}\n@inproceedings{zhang2020approximation,\n\ttitle        = {Approximation capabilities of neural ODEs and invertible residual networks},\n\tauthor       = {Zhang, Han and Gao, Xi and Unterman, Jacob and Arodz, Tom},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {11086--11095},\n\torganization = {PMLR}\n}\n@article{zhang2020coping,\n\ttitle        = {Coping with Label Shift via Distributionally Robust Optimisation},\n\tauthor       = {Jingzhao Zhang and Aditya Menon and Andreas Veit and Srinadh Bhojanapalli and Sanjiv Kumar and Suvrit Sra},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.12230}\n}\n@inproceedings{zhang2020crown,\n\ttitle        = {Towards stable and efficient training of verifiably robust neural networks},\n\tauthor       = {Huan Zhang and Hongge Chen and Chaowei Xiao and Sven Gowal and Robert Stanforth and Bo Li and Duane Boning and Cho-Jui Hsieh},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{zhang2020learn,\n\ttitle        = {Learn to Effectively Explore in Context-Based Meta-{RL}},\n\tauthor       = {Jin Zhang and Jianhao Wang and Hao Hu and Yingfeng Chen and Changjie Fan and Chongjie 
Zhang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.08170}\n}\n@article{zhang2020model,\n\ttitle        = {Model-Free Reinforcement Learning: from Clipped Pseudo-Regret to Sample Complexity},\n\tauthor       = {Zhang, Zihan and Zhou, Yuan and Ji, Xiangyang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.03864}\n}\n@article{zhang2020nearly,\n\ttitle        = {Nearly Minimax Optimal Reward-free Reinforcement Learning},\n\tauthor       = {Zhang, Zihan and Du, Simon S and Ji, Xiangyang},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2010.05901}\n}\n@inproceedings{zhang2020overparameterized,\n\ttitle        = {Over-parameterized Adversarial Training: An Analysis Overcoming the Curse of Dimensionality},\n\tauthor       = {Zhang, Yi and Plevrakis, Orestis and Du, Simon S and Li, Xingguo and Song, Zhao and Arora, Sanjeev},\n\tyear         = 2020,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpublisher    = {Curran Associates, Inc.},\n\tvolume       = 33,\n\tpages        = {679--688},\n\turl          = {https://proceedings.neurips.cc/paper/2020/file/0740bb92e583cd2b88ec7c59f985cb41-Paper.pdf},\n\teditor       = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}\n}\n@article{zhang2020reinforcement,\n\ttitle        = {Is reinforcement learning more difficult than bandits? 
a near-optimal algorithm escaping the curse of horizon},\n\tauthor       = {Zhang, Zihan and Ji, Xiangyang and Du, Simon S},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.13503}\n}\n@inproceedings{zhang2021distribution,\n\ttitle        = {Distribution Alignment: A Unified Framework for Long-tail Visual Recognition},\n\tauthor       = {Zhang, Songyang and Li, Zeming and Yan, Shipeng and He, Xuming and Sun, Jian},\n\tyear         = 2021,\n\tbooktitle    = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},\n\tpages        = {2361--2370}\n}\n@article{zhang2021inductive,\n\ttitle        = {On the Inductive Bias of Masked Language Modeling: From Statistical to Syntactic Dependencies},\n\tauthor       = {Zhang, Tianyi and Hashimoto, Tatsunori},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.05694}\n}\n@inproceedings{zhang2021revisiting,\n\ttitle        = {Revisiting Few-sample {BERT} Fine-tuning},\n\tauthor       = {Tianyi Zhang and Felix Wu and Arzoo Katiyar and Kilian Q. 
Weinberger and Yoav Artzi},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{zhang2021semi,\n\ttitle        = {Semi-supervised Models are Strong Unsupervised Domain Adaptation Learners},\n\tauthor       = {Yabin Zhang and Haojian Zhang and Bin Deng and Shuai Li and Kui Jia and Lei Zhang},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2106.00417}\n}\n@article{zhang2021variance,\n\ttitle        = {Variance-Aware Confidence Set: Variance-Dependent Bound for Linear Bandits and Horizon-Free Bound for Linear Mixture MDP},\n\tauthor       = {Zhang, Zihan and Yang, Jiaqi and Ji, Xiangyang and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2101.12745}\n}\n@inproceedings{zhao2007experimental,\n\ttitle        = {\n\t\tExperimental Study of Virtual Machine Migration in Support of Reservation\n\n\t\tof Cluster Resources\n\t},\n\tauthor       = {Ming Zhao and Figueiredo, R.J.},\n\tyear         = 2007,\n\tbooktitle    = {\n\t\t2nd International Workshop on Virtualization Technology in Distributed\n\n\t\tComputing\n\t}\n}\n@article{zhao2011reinforcement,\n\ttitle        = {Reinforcement learning strategies for clinical trials in nonsmall cell lung cancer},\n\tauthor       = {Zhao, Yufan and Zeng, Donglin and Socinski, Mark A and Kosorok, Michael R},\n\tyear         = 2011,\n\tjournal      = {Biometrics},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 67,\n\tnumber       = 4,\n\tpages        = {1422--1433}\n}\n@inproceedings{zhao2015nonconvex,\n\ttitle        = {A Nonconvex Optimization Framework for Low Rank Matrix Estimation},\n\tauthor       = {Zhao, Tuo and Wang, Zhaoran and Liu, Han},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {559--567}\n}\n@inproceedings{zhao2015type,\n\ttitle        = {Type-Driven Incremental Semantic Parsing with Polymorphism},\n\tauthor       = {Kai Zhao and 
Liang Huang},\n\tyear         = 2015,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{zhao2016teen,\n\ttitle        = {Communication Between {A}sian {A}merican Adolescents and Health Care Providers About Sexual Activity, Sexually Transmitted Infections, and Pregnancy Prevention},\n\tauthor       = {Jessie Zhao and May Lau and David Vermette and David Liang and Glenn Flores},\n\tyear         = 2016,\n\tbooktitle    = {Journal of Adolescent Research}\n}\n@inproceedings{zhao2017gender,\n\ttitle        = {Men Also Like Shopping: Reducing Gender Bias Amplification using Corpus-level Constraints},\n\tauthor       = {Jieyu Zhao and Tianlu Wang and Mark Yatskar and Vicente Ordo{\~n}ez and Kai-Wei Chang},\n\tyear         = 2017,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhao2017learning,\n\ttitle        = {Learning Discourse-level Diversity for Neural Dialog Models using Conditional Variational Autoencoders},\n\tauthor       = {Tiancheng Zhao and Ran Zhao and Maxine Eskenazi},\n\tyear         = 2017,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@article{zhao2017multiple,\n\ttitle        = {Multiple Source Domain Adaptation with Adversarial Training of Neural Networks},\n\tauthor       = {Han Zhao and Shanghang Zhang and Guanhang Wu and Jo{\~a}o P. Costeira and Jos{\'e} M. F. Moura and Geoffrey J. 
Gordon},\n\tyear         = 2017,\n\tjournal      = {CoRR},\n\tvolume       = {0}\n}\n@inproceedings{zhao2018gender,\n\ttitle        = {Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods},\n\tauthor       = {Jieyu Zhao and Tianlu Wang and Mark Yatskar and Vicente Ordo{\\~n}ez and Kai-Wei Chang},\n\tyear         = 2018,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@article{zhao2018learning,\n\ttitle        = {Learning gender-neutral word embeddings},\n\tauthor       = {Jieyu Zhao and Yichao Zhou and Zeyu Li and Wei Wang and Kai-Wei Chang},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1809.01496}\n}\n@inproceedings{zhao2018regularized,\n\ttitle        = {Adversarially Regularized Autoencoders},\n\tauthor       = {Junbo Zhao and Yoon Kim and Kelly Zhang and Alexander M. Rush and Yann LeCun},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@inproceedings{zhao2018unsupervised,\n\ttitle        = {Unsupervised Discrete Sentence Representation Learning for Interpretable Neural Dialog Generation},\n\tauthor       = {Tiancheng Zhao and Kyusong Lee and Maxine Eskenazi},\n\tyear         = 2018,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{zhao2019inherent,\n\ttitle        = {Inherent Tradeoffs in Learning Fair Representations},\n\tauthor       = {H. 
Zhao and Geoff Gordon},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{zhao2019learning,\n\ttitle        = {On Learning Invariant Representations for Domain Adaptation},\n\tauthor       = {Zhao, Han and Combes, Remi Tachet Des and Zhang, Kun and Gordon, Geoffrey},\n\tyear         = 2019,\n\tmonth        = {09--15 Jun},\n\tbooktitle    = {Proceedings of the 36th International Conference on Machine Learning},\n\tpublisher    = {PMLR},\n\tpages        = {7523--7532},\n\turl          = {http://proceedings.mlr.press/v97/zhao19a.html}\n}\n@inproceedings{zhao2019rethinking,\n\ttitle        = {Rethinking Action Spaces for Reinforcement Learning in End-to-end Dialog Agents with Latent Variable Models},\n\tauthor       = {Tiancheng Zhao and Kaige Xie and M. Eskenazi},\n\tyear         = 2019,\n\tbooktitle    = {North American Association for Computational Linguistics (NAACL)}\n}\n@inproceedings{zhao2019zhao,\n\ttitle        = {On Learning Invariant Representation for Domain Adaptation},\n\tauthor       = {Han Zhao and Remi Tachet des Combes and Kun Zhang and Geoffrey J. Gordon},\n\tyear         = 2019,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{zhao2020individual,\n\ttitle        = {Individual Calibration with Randomized Forecasting},\n\tauthor       = {Zhao, Shengjia and Ma, Tengyu and Ermon, Stefano},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2006.10288}\n}\n@inproceedings{zhao2021calibrate,\n\ttitle        = {Calibrate Before Use: Improving Few-Shot Performance of Language Models},\n\tauthor       = {Tony Z. 
Zhao and Eric Wallace and Shi Feng and Dan Klein and  Sameer Singh},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Machine Learning (ICML)}\n}\n@article{zhao2021provably,\n\ttitle        = {Provably efficient policy gradient methods for two-player zero-sum Markov games},\n\tauthor       = {Zhao, Yulai and Tian, Yuandong and Lee, Jason D and Du, Simon S},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.08903}\n}\n@inproceedings{zhao2021right,\n\ttitle        = {Right decisions from wrong predictions: A mechanism design alternative to individual calibration},\n\tauthor       = {Zhao, Shengjia and Ermon, Stefano},\n\tyear         = 2021,\n\tbooktitle    = {International Conference on Artificial Intelligence and Statistics},\n\tpages        = {2683--2691},\n\torganization = {PMLR}\n}\n@inproceedings{zheng2007template,\n\ttitle        = {Template-independent news extraction based on visual consistency},\n\tauthor       = {Shuyi Zheng and Ruihua Song and Ji-Rong Wen},\n\tyear         = 2007,\n\tbooktitle    = {AAAI},\n\tvolume       = 7,\n\tpages        = {1507--1513}\n}\n@inproceedings{zheng2009efficient,\n\ttitle        = {Efficient record-level wrapper induction},\n\tauthor       = {Shuyi Zheng and Ruihua Song and Ji-Rong Wen and C Lee Giles},\n\tyear         = 2009,\n\tbooktitle    = {Proceedings of the 18th ACM conference on Information and knowledge management},\n\tpages        = {47--56}\n}\n@inproceedings{zheng2015convergent,\n\ttitle        = {A convergent gradient descent algorithm for rank minimization and semidefinite programming from random linear measurements},\n\tauthor       = {Zheng, Qinqing and Lafferty, John},\n\tyear         = 2015,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {109--117}\n}\n@article{zheng2016convergence,\n\ttitle        = {Convergence Analysis for Rectangular Matrix Completion Using Burer-Monteiro Factorization and Gradient Descent},\n\tauthor   
    = {Zheng, Qinqing and Lafferty, John},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1605.07051}\n}\n@inproceedings{zheng2016improving,\n\ttitle        = {Improving the robustness of deep neural networks via stability training},\n\tauthor       = {Stephan Zheng and Yang Song and Thomas Leung and Ian Goodfellow},\n\tyear         = 2016,\n\tbooktitle    = {Proceedings of the ieee conference on computer vision and pattern recognition},\n\tpages        = {4480--4488}\n}\n@book{zheng2018feature,\n\ttitle        = {Feature engineering for machine learning: principles and techniques for data scientists},\n\tauthor       = {Alice Zheng and Amanda Casari},\n\tyear         = 2018,\n\tpublisher    = {\" O'Reilly Media, Inc.\"}\n}\n@inproceedings{zheng2018learning,\n\ttitle        = {On Learning Intrinsic Rewards for Policy Gradient Methods},\n\tauthor       = {Zeyu Zheng and Junhyuk Oh and Satinder Singh},\n\tyear         = 2018,\n\tbooktitle    = {Advances in Neural Information Processing Systems (NeurIPS)}\n}\n@inproceedings{zhong2013accurate,\n\ttitle        = {Accurate Probability Calibration for Multiple Classifiers},\n\tauthor       = {Leon Wenliang Zhong and James Kwok},\n\tyear         = 2013,\n\tbooktitle    = {International Joint Conference on Artificial Intelligence (IJCAI)},\n\tpages        = {1939--1945}\n}\n@article{zhong2017learning,\n\ttitle        = {Learning Non-overlapping Convolutional Neural Networks with Multiple Kernels},\n\tauthor       = {Zhong, Kai and Song, Zhao and Dhillon, Inderjit S},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1711.03440}\n}\n@article{zhong2017recovery,\n\ttitle        = {Recovery Guarantees for One-hidden-layer Neural Networks},\n\tauthor       = {Zhong, Kai and Song, Zhao and Jain, Prateek and Bartlett, Peter L and Dhillon, Inderjit S},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1706.03175},\n\tbooktitle    = {International Conference on Machine Learning 
(ICML)},\n\tpublisher    = {arXiv preprint arXiv:1706.03175}\n}\n@article{zhong2017seq2sql,\n\ttitle        = {Seq2{SQL}: Generating Structured Queries from Natural Language using Reinforcement Learning},\n\tauthor       = {Victor Zhong and Caiming Xiong and Richard Socher},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1709.00103}\n}\n@article{zhong2019pac,\n\ttitle        = {{PAC} Reinforcement Learning without Real-World Feedback},\n\tauthor       = {Yuren Zhong and Aniket Anand Deshmukh and Clayton Scott},\n\tyear         = 2019,\n\tjournal      = {ArXiv},\n\tvolume       = {abs/1909.10449}\n}\n@inproceedings{zhong2020grounded,\n\ttitle        = {Grounded adaptation for zero-shot executable semantic parsing},\n\tauthor       = {Victor Zhong and Mike Lewis and Sida I. Wang and Luke Zettlemoyer},\n\tyear         = 2020,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@article{zhong2021factual,\n\ttitle        = {Factual Probing Is [MASK]: Learning vs. 
Learning to Recall},\n\tauthor       = {Zhong, Zexuan and Friedman, Dan and Chen, Danqi},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2104.05240}\n}\n@article{zhou2005efficient,\n\ttitle        = {Efficient algorithms for the smallest enclosing ball problem},\n\tauthor       = {Zhou, Guanglu and Toh, Kim-Chuan and Sun, Jie},\n\tyear         = 2005,\n\tjournal      = {Computational Optimization and Applications},\n\tpublisher    = {Springer},\n\tvolume       = 30,\n\tnumber       = 2,\n\tpages        = {147--160}\n}\n@inproceedings{zhou2014divide,\n\ttitle        = {Divide-and-Conquer Learning by Anchoring a Conical Hull},\n\tauthor       = {Zhou, Tianyi and Bilmes, Jeff A and Guestrin, Carlos},\n\tyear         = 2014,\n\tbooktitle    = {NIPS},\n\tpages        = {1242--1250}\n}\n@article{zhou2015predicting,\n\ttitle        = {Predicting effects of noncoding variants with deep learning--based sequence model},\n\tauthor       = {Jian Zhou and Olga G Troyanskaya},\n\tyear         = 2015,\n\tjournal      = {Nature methods},\n\tvolume       = 12,\n\tnumber       = 10,\n\tpages        = {931--934}\n}\n@article{zhou2015regularized,\n\ttitle        = {Regularized minimax conditional entropy for crowdsourcing},\n\tauthor       = {Dengyong Zhou and Qiang Liu and John C. Platt and Christopher Meek and Nihar B. 
Shah},\n\tyear         = 2015,\n\tjournal      = {arXiv}\n}\n@article{zhou2015simple,\n\ttitle        = {Simple baseline for visual question answering},\n\tauthor       = {Bolei Zhou and Yuandong Tian and Sainbayar Sukhbaatar and Arthur Szlam and Rob Fergus},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1512.02167}\n}\n@inproceedings{zhou2016amr,\n\ttitle        = {{AMR} Parsing with an Incremental Joint Model},\n\tauthor       = {Junsheng Zhou and Feiyu Xu and Hans Uszkoreit and Weiguang Qu and Ran Li and Yanhui Gu},\n\tyear         = 2016,\n\tbooktitle    = {Empirical Methods in Natural Language Processing (EMNLP)}\n}\n@inproceedings{zhou2016cams,\n\ttitle        = {Learning Deep Features for Discriminative Localization},\n\tauthor       = {Bolei Zhou and Aditya Khosla and Agata Lapedriza and Aude Oliva and Antonio Torralba},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)}\n}\n@article{zhou2016community,\n\ttitle        = {Learning semantic representation with neural networks for community question answering retrieval},\n\tauthor       = {Guangyou Zhou and Yin Zhou and Tingting He and Wensheng Wu},\n\tyear         = 2016,\n\tjournal      = {Knowledge-Based Systems},\n\tpages        = {75--83}\n}\n@inproceedings{zhou2016modeling,\n\ttitle        = {Modeling Adversarial Learning as Nested {S}tackelberg Games},\n\tauthor       = {Yan Zhou and Murat Kantarcioglu},\n\tyear         = 2016,\n\tbooktitle    = {Pacific-Asia Conference on Knowledge Discovery and Data Mining}\n}\n@article{zhou2017critical,\n\ttitle        = {Critical Points of Neural Networks: Analytical Forms and Landscape Properties},\n\tauthor       = {Zhou, Yi and Liang, Yingbin},\n\tyear         = 2017,\n\tjournal      = {arXiv preprint arXiv:1710.11205}\n}\n@article{zhou2017landscape,\n\ttitle        = {The Landscape of Deep Learning Algorithms},\n\tauthor       = {Zhou, Pan and Feng, Jiashi},\n\tyear         = 2017,\n\tjournal      = 
{arXiv preprint arXiv:1705.07038}\n}\n@article{zhou2017places,\n\ttitle        = {Places: A 10 million image database for scene recognition},\n\tauthor       = {Bolei Zhou and Agata Lapedriza and Aditya Khosla and Aude Oliva and Antonio Torralba},\n\tyear         = 2017,\n\tjournal      = {IEEE Transactions on Pattern Analysis and Machine Intelligence},\n\tvolume       = 40,\n\tnumber       = 6,\n\tpages        = {1452--1464}\n}\n@article{zhou2018critical,\n\ttitle        = {Critical points of linear neural networks: Analytical forms and landscape properties},\n\tauthor       = {Zhou, Yi and Liang, Yingbin},\n\tyear         = 2018\n}\n@inproceedings{zhou2018interpretable,\n\ttitle        = {Interpretable basis decomposition for visual explanation},\n\tauthor       = {Bolei Zhou and Yiyou Sun and David Bau and Antonio Torralba},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {119--134}\n}\n@inproceedings{zhou2019bbls,\n\ttitle        = {{BERT}-based Lexical Substitution},\n\tauthor       = {Wangchunshu Zhou and Tao Ge and Ke Xu and Furu Wei and Ming Zhou},\n\tyear         = 2019,\n\tbooktitle    = {Association for Computational Linguistics (ACL)}\n}\n@inproceedings{zhou2019effects,\n\ttitle        = {Effects of Influence on User Trust in Predictive Decision Making},\n\tauthor       = {Jianlong Zhou and Zhidong Li and Huaiwen Hu and Kun Yu and Fang Chen and Zelin Li and Yang Wang},\n\tyear         = 2019,\n\tbooktitle    = {Conference on Human Factors in Computing Systems (CHI)}\n}\n@article{zhou2019environment,\n\ttitle        = {Environment probing interaction policies},\n\tauthor       = {Wenxuan Zhou and Lerrel Pinto and Abhinav Gupta},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1907.11740}\n}\n@article{zhou2019watch,\n\ttitle        = {Watch, try, learn: Meta-learning from demonstrations and reward},\n\tauthor       = {Allan Zhou and Eric Jang and Daniel Kappler and Alex Herzog 
and Mohi Khansari and Paul Wohlhart and Yunfei Bai and Mrinal Kalakrishnan and Sergey Levine and Chelsea Finn},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1906.03352}\n}\n@article{zhou2020curse,\n\ttitle        = {The Curse of Performance Instability in Analysis Datasets: Consequences, Source, and Suggestions},\n\tauthor       = {Xiang Zhou and Yixin Nie and Hao Tan and Mohit Bansal},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2004.13606}\n}\n@inproceedings{zhou2020neural,\n\ttitle        = {Neural contextual bandits with UCB-based exploration},\n\tauthor       = {Zhou, Dongruo and Li, Lihong and Gu, Quanquan},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Machine Learning},\n\tpages        = {11492--11502},\n\torganization = {PMLR}\n}\n@article{zhou2020universality,\n\ttitle        = {Universality of deep convolutional neural networks},\n\tauthor       = {Zhou, Ding-Xuan},\n\tyear         = 2020,\n\tjournal      = {Applied and computational harmonic analysis},\n\tpublisher    = {Elsevier},\n\tvolume       = 48,\n\tnumber       = 2,\n\tpages        = {787--794}\n}\n@misc{zhou2021nearly,\n\ttitle        = {Nearly Minimax Optimal Reinforcement Learning for Linear Mixture Markov Decision Processes},\n\tauthor       = {Dongruo Zhou and Quanquan Gu and Csaba Szepesvari},\n\tyear         = 2021,\n\teprint       = {2012.08507},\n\tarchiveprefix = {arXiv},\n\tprimaryclass = {cs.LG}\n}\n@inproceedings{ZHPA13-contrast,\n\ttitle        = {Contrastive learning using spectral methods},\n\tauthor       = {James Zou and Daniel Hsu and David Parkes and Ryan P. Adams},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems 26},\n\turl          = {http://papers.nips.cc/paper/5007-contrastive-learning-using-spectral-methods}\n}\n@article{zhu06grammar,\n\ttitle        = {A Stochastic Grammar of Images},\n\tauthor       = {S. C. Zhu and D. 
Mumford},\n\tyear         = 2006,\n\tjournal      = {Foundations and Trends in Computer Graphics and Vision},\n\tvolume       = 2,\n\tpages        = {259--362}\n}\n@techreport{zhu2002learning,\n\ttitle        = {Learning from Labeled and Unlabeled Data with Label Propagation},\n\tauthor       = {Xiaojin Zhu and Zoubin Ghahramani},\n\tyear         = 2002,\n\tinstitution  = {CMU CALD}\n}\n@inproceedings{zhu2003semi,\n\ttitle        = {Semi-supervised learning using gaussian fields and harmonic functions},\n\tauthor       = {Zhu, Xiaojin and Ghahramani, Zoubin and Lafferty, John D},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the 20th International conference on Machine learning (ICML-03)},\n\tpages        = {912--919}\n}\n@inproceedings{zhu2005twod,\n\ttitle        = {2{D} conditional random fields for web information extraction},\n\tauthor       = {Jun Zhu and Zaiqing Nie and Ji-Rong Wen and Bo Zhang and Wei-Ying Ma},\n\tyear         = 2005,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {1044--1051}\n}\n@article{zhu2015moviebook,\n\ttitle        = {Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books},\n\tauthor       = {Yukun Zhu and Ryan Kiros and Richard Zemel and Ruslan Salakhutdinov and Raquel Urtasun and Antonio Torralba and Sanja Fidler},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1506.06724}\n}\n@inproceedings{zhu2016visual7w,\n\ttitle        = {Visual7{W}: Grounded question answering in images},\n\tauthor       = {Yuke Zhu and Oliver Groth and Michael Bernstein and Li Fei-Fei},\n\tyear         = 2016,\n\tbooktitle    = {Computer Vision and Pattern Recognition (CVPR)},\n\tpages        = {4995--5004}\n}\n@inproceedings{zhu2017cycle,\n\ttitle        = {Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks},\n\tauthor       = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and Efros, Alexei A},\n\tyear    
     = 2017,\n\tbooktitle    = {International Conference on Computer Vision (ICCV)}\n}\n@inproceedings{zhu2017target,\n\ttitle        = {Target-driven visual navigation in indoor scenes using deep reinforcement learning},\n\tauthor       = {Yuke Zhu and Roozbeh Mottaghi and Eric Kolve and Joseph J Lim and Abhinav Gupta and Li Fei-Fei and Ali Farhadi},\n\tyear         = 2017,\n\tbooktitle    = {International Conference on Robotics and Automation (ICRA)},\n\tpages        = {3357--3364}\n}\n@inproceedings{zhu2019anisotropic,\n\ttitle        = {The anisotropic noise in stochastic gradient descent: Its behavior of escaping from sharp minima and regularization effects},\n\tauthor       = {Zhu, Zhanxing and Wu, Jingfeng and Yu, Bing and Wu, Lei and Ma, Jinwen},\n\tyear         = 2019\n}\n@article{zhu2019text,\n\ttitle        = {Text Infilling},\n\tauthor       = {Wanrong Zhu and Zhiting Hu and Eric Xing},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1901.00158}\n}\n@inproceedings{zhu2020freelb,\n\ttitle        = {{F}ree{LB}: Enhanced Adversarial Training for Natural Language Understanding},\n\tauthor       = {Chen Zhu and Yu Cheng and Zhe Gan and Siqi Sun and Tom Goldstein and Jingjing Liu},\n\tyear         = 2020,\n\tbooktitle    = {International Conference on Learning Representations (ICLR)}\n}\n@article{zhu2020robosuite,\n\ttitle        = {Robosuite: A Modular Simulation Framework and Benchmark for Robot Learning},\n\tauthor       = {Yuke Zhu and Josiah Wong and Ajay Mandlekar and Roberto Martín-Martín},\n\tyear         = 2020,\n\tjournal      = {arXiv preprint arXiv:2009.12293}\n}\n@inproceedings{zickler2009efficient,\n\ttitle        = {Efficient physics-based planning: sampling search via non-deterministic tactics and skills},\n\tauthor       = {S. Zickler and M. 
Veloso},\n\tyear         = 2009,\n\tbooktitle    = {International Conference on Autonomous Agents and Multiagent Systems (AAMAS)},\n\tpages        = {27--33}\n}\n@inproceedings{ziebart2008maximum,\n\ttitle        = {Maximum Entropy Inverse Reinforcement Learning},\n\tauthor       = {Brian D. Ziebart and Andrew L. Maas and J. Andrew Bagnell and Anind K. Dey},\n\tyear         = 2008,\n\tbooktitle    = {Association for the Advancement of Artificial Intelligence (AAAI)}\n}\n@article{ziehe2004fast,\n\ttitle        = {A fast algorithm for joint diagonalization with non-orthogonal transformations and its application to blind source separation},\n\tauthor       = {Ziehe, A. and Laskov, P. and Nolte, G. and M{\\\"u}ller, K. R.},\n\tyear         = 2004,\n\tjournal      = {Journal of Machine Learning Research},\n\tvolume       = 5,\n\tpages        = {777--800}\n}\n@inproceedings{zimin2013online,\n\ttitle        = {Online learning in episodic Markovian decision processes by relative entropy policy search},\n\tauthor       = {Zimin, Alexander and Neu, Gergely},\n\tyear         = 2013,\n\tbooktitle    = {Advances in neural information processing systems},\n\tpages        = {1583--1591}\n}\n@article{zimmermann2021contrastive,\n\ttitle        = {Contrastive Learning Inverts the Data Generating Process},\n\tauthor       = {Zimmermann, Roland S and Sharma, Yash and Schneider, Steffen and Bethge, Matthias and Brendel, Wieland},\n\tyear         = 2021,\n\tjournal      = {arXiv preprint arXiv:2102.08850}\n}\n@inproceedings{zinkevich2003online,\n\ttitle        = {Online convex programming and generalized infinitesimal gradient ascent},\n\tauthor       = {Zinkevich, Martin},\n\tyear         = 2003,\n\tbooktitle    = {Proceedings of the 20th international conference on machine learning (icml-03)},\n\tseries       = {ICML 2003},\n\tpages        = {928--936}\n}\n@inproceedings{zinkevich2010parallelized,\n\ttitle        = {Parallelized stochastic gradient descent},\n\tauthor       = 
{Zinkevich, Martin and Weimer, Markus and Li, Lihong and Smola, Alex J},\n\tyear         = 2010,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2595--2603}\n}\n@article{zintgraf2019varibad,\n\ttitle        = {VariBAD: A Very Good Method for Bayes-Adaptive Deep {RL} via Meta-Learning},\n\tauthor       = {Luisa Zintgraf and Kyriacos Shiarlis and Maximilian Igl and Sebastian Schulze and Yarin Gal and Katja Hofmann and Shimon Whiteson},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1910.08348}\n}\n@article{zliobaite2015relation,\n\ttitle        = {On the relation between accuracy and fairness in binary classification},\n\tauthor       = {Indre Zliobaite},\n\tyear         = 2015,\n\tjournal      = {arXiv preprint arXiv:1505.05723}\n}\n@inproceedings{zlj16,\n\ttitle        = {$\\ell_1$-regularized neural networks are improperly learnable in polynomial time},\n\tauthor       = {Zhang, Yuchen and Lee, Jason D and Jordan, Michael I},\n\tyear         = 2016,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpages        = {993--1001}\n}\n@inproceedings{ZLM13,\n\ttitle        = {A Local Algorithm for Finding Well-Connected Clusters},\n\tauthor       = {Zeyuan Allen Zhu and Silvio Lattanzi and Vahab Mirrokni},\n\tyear         = 2013,\n\tbooktitle    = {ICML}\n}\n@inproceedings{zlsd18,\n\ttitle        = {Learning Long Term Dependencies via {F}ourier Recurrent Units},\n\tauthor       = {Zhang, Jiong and Lin, Yibo and Song, Zhao and Dhillon, Inderjit S},\n\tyear         = 2018,\n\tbooktitle    = {International Conference on Machine Learning (ICML)},\n\tpublisher    = {arXiv preprint arXiv:1803.06585}\n}\n@inproceedings{zobel1998reliable,\n\ttitle        = {How reliable are the results of large-scale information retrieval experiments?},\n\tauthor       = {Justin Zobel},\n\tyear         = 1998,\n\tbooktitle    = {ACM Special Interest Group on Information Retrieval 
(SIGIR)}\n}\n@article{zoph2016neural,\n\ttitle        = {Neural architecture search with reinforcement learning},\n\tauthor       = {Barret Zoph and Quoc V Le},\n\tyear         = 2016,\n\tjournal      = {arXiv preprint arXiv:1611.01578}\n}\n@article{zoph2020rethinking,\n\ttitle        = {Rethinking Pre-training and Self-training},\n\tauthor       = {Barret Zoph and Golnaz Ghiasi and Tsung-Yi Lin and Yin Cui and Hanxiao Liu and Ekin D. Cubuk and Quoc V. Le},\n\tyear         = 2020,\n\tjournal      = {arXiv}\n}\n@inproceedings{zordan2003mapping,\n\ttitle        = {\n\t\tMapping optical motion capture data to skeletal motion using a physical\n\n\t\tmodel\n\t},\n\tauthor       = {Zordan,, Victor Brian and Van Der Horst,, Nicholas C.},\n\tyear         = 2003,\n\tbooktitle    = {\n\t\tSCA '03: Proceedings of the 2003 ACM SIGGRAPH/Eurographics symposium\n\n\t\ton Computer animation\n\t},\n\tlocation     = {San Diego, California},\n\tpublisher    = {Eurographics Association},\n\taddress      = {Aire-la-Ville, Switzerland, Switzerland},\n\tpages        = {245--250},\n\tisbn         = {1-58113-659-5},\n\towner        = {leili},\n\ttimestamp    = {2011.07.28}\n}\n@article{zou2005regularization,\n\ttitle        = {Regularization and variable selection via the elastic net},\n\tauthor       = {Zou, Hui and Hastie, Trevor},\n\tyear         = 2005,\n\tjournal      = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},\n\tpublisher    = {Wiley Online Library},\n\tvolume       = 67,\n\tnumber       = 2,\n\tpages        = {301--320}\n}\n@inproceedings{zou2013contrastive,\n\ttitle        = {Contrastive Learning Using Spectral Methods},\n\tauthor       = {J. Y. Zou and D. Hsu and D. C. Parkes and R. P. 
Adams},\n\tyear         = 2013,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {2238--2246}\n}\n@article{zou2018stochastic,\n\ttitle        = {Stochastic Gradient Descent Optimizes Over-parameterized Deep {ReLU} Networks},\n\tauthor       = {Zou, Difan and Cao, Yuan and Zhou, Dongruo and Gu, Quanquan},\n\tyear         = 2018,\n\tjournal      = {arXiv preprint arXiv:1811.08888}\n}\n@inproceedings{zou2018unsupervised,\n\ttitle        = {Unsupervised domain adaptation for semantic segmentation via class-balanced self-training},\n\tauthor       = {Yang Zou and Zhiding Yu and BVK Vijaya Kumar and Jinsong Wang},\n\tyear         = 2018,\n\tbooktitle    = {European Conference on Computer Vision (ECCV)},\n\tpages        = {289--305}\n}\n@article{zou2019confidence,\n\ttitle        = {Confidence regularized self-training},\n\tauthor       = {Yang Zou and Zhiding Yu and Xiaofeng Liu and BVK Kumar and Jinsong Wang},\n\tyear         = 2019,\n\tjournal      = {arXiv preprint arXiv:1908.09822}\n}\n@inproceedings{zou2019finite,\n\ttitle        = {{Finite-sample analysis for SARSA with linear function approximation}},\n\tauthor       = {Zou, Shaofeng and Xu, Tengyu and Liang, Yingbin},\n\tyear         = 2019,\n\tbooktitle    = {Advances in Neural Information Processing Systems},\n\tpages        = {8668--8678}\n}\n@inproceedings{Zouzias2012,\n\ttitle        = {A Matrix Hyperbolic Cosine Algorithm and Applications},\n\tauthor       = {Zouzias, Anastasios},\n\tyear         = 2012,\n\tbooktitle    = {Proceedings of the 39th International Colloquium Conference on Automata, Languages, and Programming - Volume Part I},\n\tlocation     = {Warwick, UK},\n\tpublisher    = {Springer-Verlag},\n\taddress      = {Berlin, Heidelberg},\n\tseries       = {ICALP'12},\n\tpages        = {846--858},\n\tdoi          = {10.1007/978-3-642-31594-7_71},\n\tisbn         = {978-3-642-31593-0},\n\tnumpages     = 13,\n\tacmid        = 
2359454\n}\n@inproceedings{zukerman2002lexical,\n\ttitle        = {Lexical query paraphrasing for document retrieval},\n\tauthor       = {Ingrid Zukerman and Bhavani Raskutti},\n\tyear         = 2002,\n\tbooktitle    = {International Conference on Computational Linguistics (COLING)},\n\tpages        = {1--7}\n}\n@inproceedings{zurel2001efficient,\n\ttitle        = {An efficient approximate allocation algorithm for combinatorial auctions},\n\tauthor       = {Zurel, Edo and Nisan, Noam},\n\tyear         = 2001,\n\tbooktitle    = {Proceedings of the 3rd ACM conference on Electronic Commerce},\n\tpages        = {125--136},\n\torganization = {ACM}\n}\n@inproceedings{zz-sdca-sampling,\n\ttitle        = {Stochastic Optimization with Importance Sampling for Regularized Loss Minimization},\n\tauthor       = {Peilin Zhao and Tong Zhang},\n\tyear         = 2015,\n\tbooktitle    = {Proceedings of the 32nd International Conference on Machine Learning},\n\tvolume       = 37,\n\tpages        = {1--9}\n}\n@article{zz123,\n\ttitle        = {What Can ResNet Learn Efficiently, Going Beyond Kernels?},\n\tauthor       = {Zeyuan Allen{-}Zhu and Yuanzhi Li},\n\tyear         = 2019,\n\tjournal      = {CoRR},\n\tvolume       = {abs/1905.10337},\n\turl          = {http://arxiv.org/abs/1905.10337},\n\tarchiveprefix = {arXiv},\n\teprint       = {1905.10337},\n\ttimestamp    = {Wed, 29 May 2019 11:27:50 +0200},\n\tbiburl       = {https://dblp.org/rec/bib/journals/corr/abs-1905-10337},\n\tbibsource    = {dblp computer science bibliography, https://dblp.org}\n}\n"
  },
  {
    "path": "tex/bibliography.bib",
    "content": "@inproceedings{arora2005fast,\n  title={Fast algorithms for approximate semidefinite programming using the multiplicative weights update method},\n  author={Arora, Sanjeev and Hazan, Elad and Kale, Satyen},\n  booktitle={46th Annual IEEE Symposium on Foundations of Computer Science (FOCS'05)},\n  pages={339--348},\n  year={2005},\n  organization={IEEE}\n}\n\n@article{bartlett2017,\n    author =       \"Peter Bartlett and Dylan J. Foster and Matus Telgarsky\",\n    title =        \"Spectrally-normalized margin bounds for neural networks\",\n    journal =      \"NeurIPS\",\n    year =         \"2017\"\n}\n\n@ARTICLE{belkin2019,\nauthor    = {Belkin, M. and Hsu, D. and Ma, S. and Mandal, S.},\ntitle     = {Reconciling modern machine-learning practice and the classical bias-variance trade-off},\njournal   = {Proceedings of the National Academy of Sciences (PNAS)},\nvolume    = {116},\nnumber    = {32},\npages     = {15849--15854},\nyear      = {2019},\nnote      = {},\ndoi       = {10.1073/pnas.1903070116}\n}\n\n@inproceedings{blanc2020implicit,\n  title={Implicit regularization for deep neural networks driven by an ornstein-uhlenbeck like process},\n  author={Blanc, Guy and Gupta, Neha and Valiant, Gregory and Valiant, Paul},\n  booktitle={Conference on learning theory},\n  pages={483--513},\n  year={2020},\n  organization={PMLR}\n}\n\n@article{freund1997decision,\n\ttitle={A decision-theoretic generalization of on-line learning and an application to boosting},\n\tauthor={Freund, Yoav and Schapire, Robert E},\n\tjournal={Journal of computer and system sciences},\n\tvolume={55},\n\tnumber={1},\n\tpages={119--139},\n\tyear={1997},\n\tpublisher={Elsevier}\n}\n\n@InProceedings{ge2015,\ntitle = {Escaping From Saddle Points --- Online Stochastic Gradient for Tensor Decomposition},\nauthor = {Rong Ge and Furong Huang and Chi Jin and Yang Yuan},\nbooktitle = {Proceedings of The 28th Conference on Learning Theory},\npages = {797--842},\nyear = {2015},\neditor = 
{Peter Grünwald and Elad Hazan and Satyen Kale},\nvolume = {40},\nseries = {Proceedings of Machine Learning Research},\naddress = {Paris, France},\nmonth = {03--06 Jul},\npublisher = {PMLR}\n}\n\n@inproceedings{ge2016,\n author = {Ge, Rong and Lee, Jason D and Ma, Tengyu},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett},\n pages = {},\n publisher = {Curran Associates, Inc.},\n title = {Matrix Completion has No Spurious Local Minimum},\n url = {https://proceedings.neurips.cc/paper/2016/file/7fb8ceb3bd59c7956b1df66729296a4c-Paper.pdf},\n volume = {29},\n year = {2016}\n}\n\n@article{hillar2013,\nauthor = {Hillar, Christopher J. and Lim, Lek-Heng},\ntitle = {Most Tensor Problems Are NP-Hard},\nyear = {2013},\nissue_date = {November 2013},\npublisher = {Association for Computing Machinery},\nvolume = {60},\nnumber = {6},\njournal = {Journal of the ACM},\narticleno = {45},\nnumpages = {39}\n}\n\n\n@article{hornik1991,\n    author = \"Kurt Hornik\",\n    title = \"Approximation capabilities of multilayer feedforward networks\",\n    journal = \"Neural Networks\",\n    volume = \"4\",\n    pages = \"251-257\",\n    year = \"1991\"\n}\n\n@InProceedings{lee2016,\ntitle = {Gradient Descent Only Converges to Minimizers},\nauthor = {Jason D. Lee and Max Simchowitz and Michael I. 
Jordan and Benjamin Recht},\nbooktitle = {29th Annual Conference on Learning Theory},\npages = {1246--1257},\nyear = {2016},\neditor = {Vitaly Feldman and Alexander Rakhlin and Ohad Shamir},\nvolume = {49},\nseries = {Proceedings of Machine Learning Research},\naddress = {Columbia University, New York, New York, USA},\nmonth = {23--26 Jun},\npublisher = {PMLR}\n}\n\n@inproceedings{li2018algorithmic,\n  title={Algorithmic regularization in over-parameterized matrix sensing and neural networks with quadratic activations},\n  author={Li, Yuanzhi and Ma, Tengyu and Zhang, Hongyang},\n  booktitle={Conference On Learning Theory},\n  pages={2--47},\n  year={2018},\n  organization={PMLR}\n}\n\n@misc{haipengnotes,\n\tauthor        = {Haipeng Luo},\n\ttitle         = {Introduction to Online Learning},\n\turl = {https://haipeng-luo.net/courses/CSCI699/}, \n\tyear          = {2017},\n}\n\n@article{murty1987,\n    author = {Murty, Katta G. and Kabadi, Santosh N.},\n    title = {Some NP-complete problems in quadratic and nonlinear programming},\n    journal = {Mathematical Programming},\n    volume = {39},\n    pages = {117--129},\n    year = {1987}\n}\n\n@misc{percynotes,\n  author        = {Percy Liang},\n  title         = {CS229T/STAT231: Statistical Learning Theory (Winter 2016)},\n  month         = {April},\n  year          = {2016},\n}\n\n@book{rice2006mathematical,\n  added-at = {2015-07-23T16:14:13.000+0200},\n  author = {Rice, John A.},\n  biburl = {https://www.bibsonomy.org/bibtex/27c9b064b982dbbdcc47f235520c096d4/asalber},\n  edition = {Third},\n  interhash = {6f67cee500f65c6b5cf366a16a878dd3},\n  intrahash = {7c9b064b982dbbdcc47f235520c096d4},\n  keywords = {biostatistics teaching},\n  publisher = {Belmont, CA: Duxbury Press.},\n  timestamp = {2015-07-23T16:14:13.000+0200},\n  title = {Mathematical Statistics and Data Analysis.},\n  year = 2006\n}\n\n@article{srebro2010optimistic,\n  title={Optimistic rates for learning with a smooth loss},\n  author={Srebro, Nathan 
and Sridharan, Karthik and Tewari, Ambuj},\n  journal={arXiv preprint arXiv:1009.3896},\n  year={2010}\n}\n\n@misc{thomasliu2018,\n  author        = {Pengda Liu and Garrett Thomas},\n  title         = {CS229T/STAT231: Statistical Learning Theory (Fall 2018)},\n  month         = {October},\n  year          = {2018},\n}\n\n@misc{vanhandel2016high,\n  author        = {Ramon van Handel},\n  title         = {Probability in High Dimension: APC 550 Lecture Notes},\n  month         = {December},\n  year          = {2016},\n  publisher={Princeton University}\n}\n\n@misc{wei2020regularization,\n      title={Regularization Matters: Generalization and Optimization of Neural Nets v.s. their Induced Kernel}, \n      author={Colin Wei and Jason D. Lee and Qiang Liu and Tengyu Ma},\n      year={2020},\n      eprint={1810.05369},\n      archivePrefix={arXiv},\n      primaryClass={stat.ML}\n}\n@article{zhang2002,\n    author =       \"Tong Zhang\",\n    title =        \"Covering Number Bounds of Certain Regularized Linear Function Classes\",\n    journal =      \"Journal of Machine Learning Research\",\n    volume =       \"2\",\n    pages =        \"527-550\",\n    year =         \"2002\"\n}"
  },
  {
    "path": "tex/collection/01supervised.tex",
    "content": "% reset section counter\n\n\\setcounter{section}{0}\n\n\\metadata{1}{Anusri Pampari and Gabriel Poesia}{Jan 11th, 2021}\n\n\nIn this chapter, we will set up the standard theoretical formulation of supervised learning and introduce the \\textit{empirical risk minimization} (ERM) paradigm. The setup will apply to almost the entire monograph \\tnotelong{to be updated} and the ERM paradigm will be the main focus of Chapter~\\ref{chap:asymp}, \\ref{chap:conc}, and \\ref{chap:uc}. \n\n\\sec{Supervised learning}\\label{lec1:sec:sup-learn}\nIn supervised learning, we have a dataset where each data point is associated with a label, and we aim to learn from the data a function that maps data points to their labels. The learned  function can be used to infer the labels of test data points. More formally, suppose the data points, also called inputs,  belong to some input space $\\cX$ (e.g. images of birds), and labels belong to the output space $\\cY$ (e.g. bird species). Suppose we are interested in a specific joint probability distribution $P$ over $\\cX \\times \\cY$ (e.g. images of birds in North America), from which we draw a \\emph{training set}, i.e., we draw a set of $n$ independent and identically distributed (i.i.d.) data points $\\{(x^{(i)}, y^{(i)})\\}_{i=1}^n$ from $P$. The goal of supervised learning is to learn a mapping (i.e. a function) from $\\cX$ to $\\cY$ using the training data. Any such function $h : \\cX \\rightarrow \\cY$ is called a \\emph{predictor} (also \\emph{hypothesis} or \\emph{model}).\n\nGiven two predictors, how do we decide which is better? For that, we define a \\emph{loss function} over the predictions. There are several ways to define loss functions: for now, define a loss function $\\ell$ as a function $\\ell : \\cY \\times \\cY \\rightarrow \\R$. 
Intuitively, the loss function takes two labels, the prediction made by a model $\\hat{y}$ and the true label $y$, and gives a number that captures how different the two labels are. We assume $\\ell$ is non-negative, i.e. $\\ell(\\hat{y}, y) \\geq 0$. Then, the loss of a model $h$ on an example $(x, y)$ is $\\ell(h(x), y)$, i.e. the difference (as measured by $\\ell$) between the prediction made by $h$ and the true label.\n\n\nWith these definitions, we are able to formalize the problem of supervised learning. Precisely, we seek to find a model $h$ that minimizes what we call the expected loss (or population loss or expected risk or population risk):\n\\al{\n    L(h) \\defeq{} \\Exp_{(x, y)\\sim P} [\\ell(h(x), y)].\n}\n\n\nNote that $L$ is nonnegative because $\\ell$ is nonnegative. Typically, the loss function is designed so that the best possible loss is zero when $\\hat{y}$ matches $y$ exactly. Therefore, the goal is to find $h$ such that $L(h)$ is as close to zero as possible. % The best possible $h$ would have , we would find an $h$ with expected loss $0$, since that's the best possible we can do.\n\n\\paragraph{Examples: regression and classification problems.}\n\nHere are two standard types of supervised learning problems based on the properties of the output space:\n\n\\begin{itemize}\n    \\item In the problem of \\emph{regression}, predictions are real numbers ($\\cY = \\R$). We would like predictions to be as close as possible to the real labels. A classical loss function that captures this is the squared error, $\\ell(\\hat{y}, y) = (\\hat{y} - y)^2$.\n    \\item In the problem of \\emph{classification}, predictions are in a discrete set of $k$ unordered classes $\\cY = [k] = \\{1, \\cdots, k \\}$. One possible classification loss is the $0-1$ loss: $\\ell(\\hat{y}, y) = \\mathbbm{1}(\\hat{y} \\neq y)$, i.e. 
$0$ if the prediction is equal to the true label, and $1$ otherwise.\n\\end{itemize}\n\n\\paragraph{Hypothesis class.}\n\nSo far, we said we would like to find \\emph{any function} that minimizes population risk. However, in practice, we do not have a way of optimizing over arbitrary functions. Instead, we work within a more constrained set of functions $\\cH$, which we call the \\emph{hypothesis family} (or \\emph{hypothesis class}). Each element of $\\cH$ is a function $h : \\cX \\rightarrow \\cY$. Usually, we choose a set $\\cH$ that we know how to optimize over (e.g. linear functions, or neural networks).\n\nGiven one particular function $h \\in \\cH$, we define the \\emph{excess risk} of $h$ with respect to $\\cH$ as the difference between the population risk of $h$ and the best possible population risk inside $\\cH$:\n\n$$E(h) \\defeq{} L(h) - \\inf_{g\\in\\cH} L(g).$$\n\nGenerally we need more assumptions about a specific problem and hypothesis class to bound absolute population risk, hence we focus on bounding the excess risk.\n\nUsually, the family we choose to work with can be parameterized by a vector of parameters $\\theta \\in \\Theta$. In that case, we can refer to an element of $\\cH$ by $h_\\theta$, making that explicit. An example of such a parametrization of the hypothesis class is $\\cH = \\{ h: h_\\theta(x) = \\theta^\\top x, \\theta \\in \\mathbb{R}^d \\}$.\n\n\\sec{Empirical risk minimization}\n\nOur ultimate goal is to minimize population risk. However, in practice we do not have access to the entire population: we only have a \\emph{training set} of $n$ data points, drawn from the same distribution as the entire population. While we cannot compute population risk, we can compute \\emph{empirical risk}, the loss over the training set, and try to minimize that. 
This is, in short, the paradigm known as \\emph{empirical risk minimization} (ERM): we optimize the training set loss, with the hope that this leads us to a model that has low\npopulation loss. From now on, with some abuse of notation, we often write $\\ell(h_\\theta(x),y)$ as $\\ell((x,y),\\theta)$ and use the two notations interchangeably.  Formally, we define the empirical risk of a model $h$ as:\n\\al{\n\\hatL(h_\\theta) \\defeq{} \\frac{1}{n} \\sum_{i=1}^n \\ell(h_\\theta(x^{(i)}), y^{(i)}) = \\frac{1}{n} \\sum_{i=1}^n \\ell((x^{(i)}, y^{(i)}), \\theta).\n}\n\\emph{Empirical risk minimization} is the method of finding the minimizer of $\\hatL$, which we call $\\hat{\\theta}$:\n\\al{\n    \\label{lec1:eqn:erm}\n    \\hat{\\theta} \\defeq{} \\argmin_{\\theta\\in\\Theta} \\hatL(h_\\theta).\n}\nSince we are assuming that our training examples are drawn from the same distribution as the whole population, we know that empirical risk and population risk are equal\n\\emph{in expectation} (over the randomness of the training dataset):\n\\al{\n    \\Exp_{(x^{(i)}, y^{(i)}) \\iid P}\\ \\hatL(h_\\theta) &= \\Exp_{(x^{(i)}, y^{(i)}) \\iid P} \\frac{1}{n} \\sum_{i=1}^n \\ell(h_\\theta(x^{(i)}), y^{(i)}) \\\\\n    &= \\frac{1}{n} \\sum_{i=1}^n \\Exp_{(x^{(i)}, y^{(i)}) \\iid P} \\ell(h_\\theta(x^{(i)}), y^{(i)}) \\\\\n    &= \\frac{1}{n} \\cdot{} n \\cdot{} \\Exp_{(x^{(i)}, y^{(i)}) \\iid P} \\ell(h_\\theta(x^{(i)}), y^{(i)}) \\\\\n    &= L(h_\\theta).\n}\n\n\nThis is one reason why it makes sense to use empirical risk: it is an unbiased estimator of the population risk.\n\nThe key question that we seek to answer in the first part of this course is: \\textbf{what guarantees do we have on the excess risk for the parameters learned by ERM?} The hope with ERM is that minimizing the training error will lead to small testing error. One way to make this rigorous is by showing that the ERM minimizer's excess risk is bounded.\n"
  },
  {
    "path": "tex/collection/02-01-2021.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{6}{Daniel Do}{February 1st, 2021}\n\nIn this chapter, we will instantiate Rademacher complexity for two important hypothesis classes: linear models and two-layer neural networks. In the process, we will develop margin theory and use it to bound the generalization gap for binary classifiers.\n\n\\sec{Margin theory for classification problems}\n\n\\subsec{Intuition}\nAssume that we are in the same setting as in the previous section. A fundamental problem we face in this setting is that we do not have a continuous loss: everything is discrete in the output space. We need to find a way to reason about the scale of the output. An example of this is logistic regression: the logistic regression model outputs a probability, and while we compare it to the outcome (0 or 1), how close it is to the true output gives us a measure of how confident we are in the prediction.\n\nFigure \\ref{lec6:fig:margin} gives similar intuition for linear classifiers. Intuitively, the black line is a \"better\" decision boundary than the red line because the minimum distance from any point to the black boundary is greater than the minimum distance from any point to the red line. In the next section, we will formalize this intuition by proving that the larger this margin is, the smaller the bound on generalization gap is.\n\n\\begin{figure}[ht!]\n    \\begin{center}\n  \\includegraphics[width=0.5\\textwidth]{figures/margin.png}\n  \\end{center}\n  \\caption{The red and black lines are two decision boundaries. The X's are positive examples and the O's are negative examples. The black line has a larger margin than the red line, and is intuitively a better classifier.}\n  \\label{lec6:fig:margin}\n\\end{figure}\n\n\\subsec{Formalizing margin theory}\nFirst, assume that the dataset $\\cD = ((x\\sp{1}, y\\sp{1}), \\dots, (x\\sp{n}, y\\sp{n}))$ is \\textit{completely separable}. 
In other words, there exists some $h_\\theta\\in\\cH$ such that $y^{(i)} = \\sgn(h_\\theta(x^{(i)}))$ holds for all $( x^{(i)},y^{(i)})\\in \\cD$. This is not a necessary condition for our final bound but will make the derivation cleaner.\n\n\\begin{definition}[(Unnormalized) Margin]\nFix the hypothesis $h_\\theta$. The \\textit{(unnormalized) margin} for example $(x, y)$ is defined as $\\margin(x) = yh_\\theta(x)$. Margin is only defined on examples where $\\sgn(h_\\theta(x)) = y$. (Note that $\\margin(x)\\geq 0$ because of our assumption of complete separability.)\n\\end{definition}\n\n\\begin{definition}[Minimum margin] Given a dataset $\\cD = ((x\\sp{1}, y\\sp{1}), \\dots, (x\\sp{n}, y\\sp{n}))$, the \\textit{minimum margin} over the dataset is defined as $\\gamma_{\\min} \\triangleq \\min_{i\\in\\{1,\\dots,|\\cD|\\}} y^{(i)}h_\\theta(x^{(i)})$.\n\\end{definition}\n\nOur final bound will have the form (generalization gap)$\\leq f(\\text{margin},\\text{parameter norm})$. This is very generic since there are many different bounds we could derive based on what margin we use. For this current setting we are using $\\gamma_{\\min}$, which is the minimum margin, but in other settings could use $\\gamma_{\\text{average}}$, which is the average margin of each point in the dataset.\n\nWe will begin by introducing the idea of a \\textit{surrogate loss}, a loss function which approximates zero-one loss but takes the scale of the margin into account. 
The \\textit{margin loss} (also known as \\textit{ramp loss}) is defined as \n\\begin{equation}\n    \\ell_\\gamma(t) = \\begin{cases} \n      0 & t\\geq \\gamma \\\\\n      1 & t\\leq 0 \\\\\n      1-t/\\gamma & 0\\leq t\\leq \\gamma\n   \\end{cases}\n\\end{equation}\n\n\\begin{figure}[ht!]\n    \\begin{center}\n  \\includegraphics[width=0.5\\textwidth]{figures/margin_loss.png}\n  \\end{center}\n  \\caption{Plotted margin loss.}\n  \\label{lec6:fig:marginloss}\n\\end{figure}\n\nIt is plotted in Figure \\ref{lec6:fig:marginloss}. For convenience, define $\\ell_\\gamma((x,y), h) \\triangleq \\ell_\\gamma(yh(x))$. We can view $\\ell_\\gamma$ as a continuous version of $\\err$ while being more sensitive to the scale of the margin on $[0,\\gamma]$. Notice that $\\err$ is always less than or equal to the $\\ell_\\gamma$ when $\\gamma\\geq 0$, i.e.\n\\begin{equation}\n    \\err((x,y), h)\\leq \\ell_\\gamma ((x,y), h)\n\\end{equation}\nholds for all $(x,y)\\sim P$. Taking the expectation over $(x,y)$ on both sides of this inequality, we see that\n\\begin{equation}\n    L(h) = \\Exp_{(x,y)\\sim P} \\left[ \\err((x,y), h) \\right] \\leq \\Exp_{(x,y)\\sim P} \\left[ \\ell_\\gamma ((x,y), h) \\right].\n\\end{equation}\n\nTherefore, the population loss is bounded by the expectation of the margin loss, and so it is sufficient to bound the expectation of the margin loss in order to bound the population loss.\n\nDefine the population and empirical version of the margin loss:\n\\begin{equation}\nL_\\gamma(h) = \\Exp_{(x,y)\\sim P}\\l[ \\ell_\\gamma((x,y), h)\\r], \\quad \\hat{L}_\\gamma(h) = \\sum_{i=1}^n\\l [\\ell_\\gamma((x^{(i)},y^{(i)}), h)\\r].\n\\end{equation}\n\nBy Corollary \\ref{lec6:cor:ggap-rsbound}, we see that with probability at least $1-\\delta$ that\n\\begin{equation}\nL_\\gamma(h) - \\hat{L}_\\gamma(h)\\leq 2R_S(\\cF) + 3\\sqrt{\\frac{\\log (2/\\delta)}{2n}},\n\\end{equation}\nwhere $\\cF = \\{(x,y)\\mapsto \\ell_\\gamma((x,y), h)\\mid h\\in\\cH\\}$. 
Note that if we set $\\gamma\\leq \\gamma_{\\min}$, then $\\hat{L}_{\\gamma}(h) = 0$. This follows because by definition of $\\gamma_{\\min}$, $y^{(i)}h(x^{(i)})\\geq \\gamma_{\\min}$ for any $(x^{(i)}, y^{(i)})\\in \\cD$. As a result, $\\ell_\\gamma((x^{(i)}, y^{(i)}), h) = \\ell_\\gamma(y^{(i)}h(x^{(i)})) = 0$ holds. Therefore, it suffices to bound $R_S(\\cF)$.\n\nWe will now use \\textit{Talagrand's lemma} to bound $R_S(\\cF)$ in terms of $R_S(\\cH)$ to remove any dependence on the loss function from the upper bound. \n \n\\begin{lemma}{(Talagrand's lemma)}\nLet $\\phi:\\R\\to\\R$ be a $\\kappa$-Lipschitz function. Then \\begin{equation}\n    R_S(\\phi\\circ \\cH)\\leq \\kappa R_S(\\cH),\n\\end{equation} \nwhere $\\phi\\circ\\cH = \\{z\\mapsto \\phi(h(z))\\mid h\\in\\cH\\}$.\n\\end{lemma}\n\nWe can use Talagrand's lemma directly with $\\phi(t) = \\ell_\\gamma(t)$, which is $\\frac{1}{\\gamma}$-Lipschitz. We can express $\\cF$ as $\\cF=\\ell_\\gamma\\circ\\cH'$ where $\\cH' = \\{(x,y)\\to yh(x)\\mid h\\in\\cH\\}$. 
Applying Talagrand's lemma, we see that\n\n\\begin{align}\nR_S(\\cF) &\\leq \\frac{1}{\\gamma}R_S(\\cH') \\\\\n&= \\frac{1}{\\gamma}\\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{h\\in \\cH} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i y^{(i)}h(x^{(i)}) \\r] \\\\\n&= \\frac{1}{\\gamma}\\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{h\\in \\cH} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i h(x^{(i)})  \\r] \\\\\n&= \\frac{1}{\\gamma}R_S(\\cH).\n\\end{align}\n\nPutting this all together, we have shown that for $\\gamma \\leq \\gamma_{\\min}$,\n\\begin{align}\n\\Err(h) \\leq L_\\gamma(h) &\\leq 0 + O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) + \\tilO \\left( \\sqrt{\\frac{\\log (2 / \\delta)}{2n}} \\right) \\\\\n&= O \\left( \\frac{R_S(\\cH)}{\\min_i y\\sp{i} h(x\\sp{i}) } \\right) + \\tilO \\left( \\sqrt{\\frac{\\log (2 / \\delta)}{2n}} \\right).\n\\end{align}\n\nIn other words, for training data of the form $S = \\{(x\\sp{i},y\\sp{i})\\}_{i=1}^n \\subset \\mathbb{R}^d \\times \\{-1,1\\}$, a hypothesis class~$\\mathcal{H}$ and 0-1 loss, we can derive a bound of the form\n\\begin{equation}\\label{lec7:eqn:generalization_loss}\n    \\text{generalization loss} \\leq \\frac{2R_S(\\mathcal{H})}{\\gamma_{\\mathrm{min}}} + \\text{low-order term},\n\\end{equation}\nwhere $\\gamma_\\mathrm{min}$ is the minimum margin achievable on~$S$ over those hypotheses in $\\cH$ that separate the data, and $R_S(\\cH)$ is the empirical Rademacher complexity of $\\cH$. Such bounds state that simpler models will generalize better beyond the training data, particularly for data that is strongly separable.\n\n\\begin{remark}\nNote there is a subtlety here. If we think of the dataset as random, it follows that $\\gamma_{\\min}$ is a random variable. Consequently, the $\\gamma$ we choose to define the hypothesis class is random, which is not a valid choice when thinking about Rademacher complexity! Technically we cannot apply Talagrand's lemma with a random $\\kappa$ (which we took to be $1/\\gamma$). 
Also, when we used concentration inequalities, we implicitly assume that the $\\ell_\\gamma((x\\sp{i}, y\\sp{i}), h)$ are independent of each other. That is not the case if $\\gamma$ is dependent on the data.\n\nHow can we address this? The idea is to do another union bound over $\\gamma$. Choose a family $\\Gamma = \\left\\{ 2^k: k \\in [-B, B] \\right\\}$ for some $B$. For every fixed $\\gamma \\in \\Gamma$, we prove the theorem that\n\\begin{align}\n\\Err(h) \\leq \\hatL_\\gamma (h) + O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) + \\tilO \\left( \\frac{1}{\\sqrt{n}} \\right).\n\\end{align}\n\\end{remark}\n\nWe can then take a union bound over all $\\gamma \\in \\Gamma$. Next, choose the largest $\\gamma \\in \\Gamma$ such that $\\gamma \\leq \\gamma_{\\min}$. For this value of $\\gamma$ we have $\\hatL_\\gamma (h) = 0$ and $O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) = O \\left( \\frac{R_S(\\cH)}{\\gamma_{\\min}} \\right)$. \\tnotelong{make this part more formal. }"
  },
  {
    "path": "tex/collection/02asymptotics.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{2}{Alexander Ke and Trenton Chang}{Jan 13th, 2021}\n\nIn this chapter, we use an asymptotic approach (i.e. assuming number of training samples $n \\to \\infty$) to achieve a bound on the ERM. We then instantiate these results to the case where the loss function is the maximum likelihood  and discuss the limitations of asymptotics. (In future chapters we will assume finite $n$ and provide a non-asymptotic analysis.)\n\n\\sec{Asymptotics of empirical risk minimization}\n\nFor the asymptotic analysis of ERM, we would like to prove that excess risk is bounded as shown below:\n\\al{\n    L(\\hat{\\theta}) - \\inf_{\\theta \\in \\Theta} L(\\theta) \\leq \\frac{c}{n} + o\\left(\\frac{1}{n}\\right). \n    \\label{lec1:eqn:erm-bound}\n}\nHere $c$ is a problem dependent constant that does not depend on $n$, and $o(1/n)$ hides all dependencies except $n$. The equation above shows that as we have more training data (i.e. as $n$ increases) the excess risk of ERM decreases at the rate of $\\frac{1}{n}$.\n\nLet $\\{(x^{(1)},y^{(1)}), \\cdots, (x^{(n)},y^{(n)})\\}$ be the training data and let $\\cH = \\{ h_\\theta: \\theta \\in \\R^p \\}$ be the parameterized family of hypothesis functions. Let the ERM minimizer be $\\hat{\\theta}$ as defined in Equation~\\eqref{lec1:eqn:erm}. Let $\\theta^{*}$ be the minimizer\nof the population risk $L$, i.e. $\\theta^{*} = \\argmin_\\theta L(\\theta)$. The theorem below quantifies the excess risk $L(\\hat{\\theta}) - L(\\theta^{*})$:\n\n\\begin{theorem}[Informally stated]\nSuppose that (a) $\\hat{\\theta}  \\overset{p}{\\to} \\theta^{*}$ as $n \\to \\infty$ (i.e. 
consistency of $\\hat{\\theta}$), (b) $\\nabla^{2}L(\\theta^{*})$ is full rank, and  (c) other appropriate regularity conditions hold.\\footnote{$X_n \\overset{p}{\\to} X$ implies that for all $\\epsilon > 0$, $\\bbP \\left (\\norm{X_n - X} > \\epsilon \\right ) \\to 0$, while $X_n \\overset{d}{\\to} X$ implies that $\\bbP(X_n \\leq t) \\to \\bbP(X \\leq t)$ at all points $t$ for which $\\bbP(X \\leq t)$ is continuous. These two notions of convergence are known as convergence in probability and convergence in distribution, respectively. These concepts are not essential to this course, but additional information can be found by reading the Wikipedia \\href{https://en.wikipedia.org/wiki/Convergence_of_random_variables}{article} on convergence of random variables.} \nThen,\n\\begin{enumerate}\n    \\item $\\sqrt{n} (\\hat{\\theta} - \\theta^{*}) = O_P(1)$, i.e. for every $\\epsilon > 0$, there is an $M$ such that $\\sup_n \\bbP (\\| \\sqrt{n} (\\hat{\\theta} - \\theta^{*}) \\|_2 > M) < \\epsilon$. 
(This means that the sequence $\\{ \\sqrt{n} (\\hat{\\theta} - \\theta^{*}) \\}$ is ``bounded in probability\".)\n    \n    \\item  $\\sqrt{n}(\\hat{\\theta}-\\theta^{*}) \\overset{d}{\\to} \\mathcal{N} \\left(0, (\\nabla^{2}L(\\theta^{*}))^{-1}\\Cov(\\nabla \\ell((x,y), \\theta^*)) (\\nabla^{2}L(\\theta^{*}))^{-1} \\right)$.\n     \\item $n (L(\\hat{\\theta}) - L(\\theta^{*})) = O_P(1)$.\n    \\item $n (L(\\hat{\\theta}) - L(\\theta^{*})) \\overset{d}{\\to} \\frac{1}{2} ||S||_{2}^{2}$ where $S \\sim \\mathcal{N} \\left(0, (\\nabla^{2}L(\\theta^{*}))^{-1/2}\\Cov(\\nabla \\ell((x,y), \\theta^*)) (\\nabla^{2}L(\\theta^{*}))^{-1/2} \\right)$.\n    \\item $\\lim_{n \\to \\infty} \\Exp \\left[ n (L(\\hat \\theta) - L(\\theta^*)) \\right] = \\frac12 \\tr\\left( \\nabla^2 L(\\theta^*)^{-1} \\Cov(\\nabla \\ell ((x, y), \\theta^*) \\right)$.\n\\end{enumerate}\n\\label{lec1:thm:asymp}\n\\end{theorem}\n\\textbf{Remark:} In the theorem above, Parts 1 and 3 only show the rate or order of convergence, while Parts 2 and 4 define the limiting distribution for the random variables.\n\nTheorem \\ref{lec1:thm:asymp} is a powerful conclusion because once we know that $\\sqrt{n}(\\hat \\theta  - \\theta^*)$ is (asymptotically) Gaussian, we can easily work out the distribution of the excess risk. If we believe in our assumptions and $n$ is large enough such that we can assume $n \\to \\infty$, this allows us to analytically determine quantities of interest in almost any scenario (for example, if our test distribution changes). The key takeaway is that our parameter error $\\hat{\\theta} - \\theta^*$ decreases in order $1/\\sqrt{n}$ and the excess risk decreases in order $1/n$. While we will not discuss the regularity assumptions in Theorem~\\ref{lec1:thm:asymp} in great detail, we note that the assumption that $L$ is twice differentiable is crucial. 
\n\n\\subsec{Key ideas of proofs} \n\nWe will prove the theorem above by applying the following main ideas:\n\\begin{enumerate}\n    \\item Obtain an expression for the excess risk by Taylor expansion of the derivative of the empirical risk $\\nabla \\hatL(\\theta)$ around $\\theta^{*}$.\n    \\item By the law of large numbers, we have that $\\hatL(\\theta) \\overset{p}{\\to} L(\\theta)$, $\\nabla\\hatL(\\theta) \\overset{p}{\\to} \\nabla L(\\theta)$   and  $\\nabla^{2}\\hatL(\\theta) \\overset{p}{\\to} \\nabla^{2} L(\\theta)$ as $n \\to \\infty$.\n    \n    \\item Central limit theorem (CLT).\n\\label{ideas}\n\\end{enumerate}\n \nFirst, we state the CLT for i.i.d. means and a lemma that we will use in the proof.\n\n\\begin{theorem}[Central Limit Theorem] \\label{lec1:thm:CLT}\nLet $X_1, \\cdots, X_n$, be i.i.d. random variables, where $\\widehat{X}=\\frac{1}{n} \\sum_{i=1}^{n} X_i$ and the covariance matrix $\\Sigma$ is finite. Then, as $n \\to \\infty$ we have\n\\begin{enumerate}\n    \\item $\\widehat{X} \\overset{p}{\\to} \\Exp[X]$, and\n    \\item $\\sqrt{n} (\\widehat{X}-\\Exp[X]) \\overset{d}{\\to} \\mathcal{N}(0,\\Sigma)$. In particular, $\\sqrt{n} (\\widehat{X}-\\Exp[X]) = O_P(1)$.\n\\end{enumerate}\n\\end{theorem}\n\n\\begin{lemma}\\label{lec1:lem:dist}\n\\quad\\quad\n    \\begin{enumerate}\n        \\item If $Z \\sim N(0, \\Sigma)$ and $A$ is a deterministic matrix, then $AZ \\sim N(0, A \\Sigma A^\\top)$.\n        \n        \\item If $Z \\sim N(0, \\Sigma^{-1})$ and $Z \\in \\bbR^p$, then $Z^\\top \\Sigma Z \\sim \\chi^2(p)$, where $\\sim \\chi^2(p)$ is the chi-squared distribution with $p$ degrees of freedom.\n    \\end{enumerate}\n\\end{lemma}\n\n\\subsec{Main proof}\n\nLet us start with heuristic arguments for Parts 1 and 2. First, note that by definition, the gradient of the empirical risk at the empirical risk minimizer, $\\nabla \\hatL(\\hat{\\theta})$, is equal to $0$. 
From the Taylor expansion of $\\nabla \\hatL$ around $\\theta^*$, we have that \n\\begin{align}\n    0 = \\nabla \\hatL(\\hat{\\theta}) = \\nabla \\hatL(\\theta^*) + \\nabla^2 \\hatL(\\theta^*)(\\hat{\\theta} - \\theta^*) + O(\\|\\hat{\\theta} - \\theta^*\\|^2_2).\n\\end{align}\n\nRearranging, we have\n\n\\al{\n \\hat{\\theta}-\\theta^{*} = -(\\nabla^{2}\\hatL(\\theta^{*}))^{-1} \\nabla \\hatL(\\theta^{*}) + O(||\\hat{\\theta}-\\theta^{*}||_{2}^{2}). \\label{lec1:eqn:branch}} \n\nMultiplying by $\\sqrt{n}$ on both sides,\n \\al{\n\\sqrt{n} (\\hat{\\theta}-\\theta^{*}) &= -(\\nabla^{2}\\hatL(\\theta^{*}))^{-1} \\sqrt{n} (\\nabla \\hatL(\\theta^{*})) + O(\\sqrt{n} ||\\hat{\\theta}-\\theta^{*}||_{2}^{2}) \\\\\n&\\approx -(\\nabla^{2}\\hatL(\\theta^{*}))^{-1} \\sqrt{n} (\\nabla \\hatL(\\theta^{*})). \\label{lec1:eqn:interm}}\n\n \nApplying the Central Limit Theorem (Theorem~\\ref{lec1:thm:CLT}) using $X_i = \\nabla \\ell ((x^{(i)}, y^{(i)}), \\theta^*)$ and $\\widehat{X} = \\nabla \\hatL(\\theta^*)$, and noticing that $\\Exp[\\nabla \\hatL(\\theta^{*})] = \\nabla L(\\theta^{*})$, we have\n \\al{ \\sqrt{n} (\\nabla \\hatL(\\theta^{*}) - \\nabla L(\\theta^{*})) \\overset{d}{\\to} \\mathcal{N}(0,\\Cov(\\nabla \\ell((x, y), \\theta^{*}))).} \n \nNote that $\\nabla L(\\theta^{*}) = 0$ because $\\theta^{*}$ is the minimizer of  $L$, so $\\sqrt{n} (\\nabla \\hatL(\\theta^{*})) \\overset{d}{\\to} \\mathcal{N}(0,\\Cov(\\nabla \\ell((x, y), \\theta^{*})))$. By the law of large numbers, $\\nabla^2 \\hatL(\\theta^*) \\stackrel{p}{\\rightarrow} \\nabla^2 L(\\theta^*)$. 
Applying these results to \\eqref{lec1:eqn:interm} (together with an application of Slutsky's theorem),\n\\al{\n\\sqrt{n} (\\hat{\\theta}-\\theta^{*}) &\\overset{d}{\\to} \\nabla^{2}L(\\theta^{*})^{-1} \\mathcal{N}(0,\\Cov(\\nabla \\ell((x,y),\\theta^{*}))) \\\\\n&\\stackrel{d}{=} \\mathcal{N} \\left( 0,\\nabla^{2}L(\\theta^{*})^{-1}\\Cov(\\nabla \\ell((x,y), \\theta^{*})) \\nabla^{2}L(\\theta^{*})^{-1} \\right),\n}\n\nwhere the second step is due to Lemma~\\ref{lec1:lem:dist}. This proves Part 2 of Theorem~\\ref{lec1:thm:asymp}.\n\nPart 1 follows directly from Part 2 by the following fact: If $X_n \\stackrel{d}{\\rightarrow} P$ for some probability distribution $P$, then $X_n = O_P(1)$.\n\nWe now turn to proving Parts 3 and 4. Using a Taylor expansion of $L$ with respect to $\\theta$ at $\\theta^*$, we find\n\\begin{equation}\nL(\\hat \\theta) = L(\\theta^*) \n+ \\langle \\nabla L(\\theta^*), \\hat \\theta - \\theta^* \\rangle \n+ \\frac12 \\langle \\hat \\theta - \\theta^*, \\nabla^2 L(\\theta^*) (\\hat \\theta - \\theta^*) \\rangle + o(\\|\\hat \\theta - \\theta^*\\|_2^2).\n\\end{equation}\nSince $\\theta^*$ is the minimizer of the population risk $L$, we know that $\\nabla L(\\theta^*) = 0$ and the linear term is equal to 0. Rearranging and multiplying by $n$, we can write\n\\begin{align}\nn (L(\\hat \\theta) - L(\\theta^*)) &= \\frac{n}{2} \\langle \\hat \\theta - \\theta^*, \\nabla^2 L(\\theta^*) (\\hat \\theta - \\theta^*) \\rangle + o(\\|\\hat \\theta - \\theta^*\\|_2^2) \\\\\n&\\approx \\frac12 \\langle \\sqrt n(\\hat \\theta - \\theta^*), \\nabla^2 L(\\theta^*) \\sqrt n (\\hat \\theta - \\theta^*) \\rangle \\\\\n&= \\frac12 \\left\\|\\nabla^2 L(\\theta^*)^{1/2} \\sqrt n(\\hat \\theta - \\theta^*) \\right\\|_2^2,\n\\end{align}\n\nwhere the last equality follows from the fact that for any vector $v$ and positive semi-definite matrix $A$ of appropriate dimensions, the inner product $\\langle v, Av\\rangle = v^\\top Av = \\lVert A^{1/2}v \\rVert_2^2$. 
Let $S = \\nabla^2 L(\\theta^*)^{1/2} \\sqrt n(\\hat \\theta - \\theta^*)$, i.e. the random vector inside the norm. By Part 2, we know the asymptotic distribution of $\\sqrt n(\\hat \\theta - \\theta^*)$ is Gaussian. Thus as $n \\to \\infty$, $n (L(\\hat \\theta) - L(\\theta^*)) \\overset d \\to \\frac12 \\|S\\|_2^2$ where\n\\begin{align}\n    S &\\sim \\nabla^2 L(\\theta^*)^{1/2} \\cdot \\cN \\left(0, \\nabla^2 L(\\theta^*)^{-1} \\Cov(\\nabla \\ell ((x, y), \\theta^*)) \\nabla^2 L(\\theta^*)^{-1} \\right) \\\\\n    &\\stackrel{d}{=} \\cN \\left(0, \\nabla^2 L(\\theta^*)^{-1/2} \\Cov(\\nabla \\ell ((x, y), \\theta^*)) \\nabla^2 L(\\theta^*)^{-1/2} \\right).\n\\end{align}\n\nThis proves Part 4, and Part 3 follows directly from the definition of the $O_P$ notation.\n\nFinally, for Part 5, using the fact that the trace operator is invariant under cyclic permutations, the fact that $\\Exp [S] = 0$, and some regularity conditions,\n\\begin{align}\n    \\lim_{n \\to \\infty} \\Exp \\left[ n (L(\\hat \\theta) - L(\\theta^*)) \\right] &= \\frac12 \\Exp\\left[ \\|S\\|_2^2 \\right] = \\frac12 \\Exp \\left[ \\tr(S^\\top S) \\right] \\\\\n    &= \\frac12 \\Exp \\left[ \\tr(S S^\\top) \\right]  = \\frac12 \\tr \\left(\\Exp[S S^\\top] \\right) \\\\\n    &= \\frac12 \\tr \\left( \\Cov(S) \\right) \\\\\n    &= \\frac12 \\tr\\left( \\nabla^2 L(\\theta^*)^{-1} \\Cov(\\nabla \\ell ((x, y), \\theta^*)) \\right).\n\\end{align}\n\n\\subsec{Well-specified case}\n\nTheorem \\ref{lec1:thm:asymp} is powerful because it is general, avoiding any assumptions of a probabilistic model of our data. However in many applications, we assume a model of our data and we define the log-likelihood with respect to this model. Formally, suppose that we have a family of probability distributions $P_\\theta$, parameterized by $\\theta \\in \\Theta$, such that $P_{\\theta_*}$ is the true data-generating distribution. This is known as the well-specified case. 
To make the results of Theorem \\ref{lec1:thm:asymp} more applicable, we derive analogous results for this well-specified case in Theorem \\ref{lec2:thm:applied}.\n\n\\begin{theorem}\n\\label{lec2:thm:applied}\n    In addition to the assumptions of Theorem~\\ref{lec1:thm:asymp}, suppose there exists a parametric model $P(y \\mid x; \\theta)$, $\\theta \\in \\Theta$, such that $\\{ y\\sp{i} \\mid x\\sp{i} \\}_{i=1}^n \\sim P( y\\sp{i} \\mid x\\sp{i} ; \\theta_*)$ for some $\\theta_* \\in \\Theta$. Assume that we performing maximum likelihood estimation (MLE), i.e. our loss function is the negative log-likelihood $\\ell((x\\sp{i}, y\\sp{i}), \\theta) = - \\log P( y\\sp{i} \\mid x\\sp{i} ; \\theta)$. As before, let $\\hat\\theta$ and $\\theta^*$ denote the minimizers of empirical risk and population risk, respectively. Then\n    \\al{\n    \\label{lec2:eqn:applied1}\n        \\theta^* = \\theta_*,\n    }\n    \\al{\n    \\label{lec2:eqn:applied2}\n        \\Exp \\left[ \\nabla \\ell ((x, y), \\theta^*) \\right] = 0,\n    }\n    \\al{\n    \\label{lec2:eqn:applied3}\n        \\Cov \\left( \\nabla \\ell ((x, y), \\theta^*) \\right) = \\nabla^2 L(\\theta^*), \\text{ and}\n    }\n    \\al{\n    \\label{lec2:eqn:applied4}\n        \\sqrt n (\\hat \\theta - \\theta^*) \\overset d \\to \\cN (0, \\nabla^2 L(\\theta^*)^{-1}).\n    }\n\\end{theorem}\n\n\\textbf{Remark 1:} You may also have seen \\eqref{lec2:eqn:applied4} in the following form: under the maximum likelihood estimation (MLE) paradigm, the MLE is asymptotically efficient as it achieves the Cramer-Rao lower bound. 
That is, the parameter error of the MLE estimate converges in distribution to $\\mathcal{N}(0, \\mathcal{I}(\\theta)^{-1})$, where $\\mathcal{I}(\\theta)$ is the Fisher information matrix (in this case, equivalent to the risk Hessian $\\nabla^2 L(\\theta^*)$)~\\cite{rice2006mathematical}.\n\n\\textbf{Remark 2:} \\eqref{lec2:eqn:applied3} is also known as Bartlett's identity~\\cite{percynotes}.\n\nAlthough the proofs were not presented in live lecture, we include them here.\n\n\\begin{proof}\nFrom the definition of the population loss,\n\\begin{align}\n    L(\\theta) &= \\Exp \\left[ \\ell((x\\sp{i}, y\\sp{i}), \\theta) \\right]\\\\\n    &= \\Exp \\left[ - \\log P(y \\mid x; \\theta) \\right] \\\\\n    &= \\Exp \\left[ - \\log P(y \\mid x; \\theta) + \\log P(y \\mid x; \\theta_*) \\right] + \\Exp \\left[ - \\log P(y \\mid x; \\theta_*) \\right] \\\\\n    &= \\Exp \\left[ \\log \\frac{P(y \\mid x; \\theta_*)}{P(y \\mid x; \\theta)} \\right] + \\Exp \\left[ - \\log P(y \\mid x; \\theta_*) \\right].\n\\end{align}\nNotice that the second term is a constant which we will express as $\\cH(y \\mid x; \\theta_*)$. We expand the first term using the tower rule (or law of total expectation):\n\\begin{align}\n    L(\\theta) &= \\Exp \\left[ \\Exp \\left[ \\log \\frac{P(y \\mid x; \\theta_*)}{P(y \\mid x; \\theta)} \\biggr\\vert x \\right] \\right] + \\cH(y \\mid x; \\theta_*).\n\\end{align}\nThe term in the expectation is just the KL divergence between the two probabilities, so \n\\begin{align}\n    L(\\theta) &= \\Exp \\left[ \\KL \\left( y \\mid x; \\theta_* \\| y \\mid x; \\theta \\right) \\right] + \\cH(y \\mid x; \\theta_*) \\\\\n    &\\geq \\cH(y \\mid x; \\theta_*),\n\\end{align}\nsince KL divergence is always non-negative. Since $\\theta_*$ makes the KL divergence term 0, it minimizes $L(\\theta)$ and so $\\theta_* \\in \\argmin_\\theta L(\\theta)$. 
However, the minimizer of $L(\\theta)$ is unique because of consistency, so  we must have $\\argmin_\\theta L(\\theta) = \\theta^*$ which proves (\\ref{lec2:eqn:applied1}).\n\nFor \\eqref{lec2:eqn:applied2}, recall $\\nabla L(\\theta^*) = 0$, so we have\n\\begin{equation}\n0 = \\nabla L(\\theta^*) = \\nabla \\Exp \\left[ \\ell((x\\sp{i}, y\\sp{i}), \\theta^*) \\right] = \\Exp \\left[ \\nabla \\ell((x\\sp{i}, y\\sp{i}), \\theta^*) \\right],\n\\end{equation}\nwhere we can switch the gradient and expectation under some regularity conditions.\n\nTo prove \\eqref{lec2:eqn:applied3}, we first expand the RHS using the definition of covariance and express the marginal distributions as integrals:\n\\begin{align}\n    \\Cov \\left( \\nabla \\ell ((x, y), \\theta^*) \\right) &= \\Exp \\left[ \\nabla \\ell ((x, y), \\theta^*) \\nabla \\ell ((x, y), \\theta^*)^\\top \\right] \\\\\n    &= \\int P(x) \\left( \\int P(y \\mid x; \\theta^*) \\nabla \\log P( y\\sp{i} \\mid x\\sp{i} ; \\theta^*) \\nabla \\log P( y\\sp{i} \\mid x\\sp{i} ; \\theta^*)^\\top dy \\right) dx \\\\\n    &= \\int P(x) \\left( \\int \\frac{\\nabla P(y \\mid x; \\theta^*) \\nabla P(y \\mid x; \\theta^*)^\\top}{P(y \\mid x; \\theta^*)}dy \\right) dx.\n\\end{align}\nNow we expand the LHS using the definition of the population loss and differentiate repeatedly:\n\\begin{align}\n    \\nabla^2 L(\\theta^*) &= \\Exp \\left[- \\nabla^2 \\log P(y \\mid x; \\theta^*) \\right] \\\\\n    &= \\int P(x) \\left( \\int - \\nabla^2 P(y \\mid x; \\theta^*) + \\frac{\\nabla P(y \\mid x; \\theta^*) \\nabla P(y \\mid x; \\theta^*)^\\top}{P(y \\mid x; \\theta^*)}dy  \\right) dx.\n\\end{align}\nNote that we can express \n\\begin{equation} \\int \\nabla^2 P(y \\mid x; \\theta^*) dy = \\nabla^2 \\int P(y \\mid x; \\theta^*) dy = \\nabla 1  = 0 \\end{equation}\nso we find\n\\begin{equation} \\nabla^2 L(\\theta^*) = \\int P(x) \\left( \\int \\frac{\\nabla P(y \\mid x; \\theta^*) \\nabla P(y \\mid x; \\theta^*)^\\top}{P(y \\mid x; 
\\theta^*)}dy \\right) dx = \\Cov \\left( \\nabla \\ell ((x, y), \\theta^*) \\right). \\end{equation}\n\nFinally, \\eqref{lec2:eqn:applied4} follows directly from Part 2 of Theorem~\\ref{lec1:thm:asymp} and \\eqref{lec2:eqn:applied3}.\n\\end{proof}\n\nUsing similar logic to our proof of Part 4 and 5 of Theorem~\\ref{lec1:thm:asymp}, we can see that $n (L(\\hat \\theta) - L(\\theta^*)) \\overset d \\to \\frac12 \\|S\\|_2^2$ where $S \\sim N(0, I)$. Since a chi-squared distribution with $p$ degrees of freedom is defined as a sum of the squares of $p$ independent standard normals, it quickly follows that $2n (L(\\hat \\theta) - L(\\theta^*)) \\sim  \\chi^2(p)$, where $\\theta \\in \\R^p$ and $n \\to \\infty$. We can thus characterize the excess risk in this case using the properties of a chi-squared distribution:\n\n\\al{\n    \\lim_{n \\to \\infty} \\Exp \\left[ L(\\hat \\theta) - L(\\theta^*) \\right] = \\frac{p}{2n}.\n}\n\n\\sec{Limitations of asymptotic analysis}\n\nOne limitation of asymptotic analysis is that our bounds often obscure dependencies on higher order terms. As an example, suppose we have a bound of the form\n\t\\al{\n\t\t\\frac{p}{2n} + o\\left(\\frac{1}{n}\\right).\n\t\t\\label{lec2:eqn:spicy_bound}\n\t}\n(Here $o(\\cdot)$ treats the parameter $p$ as a constant as $n$ goes to infinity.) \nWe have no idea how large $n$ needs to be for asymptotic bounds to be ``reasonable.\" Compare two possible versions of \\eqref{lec2:eqn:spicy_bound}: \n\\begin{align}\n    \\frac{p}{2n} + \\frac{1}{n^2} \\quad \\text{vs.} \\quad \\frac{p}{2n} + \\frac{p^{100}}{n^2}.\n\\end{align}\nAsymptotic analysis treats both of these bounds as the same, hiding the polynomial dependence on $p$ in the second bound. Clearly, the second bound is significantly more data-intensive than the first: we would need $n > p^{50}$ for $\\frac{p^{100}}{n^2}$ to be less than one. 
Since $p$ represents the dimensionality of the data, this may be an unreasonable assumption.\n\nThis is where non-asymptotic analysis can be helpful. Whereas asymptotic analysis uses large-sample theorems such as the central limit theorem and the law of large numbers to provide convergence guarantees, non-asymptotic analysis relies on concentration inequalities to develop alternative techniques for reasoning about the performance of learning algorithms.\n\n"
  },
  {
    "path": "tex/collection/03concentration.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{3}{Brad Ross and Robbie Jones}{Jan 20, 2021}\n\nIn this chapter, we take a little diversion and develop the notion of \\emph{concentration inequalities}. Assume that we have independent random variables $X_1, \\ldots, X_n$. We will develop tools to show results that formalize the intuition for these statements:\n\\begin{enumerate}\n    \\item $X_1 + \\ldots + X_n$ concentrates around $\\Exp[X_1 + \\ldots + X_n]$.\n    \\item More generally, $f(X_1, \\ldots, X_n)$ concentrates around $\\Exp[f(X_1, \\ldots, X_n)]$.\n\\end{enumerate}\nThese inequalities will be used in subsequent chapters to bound several key quantities of interest.\n\nAs it turns out, the material from this chapter constitutes arguably the important mathematical tools in the entire course. No matter what area of machine learning one wants to study, if it involves sample complexity, some kind of concentration result will typically be required. Hence, concentration inequalities are some of the most important tools in modern statistical learning theory.\n\n\\sec{The big-O notation}\n\nThroughout the rest of this course, we will use ``big-O\" notation in the following sense: every occurrence of $O(x)$ is a placeholder for some function $f(x)$ such that for every $x$, $|f(x)| \\leq Cx$ for some absolute/universal constant $C$. In other words, when $O(n_1),\\dots, O(n_k)$ occur in a statement, it means that \\textbf{there exist} absolute constants $C_1,\\dots, C_k > 0$ and functions $f_1,\\dots, f_k$ satisfying $|f_i(x)|\\le C_ix$ for all $x$, such that after replacing each occurrence $O(n_i)$ by $f_i(n_i)$,  the statement is true.  The difference from traditional ``big-O\" notation is that we do not need to send $n \\to \\infty$ in order to define ``big-O\". 
In nearly all cases, big-O notation is used to define an upper bound; then, the bound is identical if we simply substitute $Cx$ in place of $O(x)$. \n\nNote that the $x$ in our definition of big-O is a surrogate for an arbitrary variable. For instance, later on in this chapter, we will encounter the term $O(\\sigma \\sqrt{\\log n})$. The definition above, applied with $x = \\sigma \\sqrt{\\log n}$, yields the following conclusion: $O(\\sigma \\sqrt{\\log n}) = f(\\sigma \\sqrt{\\log n})$ for some function $f$ and constant $C$ such that $|f(\\sigma \\sqrt{\\log n})| \\leq C \\sigma  \\sqrt{\\log n}$ for all values that $\\sigma \\sqrt{\\log n}$ can take. \n\nLastly, for any $a, b \\geq 0$, we will let $a \\lesssim b$ mean that there is some absolute constant $c > 0$ such that $a \\leq cb$.\n\\sec{Chebyshev's inequality} \n\nWe begin by considering an arbitrary random variable $Z$ with finite variance. One of the most famous results characterizing its tail behavior is the following theorem:\n\n\\begin{theorem}[Chebyshev's inequality]\n    Let $Z$ be a random variable with finite expectation and variance. Then\n    \\al{\n        \\Pr[|Z - \\Exp[Z]| \\geq t] \\leq \\frac{\\Var(Z)}{t^2}, \\quad \\forall t > 0.\n        \\label{lec3:eqn:chebyshev}\n    }\n\\end{theorem}\n\nIntuitively, this means that as we approach the tails of the distribution of $Z$, the density decreases at a rate of at least $1 / t^2$. Moreover, for any $\\delta \\in (0, 1]$, by plugging in $t = \\sd(Z) / \\sqrt{\\delta}$ to \\eqref{lec3:eqn:chebyshev} we see that \n    \\al{\n        \\Pr\\left[|Z - \\Exp[Z]| \\leq \\frac{\\sd(Z)}{\\sqrt{\\delta}}\\right] \\geq 1 - \\delta.\n        \\label{lec3:eqn:chebyshevdelta}\n    }\n    \nUnfortunately, it turns out that Chebyshev's inequality is a rather weak concentration inequality. To illustrate this, assume $Z \\sim \\cN(0, 1)$. 
We can show (using the Gaussian tail bound derived in Problem 3(c) in Homework 0) that\n\\al{\n    \\Pr\\left[|Z - \\Exp[Z]| \\leq \\sd(Z)\\sqrt{2 \\log (2 / \\delta)}\\right] \\geq 1 - \\delta.\n    \\label{lec3:eqn:normaltailbound}\n}\nfor any $\\delta \\in (0, 1]$. In other words, the density at the tails of the normal distribution is decreasing at an exponential rate, while Chebyshev's inequality only gives a quadratic rate. The discrepancy between \\eqref{lec3:eqn:chebyshevdelta} and \\eqref{lec3:eqn:normaltailbound} is made more apparent when we consider inverse-polynomial $\\delta = \\frac{1}{n^c}$ for some parameter $n$ and degree $c$ (we will see concrete instances of this setup in future chapters). Then the tail bound for the normal distribution \\eqref{lec3:eqn:normaltailbound} implies that\n\\al{\n    |Z - \\Exp[Z]| \\leq \\sd(Z) \\cdot \\sqrt{\\log{O\\left(n^c\\right)}} = \\sd(Z) \\cdot O\\left(\\sqrt{\\log{n}}\\right) \\quad w.p. \\; 1 - \\delta,\n}\nwhile Chebyshev's inequality gives us the weaker result\n\\al{\n    |Z - \\Exp[Z]| \\leq \\sd(Z) \\cdot \\sqrt{O(n^c)} = \\sd(Z) \\cdot O(n^{c / 2})  \\quad w.p. \\; 1 - \\delta.\n}\n\nChebyshev's inequality is actually optimal without further assumptions, in the sense that there exist distributions with finite variance for which the bound is tight. However, in many cases, we will be able to improve the $1/t^2$ rate of tail decay in Chebyshev's inequality to an $e^{-t}$ rate. In the next two sections, we will demonstrate how to construct tail bounds with exponential decay rates.\n\n\\sec{Hoeffding's inequality}\\label{lec2:subsec:hoeffding}\n\nWe next provide a brief overview of Hoeffding's inequality, a concentration inequality for bounded random variables with an exponential tail bound:\n\n\\begin{theorem}[Hoeffding's inequality]\n    Let $X_1, X_2, \\dots, X_n$ be independent real-valued random variables drawn from some distribution, such that $a_i \\leq X_i \\leq b_i$ almost surely. 
Define $\\bar{X} = \\frac{1}{n}\\sum_{i=1}^n X_i$, and let $\\mu = \\Exp [\\bar{X}]$. Then for any $\\varepsilon > 0$,\n    \\al {\n    \\Pr \\left[ |\\bar{X} - \\mu | \\leq \\varepsilon \\right] \\geq 1 - 2  \\exp\\left(\\frac{-2n^2\\varepsilon^2}{\\sum_{i=1}^n (b_i - a_i)^2}\\right). \\label{lec2:eqn:hoeffding}\n    }\n\\end{theorem}\n\nNote that the denominator within the exponential term, $\\sum_{i=1}^n (b_i - a_i)^2$, can be thought of as an upper bound or proxy for the variance $\\Var(X_i)$. In fact, under the independence assumption, we can show\n\\begin{align}\n    \\Var\\left(\\bar{X} \\right) &= \\frac{1}{n^2}\\sum_{i=1}^n \\Var(X_i) \\leq \\frac{1}{n^2}\\sum_{i=1}^n (b_i - a_i)^2.\n\\end{align}\n\nLet $\\sigma^2 = \\frac{1}{n^2}\\sum_{i=1}^n (b_i - a_i)^2$. If we take $\\varepsilon = O(\\sigma \\sqrt{\\log{n}}) = \\sigma \\sqrt{c \\log n}$ so that $\\varepsilon$ is bounded above by some large (i.e., $c \\geq 10$) multiple of the standard deviation of the $X_i$'s times $\\sqrt{\\log{n}}$, we can substitute this value of $\\varepsilon$ into \\eqref{lec2:eqn:hoeffding} to reach the following conclusion: \n\\begin{align}\n    \\Pr \\left[ |\\bar{X} - \\mu| \\leq \\varepsilon \\right] &\\geq 1 - 2\\exp\\left(\\frac{-2 \\varepsilon^2}{\\sigma^2}\\right)\\\\\n    &= 1 - 2 \\exp(-2 c \\log n)\\\\\n    &= 1 - 2 n^{-2c}\n\\end{align}\n\nWe can see that as $n$ grows, the right-most term tends to zero such that $\\Pr[|\\bar{X} - \\mu| \\leq \\varepsilon]$ very quickly approaches 1. 
Intuitively, this result tells us that, with high probability, the sample mean $\\bar{X}$ will not be ``much farther\" from the population mean $\\mu$ by more than some sublogarithmic ($\\sqrt{c \\log n}$) factor of the standard deviation.\\footnote{This is with the caveat, of course, that $\\sigma$ is not exactly the standard deviation but a loose upper bound on standard deviation.} Thus, we can restate the above claim we reached as follows:\n\n\\begin{remark}\n    For sufficiently large $n$, $|\\bar{X} - \\mu | \\leq O(\\sigma \\sqrt{\\log{n}})$ with high probability.\n\\end{remark}\n\n\\begin{remark}\\label{lec2:rem:hoeffding}\n    If, in addition, we have $a_i = -O(1)$ and $b_i = O(1)$, then $\\sigma^2 = O \\left( \\frac{1}{n}\\right)$, and $|\\bar{X} - \\mu | \\leq O\\left(\\sqrt{\\frac{\\log n}{n}}\\right) = \\tilO\\left(\\frac{1}{\\sqrt{n}}\\right)$.\\footnote{$\\tilO$ is analogous to Big-$O$ notation, but $\\tilO$ hides logarithmic factors. That is; if $f(n) = O(\\log n)$, then $f(n) = \\tilO(1)$.}\n\\end{remark}\n\nRemark~\\ref{lec2:rem:hoeffding} provides a compact form of the Hoeffding bound that we can use when the $X_i$ are bounded almost surely. \n\nSo far, we have only shown how to construct exponential tail bounds for bounded random variables. Since requiring boundedness in $[0, 1]$ (or $[a, b]$ more generally) is limiting, it is worth asking what types of distributions permit such an exponential tail bound. The following section will explore such a class of random variables: \\emph{sub-Gaussian} random variables.\n\n\\sec{Sub-Gaussian random variables}\n\nWe begin by defining the class of sub-Gaussian random variables by way of a bound on their moment generating functions. 
After establishing this definition, we will see how this bound guarantees the exponential tail decay we desire.\n\n\\begin{definition}[Sub-Gaussian Random Variables]\n    A random variable $X$ with finite mean $\\mu$ is \\textit{sub-Gaussian} with parameter $\\sigma$ if\n    \\al{\n        \\Exp \\left[ e^{\\lambda(X - \\mu)} \\right] \\leq e^{\\sigma^2\\lambda^2 / 2}, \\quad \\forall\\lambda\\in\\R.\n        \\label{lec3:eqn:subgassdefn}\n    }\n    We say that $X$ is $\\sigma$-sub-Gaussian and say it has \\emph{variance proxy} $\\sigma^2$.\n\\end{definition}\n\n\\begin{remark}\\label{lec3:rem:mgf_strong}\n    As it turns out, \\eqref{lec3:eqn:subgassdefn} is quite a strong condition, requiring that infinitely many moments of $X$ exist and do not grow too quickly. To see why, assume without loss of generality that $\\mu = 0$ and take a power series expansion of the moment generating function:\n    \\al{\n        \\Exp[\\exp(\\lambda X)] = \\Exp\\left[\\sum_{k = 0}^\\infty \\frac{(\\lambda X)^k}{k!}\\right] = \\sum_{k = 0}^\\infty\\frac{\\lambda^k}{k!}\\Exp[X^k].\n    }\n    A bound on the moment generating function then is a bound on infinitely many moments of $X$, i.e. a requirement that the moments of $X$ are all finite and grow slowly enough to allow the power series to converge. 
Though a proof of this result is beyond the scope of this monograph, Proposition 2.5.2 in \\cite{vershynin2018high} shows that \\eqref{lec3:eqn:subgassdefn} is equivalent to $\\Exp \\left [|X|^p \\right ]^{1/p} \\lesssim \\sqrt{p}$ for all $p \\geq 1$.\n\\end{remark}\n\n\\noindent Although \\eqref{lec3:eqn:subgassdefn} is not a particularly intuitive definition, it turns out to imply exactly the type of exponential tail bound we want:\n\n\\begin{theorem}[Tail bound for sub-Gaussian random variables]\\label{lec3:thm:subgausstail}\n    If a random variable $X$ with finite mean $\\mu$ is $\\sigma$-sub-Gaussian, then\n    \\al{ \n        \\Pr[|X - \\mu| \\geq t] \\leq 2 \\exp \\left( -\\frac{t^2}{2\\sigma^2} \\right), \\quad \\forall t \\in \\R.\n        \\label{lec3:eqn:subgausstail}\n    }\n\\end{theorem}\n\n\\begin{proof}\nFix $t > 0$. For any $\\lambda > 0$,\n\\al{\n    \\Pr[X - \\mu \\geq t] &= \\Pr[\\exp(\\lambda (X - \\mu)) \\geq \\exp(\\lambda t)]  \\\\\n    &\\leq \\exp(-\\lambda t)\\Exp[\\exp(\\lambda (X - \\mu))] && \\text{(by Markov's inequality)}  \\\\\n    &\\leq \\exp(-\\lambda t)\\exp(\\sigma^2\\lambda^2/2) && \\text{(by \\eqref{lec3:eqn:subgassdefn})} \\\\\n    &= \\exp(-\\lambda t + \\sigma^2\\lambda^2/2). \\label{lec3:eqn:non_opt_tail_bound}\n}\nBecause the bound \\eqref{lec3:eqn:non_opt_tail_bound} holds for any choice of $\\lambda > 0$ and $\\exp(\\cdot)$ is monotonically increasing, we can optimize the bound \\eqref{lec3:eqn:non_opt_tail_bound} by finding $\\lambda$ which minimizes the exponent $-\\lambda t + \\sigma^2 \\lambda^2/2$. 
Differentiating and setting the derivative equal to zero, we find that the optimal choice is $\\lambda = t/\\sigma^2$, yielding the one-sided tail bound\n\\al{\\label{lec3:eqn:opt_tail_bound_right}\n    \\Pr[X - \\mu \\geq t] \\leq \\exp\\left(-\\frac{t^2}{2\\sigma^2}\\right).\n}\nGoing through the same line of reasoning but for $-X$ and $-t$, we can also show that for any $t > 0$,\n\\al{\\label{lec3:eqn:opt_tail_bound_left}\n    \\Pr[X - \\mu \\leq -t] \\leq \\exp\\left(-\\frac{t^2}{2\\sigma^2}\\right).\n}\n\nWe can then obtain \\eqref{lec3:eqn:subgausstail} by applying the union bound:\n\\al{\n    \\Pr[|X - \\mu| \\geq t] = \\Pr[X - \\mu \\geq t] + \\Pr[X - \\mu \\leq -t] \\leq 2\\exp\\left(-\\frac{t^2}{2\\sigma^2}\\right).\n}\n\\end{proof}\n\n\\begin{remark}[Tail bound implies sub-Gaussianity]\\label{lec3:rem:tail_bound_remark}\n    In addition to being a necessary condition for sub-Gaussianity (Theorem \\ref{lec3:thm:subgausstail}), the tail bound \\eqref{lec3:eqn:subgausstail} for sub-Gaussian random variables is also a sufficient condition up to a constant factor. In particular, if a random variable $X$ with finite mean $\\mu$ satisfies \\eqref{lec3:eqn:subgausstail} for some $\\sigma > 0$, then $X$ is $O(\\sigma)$-sub-Gaussian. Unfortunately, the proof of this reverse direction is somewhat more involved, so we refer the interested reader to Theorem 2.6 and its proof in Section 2.4 of \\cite{wainwright2019high} and Proposition 2.5.2 in \\cite{vershynin2018high} for details. 
While the tail bound is the property we ultimately care about most when studying sub-Gaussian random variables, the definition in \\eqref{lec3:eqn:subgassdefn} is a more technically convenient characterization, as we will see in the proof of Theorem \\ref{lec3:thm:sum_sub_gaussian}.\n\\end{remark}\n\n\\begin{remark}\n    Note that in light of Remark \\ref{lec3:rem:mgf_strong}, the tail bound \\eqref{lec3:eqn:normaltailbound} requires all central moments of $X$ to exist and not grow too quickly. In contrast, Chebyshev's inequality (and more generally any polynomial variant of Markov's inequality $\\Pr[|X-\\mu| \\geq t] = \\Pr[|X-\\mu|^k \\geq t^k] \\leq t^{-k}\\Exp[|X-\\mu|^k]$) only requires that the second central moment $\\Exp[(X-\\mu)^2]$ (more generally, the $k$th central moment $\\Exp[|X - \\mu|^k]$) is finite to yield a tail bound. If all moments exist, however, it turns out that $\\inf_{k \\in \\mathbb{N}} t^{-k}\\Exp[|X-\\mu|^k] \\leq \\inf_{\\lambda > 0} \\exp(-\\lambda t) \\Exp[\\exp(\\lambda (X-\\mu))]$, i.e. the optimal polynomial tail bound is tighter than the optimal exponential tail bound (see Exercise 2.3 in \\cite{wainwright2019high}). As we will see shortly though, using exponential functions of random variables allows us to prove results about sums of random variables more conveniently. 
This ``tensorization'' property is why most researchers use exponential tail bounds in practice.\n\\end{remark}\n\nHaving defined and derived exponential tail bounds for sub-Gaussian random variables, we can now accomplish the first of the goals we set out at the beginning of the chapter: show that under certain conditions, namely independence and sub-Gaussianity of $X_1, \\dotsc, X_n$, the sum $Z = \\sum_{i = 1}^n X_i$ concentrates around $\\Exp[Z] = \\Exp[\\sum_{i = 1}^n X_i]$.\n\n\\begin{theorem}[Sum of sub-Gaussian random variables is sub-Gaussian]\\label{lec3:thm:sum_sub_gaussian}\n    If $X_1, \\ldots, X_n$ are independent sub-Gaussian random variables with variance proxies $\\sigma_1^2, \\ldots, \\sigma_n^2$, then $Z = \\sum_{i = 1}^n X_i$ is sub-Gaussian with variance proxy $\\sum_{i = 1}^n \\sigma_i^2$. As a consequence, we have the tail bound\n    \\al{\n        \\Pr[|Z - \\Exp[Z]| \\geq t] \\leq 2\\exp\\left(-\\frac{t^2}{2\\sum_{i = 1}^n \\sigma_i^2}\\right),\n    }\n    for all $t \\in \\R$.\n\\end{theorem}\n\n\\begin{proof}\nUsing the independence of $X_1, \\ldots, X_n$, we have that for any $\\lambda \\in \\R$:\n \\al{\n    \\Exp \\left[ \\exp \\left\\{\\lambda(Z - \\Exp[Z]) \\right\\} \\right] &= \\Exp\\left[\\prod_{i = 1}^n \\exp \\left\\{\\lambda(X_i - \\Exp[X_i]) \\right\\}\\right] \\\\\n    &= \\prod_{i = 1}^n \\Exp \\left[ \\exp \\left\\{\\lambda(X_i - \\Exp[X_i]) \\right\\} \\right] \\\\\n    &\\leq \\prod_{i = 1}^n \\exp \\left( \\frac{\\lambda^2\\sigma_i^2}{2} \\right) \\\\\n    &= \\exp \\left( \\frac{\\lambda^2 \\sum_{i = 1}^n\\sigma_i^2}{2} \\right),\n }\n so $Z$ is sub-Gaussian with variance proxy $\\sum_{i = 1}^n \\sigma_i^2$. 
The tail bound then follows immediately from \\eqref{lec3:eqn:subgausstail}.\n\\end{proof}\n\nThe proof above demonstrates the value of the moment generating functions of sub-Gaussian random variables: they factorize conveniently when dealing with sums of independent random variables.\n\n\\subsec{Examples of sub-Gaussian random variables}\n\nWe now provide several examples of classes of random variables that are sub-Gaussian, some of which will appear repeatedly throughout the remainder of the course.\n\n\\begin{example}[Rademacher random variables]\n    A \\textit{Rademacher random variable} $\\epsilon$ takes a value of 1 with probability $1/2$ and a value of $-1$ with probability $1/2$. To see that $\\epsilon$ is $1$-sub-Gaussian, we follow Example 2.3 in \\cite{wainwright2019high} and upper bound the moment generating function of $\\epsilon$ by way of a power series expansion of $\\exp(\\cdot)$:\n    \\al{\n        \\Exp[\\exp(\\lambda \\epsilon)] &= \\frac{1}{2}\\left\\{\\exp(-\\lambda) + \\exp(\\lambda)\\right\\} \\\\\n        &= \\frac{1}{2}\\left\\{\\sum_{k = 0}^\\infty \\frac{(-\\lambda)^k}{k!} + \\sum_{k = 0}^\\infty \\frac{\\lambda^k}{k!}\\right\\} \\\\\n        &= \\sum_{k = 0}^\\infty \\frac{\\lambda^{2k}}{(2k)!} && \\text{(for odd $k$, $(-\\lambda)^k + \\lambda^k = 0$)} \\\\\n        &\\leq 1 + \\sum_{k = 1}^\\infty \\frac{\\left(\\lambda^2\\right)^{k}}{2^k k!} && \\text{($2^k k!$ is every other term of $(2k)!$)} \\\\\n        & = \\exp(\\lambda^2/2),\n    }\n    which is exactly the moment generating function bound \\eqref{lec3:eqn:subgassdefn} required for $1$-sub-Gaussianity.\n\\end{example}\n\n\\begin{example}[Random variables with bounded distance to mean]\\label{lec3:ex:rand_var_bound_dist_to_mean}\n    Suppose a random variable $X$ satisfies $|X - \\Exp[X]| \\leq M$ almost surely for some constant $M$. 
Then $X$ is $O(M)$-sub-Gaussian.\n\\end{example}\nWe now provide an even more general class of sub-Gaussian random variables that subsume the random variables in Example \\ref{lec3:ex:rand_var_bound_dist_to_mean}:\n\\begin{example}[Bounded random variables]\n    \\label{lec3:ex:bounded_rand_var_subg}\n    If $X$ is a random variable such that $a \\leq X \\leq b$ almost surely for some constants $a, b \\in \\R$, then\n    \\begin{equation*}\n        \\Exp\\left[e^{\\lambda(X - \\Exp[X])}\\right] \\leq \\exp \\left[ \\frac{\\lambda^2(b - a)^2}{8} \\right],\n    \\end{equation*}\n    i.e., $X$ is sub-Gaussian with variance proxy $(b - a)^2/4$. (We will prove this in Question 2(a) of Homework 1.) Note that combining the $(b - a)/2$-sub-Gaussianity of i.i.d. bounded random variables $X_1, \\dotsc, X_n$ and Theorem \\ref{lec3:thm:sum_sub_gaussian} yields a proof of Hoeffding's inequality.\n\\end{example}\n\n\\begin{example}[Gaussian random variables]\nIf $X$ is Gaussian with variance $\\sigma^2$, then $X$ satisfies \\eqref{lec3:eqn:subgassdefn} with equality. 
In this special case, the variance and the variance proxy are the same.\n\\end{example}\n\n\\sec{Concentrations of functions of random variables}\nWe now introduce some important inequalities related to the second of our two goals, namely, showing that for independent $X_1, \\dotsc, X_n$ and certain functions $f$, $f(X_1, \\dotsc, X_n)$ concentrates around $\\Exp[f(X_1, \\dotsc, X_n)]$.\n\n\\begin{theorem}[McDiarmid's inequality]\n    Suppose $f : \\R^n \\to \\R$ satisfies the \\emph{bounded difference condition}: there exist constants $c_1, \\ldots, c_n \\in \\R$ such that for all real numbers $x_1, \\ldots, x_n$ and $x_i'$,\n    \\al{\\label{lec3:eqn:mcdiarmid_fn_cond}\n        |f(x_1, \\ldots, x_n) - f(x_1, \\ldots, x_{i - 1}, x_i', x_{i + 1}, \\ldots, x_n)| \\leq c_i.\n    }\n    (Intuitively, \\eqref{lec3:eqn:mcdiarmid_fn_cond} states that $f$ is not overly sensitive to arbitrary changes in a single coordinate.) Then, for any independent random variables $X_1, \\ldots, X_n$,\n    \\al{\n        \\Pr \\left[ f(X_1, \\ldots, X_n) - \\Exp[f(X_1, \\ldots, X_n)] \\geq t \\right] \\leq \\exp\\left(-\\frac{2t^2}{\\sum_{i = 1}^n c_i^2}\\right). \\label{lec3:eqn:mcdiarmid_bound}\n    }\n    Moreover, $f(X_1, \\ldots, X_n)$ is $O\\left(\\sqrt{\\sum_{i = 1}^n c_i^2}\\right)$-sub-Gaussian.\n\\end{theorem}\n\n\\begin{remark}\n    Note that McDiarmid's inequality is a generalization of Hoeffding's inequality with $a_i \\leq x_i \\leq b_i$ and\n    \\begin{equation}\n        f(x_1, \\dotsc, x_n) = \\sum_{i = 1}^n x_i.\n    \\end{equation} \n\\end{remark}\n\n\\begin{proof}\n    The idea of this proof is to take the quantity $f(X_1,\\dots,X_n) - \\Exp[f(X_1,\\dots,X_n)]$ and break it into manageable components by conditioning on portions of the sample. 
To this end, we begin by defining:\n\t\\begin{align*}\n\t \tZ_0 &= \\Exp \\left[f(X_1,\\dots,X_n) \\right] &&\\text{constant}\\\\\n\t \tZ_1 &= \\Exp \\left[f(X_1,\\dots,X_n) \\lvert X_1 \\right] &&\\text{a function of $X_1$} \\\\\n        &\\cdots \\\\\n        Z_i &= \\Exp \\left [f(X_1,\\dots,X_n) | X_1,\\dots,X_i \\right] &&\\text{a function of $X_1,\\dots,X_i$} \\\\\n\t \t&\\cdots \\\\\n        Z_n &= f(X_1,\\dots,X_n)\n\t\\end{align*}\n    Using the law of total expectation, we show also that the expectation of $Z_i$ equals $Z_0$ for all $i$.\n    \\begin{align*}\n        \\Exp [Z_i] &= \\Exp \\left [ \\Exp \\left [f(X_1,\\dots,X_n) | X_1,\\dots,X_i \\right] \\right] \\\\\n        &= \\Exp[f(X_1,\\dots,X_n)] \\\\\n        &= Z_0\n    \\end{align*}\n    The fact that $\\Exp[D_i] = 0$, where $D_i = Z_i - Z_{i - 1}$, is an immediate corollary of this result. Next, we observe that we can rewrite the quantity of interest, $Z_n - Z_0$, as a telescoping sum in the increments $Z_i - Z_{i - 1}$:\n    \\begin{align*}\n        Z_n - Z_0 &= (Z_n - Z_{n - 1}) + (Z_{n - 1} - Z_{n - 2}) + \\cdots + (Z_1 - Z_0) \\\\\n        &= \\sum_{i = 1}^n D_i\n    \\end{align*} \n    Next, we show that conditional on $X_1,\\dots,X_{i - 1}$, $D_i$ is a bounded random variable. First, observe that:\n    \\begin{align*}\n        A_i = \\inf_x \\Exp \\left[ f(X_1,\\dots,X_n) | X_1,\\dots,X_{i - 1}, X_i = x \\right] - \\Exp \\left[ f(X_1,\\dots,X_n) | X_1,\\dots,X_{i - 1} \\right] \\\\\n        B_i = \\sup_x \\Exp \\left [ f(X_1,\\dots,X_n) | X_1,\\dots,X_{i - 1}, X_i = x \\right] - \\Exp \\left[ f(X_1,\\dots,X_n) | X_1,\\dots,X_{i - 1} \\right]\n    \\end{align*}\n    It is clear from their definition that $A_i \\leq D_i \\leq B_i$. 
Furthermore, by independence of the $X_i$'s, we have that:\n    \\begin{align*}\n        B_i - A_i &\\leq \\sup_{x_{1:i - 1}} \\sup_{x, x'} \\int \\left (f(x_1,\\dots,x_{i - 1}, x, x_{i + 1},\\dots,x_n) - f(x_1,\\dots,x_{i - 1}, x', x_{i + 1},\\dots,x_n)\\right) dP(x_{i + 1},\\dots,x_n) \\\\\n        &\\leq c_i\n    \\end{align*}\n    Using this bound, the properties of conditional expectation, and Example~\\ref{lec3:ex:bounded_rand_var_subg}, we can now prove that $Z_n - Z_0$ is $O\\left(\\sqrt{\\sum_{i = 1}^n c_i^2}\\right)$-sub-Gaussian.\n    \\begin{align*}\n        \\Exp \\left[e^{\\lambda(Z_n - Z_0)} \\right] &= \\Exp \\left[e^{\\lambda \\sum_{i = 1}^n (Z_i - Z_{i - 1})} \\right] \\\\\n        &= \\Exp \\left[ \\Exp \\left[e^{\\lambda (Z_n - Z_{n - 1})} \\biggr\\lvert X_1,\\dots,X_{n - 1} \\right]e^{\\lambda \\sum_{i = 1}^{n - 1} (Z_i - Z_{i - 1})} \\right] \\\\\n        &\\leq e^{\\lambda^2 c_n^2/8} \\Exp \\left[ e^{\\lambda \\sum_{i = 1}^{n - 1} (Z_i - Z_{i - 1})} \\right] \\\\\n        &\\cdots \\\\\n        &\\leq e^{\\lambda^2 (\\sum_{i = 1}^n c_i^2)/8}\n    \\end{align*}\n    The final inequality given in \\eqref{lec3:eqn:mcdiarmid_bound} follows by Theorem~\\ref{lec3:thm:subgausstail}.\n\\end{proof}\n\nA more general version of McDiarmid's inequality comes from Theorem 3.18 in~\\cite{vanhandel2016high}. The setup for this theorem requires defining the \\emph{one-sided differences} of a function $f : \\R^n \\to \\R$:\n\\al{\n    D_i^-{f(x)} &= f(x_1, \\ldots, x_n) - \\inf_z f(x_1, \\ldots, x_{i - 1}, z, x_{i + 1}, \\ldots, x_n) \\\\\n    D_i^+{f(x)} &= \\sup_z f(x_1, \\ldots, x_{i - 1}, z, x_{i + 1}, \\ldots, x_n) - f(x_1, \\ldots, x_n).\n}\nThese two quantities are functions of $x \\in \\R^n$, and hence can be interpreted as describing the sensitivity of $f$ \\emph{at a particular point}. (Contrast this with the bounded difference condition \\eqref{lec3:eqn:mcdiarmid_fn_cond}, which bounds the sensitivity of $f$ universally over all points.) 
For convenience, define\n\\al{\n    d^+ &= \\Norm{\\sum_{i = 1}^n |D_i^+{f}|^2}_\\infty = \\sup_{x_1, \\ldots, x_n}\\sum_{i = 1}^n [D_i^+{f(x_1, \\ldots, x_n)}]^2 \\\\\n    d^- &= \\Norm{\\sum_{i = 1}^n |D_i^-{f}|^2}_\\infty = \\sup_{x_1, \\ldots, x_n}\\sum_{i = 1}^n [D_i^-{f(x_1, \\ldots, x_n)}]^2.\n}\n\\begin{theorem}[Bounded difference inequality, Theorem 3.18 in~\\cite{vanhandel2016high}]\n    Let $f : \\R^n \\to \\R$, and let $X_1, \\ldots, X_n$ be independent random variables. Then, for all $t \\geq 0$,\n    \\al{\n        \\Pr[f(X_1, \\ldots, X_n) \\geq \\Exp[f(X_1, \\ldots, X_n)] + t] &\\leq \\exp\\left(-\\frac{t^2}{4d^-}\\right) \\\\\n        \\Pr[f(X_1, \\ldots, X_n) \\leq \\Exp[f(X_1, \\ldots, X_n)] - t] &\\leq \\exp\\left(-\\frac{t^2}{4d^+}\\right).\n    }\n\\end{theorem}\n\n\\subsec{Bounds for Gaussian random variables}\nUnfortunately, the bounded difference condition (\\ref{lec3:eqn:mcdiarmid_fn_cond}) is often only satisfied by bounded random variables or a bounded function. To get similar concentration inequalities for unbounded random variables, we need some other special conditions. The following inequalities assume that the random variables have the standard normal distribution.\n\n\\begin{theorem}[Gaussian Poincar\\'{e} inequality, Corollary 2.27 in~\\cite{vanhandel2016high}]\n    Let $f : \\R^n \\to \\R$ be smooth. 
If $X_1, \\ldots, X_n$ are independently sampled from $\\cN(0, 1)$, then\n    \\al{\n        \\Var(f(X_1, \\ldots, X_n)) \\leq \\Exp \\left[ \\norm{\\nabla{f}(X_1, \\ldots, X_n)}_2^2 \\right].\n    }\n\\end{theorem}\n\nBefore introducing the next theorem, we recall that a function $f : \\R^n \\to \\R$ is \\emph{$L$-Lipschitz} with respect to the $\\ell_2$-norm if there exists a non-negative constant $L \\in \\R$ such that for all $x, y \\in \\R^n$,\n\\al{\n    |f(x) - f(y)| \\leq L\\norm{x - y}_2.\n}\nWe emphasize that $L$ is universal for all points in $\\R^n$.\n\n\\begin{theorem}[Theorem 2.26 in~\\cite{wainwright2019high}]\n    Suppose $f : \\R^n \\to \\R$ is $L$-Lipschitz with respect to Euclidean distance, and let $X = (X_1, \\ldots, X_n)$, where $X_1, \\ldots, X_n \\iid \\cN(0, 1)$. Then for all $t \\in \\R$,\n    \\al{\n        \\Pr[|f(X) - \\Exp[f(X)]| \\geq t] \\leq 2\\exp\\left(-\\frac{t^2}{2L^2}\\right).\n    }\nIn particular, $f(X)$ is sub-Gaussian.\n\\end{theorem}\n"
  },
  {
    "path": "tex/collection/04-01-uniform.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{4}{Yizhou Qian}{Jan 25th, 2021}\n\nIn Chapter \\ref{chap:asymp}, we pointed out some limitations of asymptotic analysis. In this chapter, we will turn our focus to \\textit{non-asymptotic analysis}, where we provide convergence guarantees without having the number of observations $n$ go off to infinity. A key tool for proving such guarantees is \\textit{uniform convergence}, where we have bounds of the following form:\n\\al{\n \\Pr \\left[ \\sup_{h \\in \\cH} \\vert \\hat L(h) - L(h) \\vert \\leq \\epsilon \\right] \\geq 1 - \\delta.\n\\label{lec4:eqn:uniformconvergence}\n}\nIn other words, the probability that the difference between our empirical loss and population loss is larger than $\\epsilon$ is at most $\\delta$. We give motivation for uniform convergence and show how it can give us non-asymptotic guarantees on excess risk.\n\n\\sec{Basic concepts}\n\nA central goal of learning theory is to bound the \\emph{excess risk} $L(\\hat{\\theta}) - L(\\theta^*)$. This is important as we don't want the expected risk of our ERM to be much larger than the expected risk of the best possible model. As we will see in the remainder of this section, uniform convergence is a technique that helps us achieve such bounds.\n\nUniform convergence is a property of a parameter set $\\Theta$, which gives us bounds of the form\n\\al{\n    \\Pr \\left[|\\hat{L}(\\theta) - L(\\theta)| \\geq \\varepsilon \\right] \\leq \\delta; , \\forall \\theta \\in \\Theta.\\label{lec2:eqn:uc}\n}\nIn other words, uniform convergence tells us that for any choice of $\\theta$, our empirical risk is always close to our population risk with high probability. 
Let's look at a motivating example for why this type of bound is useful.\n\n\\subsec{Motivation: Uniform convergence implies generalization}\\label{sec:uc-gen}\n\nConsider the standard supervised learning setup where we have some i.i.d. $\\{(x\\sp{i}, y\\sp{i})\\}$. Furthermore, assume that we have a bounded loss function; specifically, suppose that $0 \\leq \\ell((x, y); \\theta) \\leq 1$, as in the case of the zero-one loss function. We show that uniform convergence implies generalization.\n\nFirst, via telescoping sums, we can decompose the excess risk into three terms:\n\\al{\n    L(\\hat{\\theta}) - L(\\theta^*) = \\underbrace{L(\\hat{\\theta}) - \\hat{L}(\\hat{\\theta})}_\\text{\\circled{1}} + \\underbrace{\\hat{L}(\\hat{\\theta}) - \\hat{L}(\\theta^*)}_\\text{\\circled{2}} + \\underbrace{\\hat{L}(\\theta^*) - L(\\theta^*)}_\\text{\\circled{3}}.\n}\nWe know that $\\hat{L}(\\hat{\\theta}) - \\hat{L}(\\theta^*) \\leq 0$ since $\\hat{\\theta}$ is a minimizer of $\\hat{L}$. This allows us to write\n\\begin{align}\nL(\\hat{\\theta}) - L(\\theta^*) &\\leq |L(\\hat{\\theta}) - \\hat{L}(\\hat{\\theta})| + \\hat{L}(\\hat{\\theta}) - \\hat{L}(\\theta^*) + |\\hat{L}(\\theta^*) - L(\\theta^*)|\\\\\n&\\leq |L(\\hat{\\theta}) - \\hat{L}(\\hat{\\theta})| + 0 + |\\hat{L}(\\theta^*) - L(\\theta^*)|\\\\\n&\\leq 2\\;\\sup_{\\theta \\in \\Theta } |L(\\theta) - \\hat{L}(\\theta)|. \\label{lec2:eqn:1}\n\\end{align}\nThis result tells us that if $\\sup_{\\theta \\in \\Theta } |L(\\theta) - \\hat{L}(\\theta)|$ is small (say, less than $\\varepsilon/2$), then excess risk $L(\\hat{\\theta}) - L(\\theta^*)$ is less than $\\varepsilon$. But this is exactly in the form of the bound in \\eqref{lec2:eqn:uc}. 
Hence, if we can show that a parameter family exhibits uniform convergence, we can get a bound on excess risk as well.\n\nFor future reference, Equation~\\eqref{lec2:eqn:1} can be strengthened straightforwardly into the following with slightly more careful treatment of the signs of each term:\n\\begin{align}\nL(\\hat{\\theta}) - L(\\theta^*) \\le |\\hat{L}(\\theta^*) - L(\\theta^*)|+  L(\\hat{\\theta}) - \\hat{L}(\\hat{\\theta})  \\le |\\hat{L}(\\theta^*) - L(\\theta^*)|+ \\sup_{\\theta \\in \\Theta} \\left(L(\\theta) - \\hat{L}(\\theta)\\right)\\label{lec2:eqn:2}\n\\end{align}\nThis will make some of our future derivations technically slightly more convenient, but the nuanced difference between Equations~\\eqref{lec2:eqn:1} and~\\eqref{lec2:eqn:2} does not change the fundamental idea and the discussions in this chapter. \n\nLet us try to apply our knowledge of concentration inequalities to this problem. Earlier we assumed that $\\ell((x, y); \\theta)$ is bounded, so we can bound $\\circled{3}$ \\todo{replace $\\circled{3}$ etc by something that looks more aesthetic} \nby $\\tilO\\left(\\frac{1}{\\sqrt{n}}\\right)$ via Hoeffding's inequality (Remark \\ref{lec2:rem:hoeffding}). However, we cannot apply the same concentration inequality to $\\circled{1}$: since $\\hat{\\theta}$ is data-dependent by definition, the i.i.d. assumption no longer holds. (To see this, note that $\\hat\\theta$ depends on the training dataset $\\{(x\\sp{i}, y\\sp{i})\\}$, so the terms in $\\hatL (\\hat\\theta)$, $\\ell ((x\\sp{i}, y\\sp{i}); \\hat\\theta)$, all depend on the training dataset too.) This is concerning: it is certainly possible that $L(\\hat{\\theta}) - \\hat{L}(\\hat{\\theta})$ is large. You've probably encountered this yourself when a model exhibits low training loss, but high validation/testing loss. \n\n\\subsec{Deriving uniform convergence bounds}\n\nUniform convergence is one way we can control this issue. 
The high-level idea is as follows: \n\\begin{itemize}\n    \\item Suppose we have a bound of the form $\\Pr[|\\hat{L}(\\theta) - L(\\theta)| \\geq \\varepsilon'] \\leq \\delta'$ for some single, fixed choice of $\\theta$.\n    \\item If we know \\emph{all possible values of $\\theta$} in advance, we can use the above bound to create a more general bound over all values of $\\theta$.\n\\end{itemize}\nIn particular, we can use the union-bound inequality to create the general bound described in the second bullet point, using the bound in the first bullet point:\n\\al{\n    \\Pr \\left[\\exists \\theta \\in \\Theta, |\\hat{L}(\\theta) - L(\\theta)| \\geq \\varepsilon' \\right] \\leq \\sum_{\\theta \\in \\Theta} \\Pr \\left[|\\hat{L}(\\theta) - L(\\theta)| \\geq \\varepsilon' \\right].\n}\nWe can then use Hoeffding's inequality to deal with the summands, as each fixed $\\theta$ there is no longer data-dependent. We will talk more later about proving statements of this form.\n\n\\subsec{Intuitive interpretation of uniform convergence}\n\nSince uniform convergence implies generalization, if we know that population risk and empirical risk are always ``close,\" then excess risk is ``small\" as well (Figure \\ref{lec2:fig:uc}). In fact, it is possible to show that not only is $L(\\theta)$ ``close\" to $\\hat{L}(\\theta)$ for sufficiently large data, but that the ``shape\" of $\\hat{L}$ is ``close\" to the shape of $L$ as well (Figure \\ref{lec2:fig:shape}). This holds for the convex case; furthermore, there are conditions under which this holds in the non-convex case, for which a rigorous treatment can be found in~\\cite{mei2017landscape}. 
(\\emph{Figure design and some wording in this section were inspired by~\\cite{percynotes, thomasliu2018}.})\n\n\\begin{figure}[t]\n    \\centering\n    \\begin{subfigure}[t]{0.49\\textwidth}\n        \\hspace*{0.8em}\n        \\centering\n        \\begin{tikzpicture}[scale=0.7]\n            \\draw[help lines, color=gray!30, dashed] (-4.9,-4.9) grid (4.9,4.9);\n            \\draw[->,ultra thick] (-5,0)--(5,0) node[right]{$\\theta$};\n            \\draw[->,ultra thick] (0,-5)--(0,5) node[above]{$L$};\n            \\draw[blue, thick]   plot[smooth,domain=-5:5] (\\x, {0.1* (\\x*\\x)});\n            \\draw[red, dashed]   plot[smooth,domain=-5:5] (\\x, {0.1* (\\x*\\x) + 1});\n            \\draw[red, dashed]   plot[smooth,domain=-5:5] (\\x, {0.1* (\\x*\\x) - 1});\n            \\draw[green, thick]   plot[smooth,domain=-5:5] (\\x, {0.1* (\\x*\\x) + 0.8 * sin(3000 * \\x)});\n        \\end{tikzpicture}\n        \\caption{}\n        \\label{lec2:fig:uc}\n    \\end{subfigure}\n    \\hfill\n    \\begin{subfigure}[t]{0.49\\textwidth}\n        \\hspace*{0.8em}\n        \\centering\n        \\begin{tikzpicture}[scale=0.7]\n            \\draw[help lines, color=gray!30, dashed] (-4.9,-4.9) grid (4.9,4.9);\n            \\draw[->,ultra thick] (-5,0)--(5,0) node[right]{$\\theta$};\n            \\draw[->,ultra thick] (0,-5)--(0,5) node[above]{$L$};\n            \\draw[blue, thick]   plot[smooth,domain=-5:5] (\\x, {0.01*\\x*\\x*\\x*\\x - 0.3*\\x*\\x + 2});\n            \\draw[green, thick]   plot[smooth,domain=-5:5] (\\x, {0.011*\\x*\\x*\\x*\\x -0.003*\\x*\\x*\\x - 0.32*\\x*\\x + 2.15});\n            \\draw[red, dashed]   plot[smooth,domain=-5:5] (\\x, {0.011*\\x*\\x*\\x*\\x -0.003*\\x*\\x*\\x - 0.32*\\x*\\x + 3.05});\n            \\draw[red, dashed]   plot[smooth,domain=-5:5] (\\x, {0.011*\\x*\\x*\\x*\\x -0.003*\\x*\\x*\\x - 0.32*\\x*\\x + 1.25});\n        \\end{tikzpicture}\n        \\caption{}\n        \\label{lec2:fig:shape}\n    \\end{subfigure}\n    \\caption{These curves 
demonstrate how we apply uniform convergence to bound the population risk. The \\textcolor{blue}{blue} curves are the unobserved population risk we aim to bound. The \\textcolor{green}{green} curves denote the empirical risk we observe. Though this curve is often depicted as the fluctuating curve used in Figure~\\ref{lec2:fig:uc}, it is more often a smooth curve whose shape mimics that of the population risk (Figure~\\ref{lec2:fig:shape}). Uniform convergence allows us to construct additive error bounds for the excess risk, which are depicted using the \\textcolor{red}{red, dashed} lines.}\n    \\label{lec2:fig:uc_shape}\n\\end{figure}\n\n\\sec{Finite hypothesis class}\n\nIn this section, assume that $\\cH$ is finite. The following theorem gives a bound for the excess risk $L(\\hat{h}) - L(h^{*})$, where $\\hat{h}$ and $h^*$ are the minimizers of the empirical loss and population loss, respectively.\n\n\\begin{theorem}\\label{lec4:thm:finite}\nSuppose that our hypothesis class $\\cH$ is finite and that our loss function $\\ell$ is bounded in $[0,1]$, i.e. $0 \\leq \\ell((x, y), h) \\leq 1$. Then $\\forall \\delta \\  s.t. \\  0 < \\delta < \\frac{1}{2}$ , with probability at least $1 - \\delta$, we have \n\\al {\n\\vert L(h) - \\hat{L}(h) \\vert \\leq \\sqrt{\\frac{\\ln{\\vert \\cH \\vert} + \\ln{(2 / \\delta)}}{2n}} \\qquad \\forall h \\in \\cH.\n\\label{lec4:eqn:finiteuniformbound}\n}\nAs a corollary, we also have \n\\al {\nL(\\hat{h}) - L(h^{*}) \\leq \\sqrt{\\frac{ 2(\\ln{\\vert \\cH \\vert} + \\ln{(2 / \\delta)}) }{n}}.\n\\label{lec4:eqn:finiteexcessriskbound}\n}\n\\end{theorem}\n\n\\begin{proof}\nWe will prove this in two steps:\n\\begin{enumerate}\n\\item Use concentration inequalities to prove the bound for a fixed $h \\in \\cH$, then\n\\item Use a union bound across the $h$'s. 
(Recall that if $E_1, \\dots, E_k$ are a finite set of events, then the union bound states that $\\Pr ( E_1 \\cup \\dots \\cup E_k) \\leq \\sum_{i = 1}^k \\Pr(E_i)$.)\n\\end{enumerate}\n\nFix some $\\epsilon > 0$. By applying Hoeffding's inequality on the $\\ell( (x\\sp{i}, y\\sp{i}), h)$, we know that \n\n\\al{\n\\Pr \\left( \\vert \\hat{L}(h) - L(h) \\vert \\geq \\epsilon \\right) &\\leq 2\\exp\\left(-\\frac{2n^2\\epsilon^2}{\\sum_{i = 1}^n(b_i - a_i)^2}\\right) \\\\\n&= 2\\exp\\left(-\\frac{2n^2\\epsilon^2}{n}\\right) \\\\\n&= 2\\exp(-2n\\epsilon^2),\n\\label{lec4:eqn:boundedconcentration}\n}\nsince we can set $a_i = 0, b_i = 1$. The bound above holds for a single fixed $h$. To prove a similar inequality that holds for all $h \\in \\cH$, we apply the union bound with $E_h = \\{\\vert \\hat{L}(h) - L(h) \\vert \\geq \\epsilon \\}$:\n\\al{\n\\Pr \\left( \\exists h \\text{ s.t. } \\vert \\hat{L}(h) - L(h) \\vert \\geq \\epsilon \\right) &\\leq \\sum_{h \\in \\cH} \\Pr \\left(\\vert \\hat{L}(h) - L(h) \\vert \\geq \\epsilon \\right) \\\\\n&\\leq \\sum_{h \\in \\cH} 2\\exp(-2n\\epsilon^2) \\\\\n&= 2\\vert \\cH \\vert \\exp(-2n\\epsilon^2). \n\\label{lec4:eqn:unionboundforh}\n}\nIf we take $\\delta$ such that $2\\vert \\cH \\vert \\exp(-2n\\epsilon^2) = \\delta$, then it follows that \n\\al {\n\\epsilon = \\sqrt{\\frac{\\ln{\\vert \\cH \\vert} + \\ln{(2 / \\delta)}}{2n}},\n\\label{lec4:eqn:probabilitytoerror}\n}\nwhich proves \\eqref{lec4:eqn:finiteuniformbound}. 
\\eqref{lec4:eqn:finiteexcessriskbound} follows by the inequality we stated in Section \\ref{sec:uc-gen}, and taking \n\\begin{align}\n    \\epsilon = \\sqrt{\\frac{ 2(\\ln{\\vert \\cH \\vert} + \\ln{(2 / \\delta)}) }{n}},\n\\end{align}\nwe have that\n\\begin{align}\n\\Pr \\left( | L(\\hat{h}) - L(h^{*}) | \\geq \\epsilon \\right) &\\leq \\Pr \\left( 2 \\sup_{h \\in \\cH} | \\hat{L}(h) - L(h) | \\geq \\epsilon \\right) \\\\\n&\\leq 2 |\\cH| \\exp \\left( -\\frac{n\\epsilon^2}{2} \\right).\n\\end{align}\n\\end{proof}\n\n\\subsec{Comparing Theorem \\ref{lec4:thm:finite} with standard concentration inequalities}\nWith standard concentration inequalities, we have the following bound that depends on empirical risk:\n\\al{\n\\forall h \\in \\cH, \\quad w.h.p. \\quad \\vert \\hat{L}(h) - L(h) \\vert \\leq \\tilde{O} \\left( \\frac{1}{\\sqrt{n}} \\right).\n\\label{lec4:eqn:centrallimitconvergence}\n}\nThe bound here depends on each $h$. In contrast, the uniform convergence bound we obtain from \\eqref{lec4:eqn:probabilitytoerror} is uniform over all $h \\in \\cH$:\n\\al{\nw.h.p., \\quad \\forall h \\in \\cH, \\quad \\vert \\hat{L}(h) - L(h) \\vert \\leq \\tilde{O} \\left( \\frac{ \\ln |\\cH|}{\\sqrt{n}} \\right),\n}\nif we omit the $\\ln{(1/\\delta)}$ factor (we can do this since $\\ln{(1/\\delta)}$ is small in general and we take $\\delta = \\frac{1}{poly(n)}$). Hence, the extra $\\ln{\\vert \\cH \\vert}$ term that depends on the size of our finite hypothesis family $\\cH$ can be viewed as a trade-off in order to make the bound uniform.\n\n\\begin{remark}\nThere is no standard definition for the term \\textit{with high probability} (\\textit{w.h.p}). 
For this class, the term is equivalent to the condition that the probability is higher than $1 - n^{-c}$ for some constant $c$.\n\\end{remark}\n\n\\subsec{Comparing Theorem \\ref{lec4:thm:finite} with asymptotic bounds}\nWe can also compare the bound in Theorem \\ref{lec4:thm:finite} with our original asymptotic bound, namely,\n\\al{\nL(\\hat{h}) - L(h^*) \\leq \\frac{c}{n} + o \\left(n^{-1} \\right).\n\\label{lec4:eqn:asymptotics}\n}\nThe $o(n^{-1})$ term can vary significantly depending on the problem. For instance, both $n^{-2}$ and $p^{100}n^{-2}$ are $o(n^{-1})$ but the second one converges much more slowly. With the new bound, there are no longer any constants hidden in an $o(n^{-1})$ term (in fact that term is no longer there). However, we now have a slower convergence rate of $O(n^{-1/2})$.\n\n\\begin{remark}\n$O(n^{-1/2})$ convergence is sometimes known as the \\textit{slow rate} while $O(n^{-1})$ convergence is known as the \\textit{fast rate}. We were only able to get the slow rate from uniform convergence: we needed asymptotics to get the fast rate. (It is possible to get the fast rate from uniform convergence under certain conditions, e.g. when the population risk on the true $h^*$ is very low.)\n\\end{remark}\n\n\\sec{Bounds for infinite hypothesis class via discretization}\nUnfortunately, we cannot generalize the results from the previous section directly to the case where the hypothesis class $\\cH$ is infinite, since we cannot apply the union bound to an infinite number of hypothesis functions $h \\in \\cH$. However, if we consider a \\emph{bounded} and \\emph{continuous} parameterized space of $\\cH$, then we can obtain a similar uniform bound by applying a technique called \\emph{brute-force discretization}.\n\nFor this section, assume that our infinite hypothesis class $\\cH$ can be parameterized by $\\theta \\in \\mathbb{R}^p$ with $\\Vert \\theta \\Vert_2 \\leq B$ for some fixed $B > 0$. 
That is, we have \n\\al{\n\\cH = \\{h_{\\theta} : \\theta \\in \\mathbb{R}, \\Vert \\theta \\Vert_2 \\leq B \\}.\n\\label{lec4:eqn:infiniteclass}\n}\n\nThe intuition behind brute-force discretization is as follows: Let $E_\\theta = \\{ |\\hatL(\\theta) - L(\\theta)| \\geq \\epsilon \\}$ be the ``bad\" events. We want the bound the probability of any one of these bad events happening (i.e. $\\bigcup_\\theta E_\\theta$). The union bound does not work as we end up with an infinite sum. However, the union bound is very loose: these events can overlap with each other significantly. Instead, we can try to find ``prototypical\" bad events $E_{\\theta_1}, \\dots, E_{\\theta_N}$ that are somewhat disjoint so that $\\bigcup_\\theta E_\\theta \\approx \\bigcup_{i=1}^N E_{\\theta_i}$. We can then use the union bound on $\\bigcup_{i=1}^N E_{\\theta_i}$ to get a non-vacuous upper bound.\n\nWe make these ideas precise in the following section.\n\n\\subsec{Discretization of the parameter space by \\texorpdfstring{$\\epsilon$}{epsilon}-covers}\n\nWe start by defining the notion of an \\emph{$\\epsilon$-cover} (also \\textit{$\\epsilon$-net}):\n\n\\begin{definition}[$\\epsilon$-cover]\nLet $\\epsilon>0$. An \\emph{$\\epsilon$-cover} of a set $S$ with respect to a distance metric $\\rho$ is a subset $C \\subseteq S$ such that $\\forall x \\in S$, $\\exists x' \\in C$ such that $\\rho(x,x') \\le \\epsilon$, or equivalently,\n\\begin{align}\nS &\\subseteq \\bigcup_{x \\in C} \\mathrm{Ball}(x, \\epsilon, \\rho), \\quad \\text{where} \\\\\n\\mathrm{Ball}(x, \\epsilon, \\rho) &\\triangleq \\{ x': \\rho(x, x') \\leq \\epsilon \\}.\n\\end{align}\n\\end{definition}\n\n(We note that in some definitions it is possible for points in $C$ to lie outside of $S$; we do not worry about this technicality in this class.) 
The following lemma tells us that our parameter space $S = \\{\\theta \\in \\R^p: \\|\\theta\\|_2 \\le B\\}$ has an $\\epsilon$-cover with not too many elements:\n\n\\begin{lemma}[$\\epsilon$-cover of $\\ell_2$ ball]\\label{lec4:lem:ECSize}\nLet $B,\\epsilon>0$, and let $S = \\{x \\in \\R^p: \\|x\\|_2 \\le B\\}.$ Then there exists an $\\epsilon$-cover of $S$ with respect to the $\\ell_2$-norm with at most $\\max \\left (\\left(\\frac{3B\\sqrt{p}}{\\epsilon}\\right)^p, 1 \\right)$  elements.\n\\end{lemma}\n\n\\begin{proof}\nNote that if $\\epsilon > B\\sqrt{p}$, then $S$ is trivially contained in the ball centered at the origin with radius $\\epsilon$ and the $\\epsilon$-cover has size 1. Assume $\\epsilon \\leq B \\sqrt{p}$. Set\n\\begin{equation}\nC = \\left\\{ x \\in S: x_i = k_i \\frac{\\epsilon}{\\sqrt{p}}, k_i \\in \\mathbb{Z}, |k_i| \\leq  \\frac{B\\sqrt{p}}{\\epsilon}  \\right\\},\n\\end{equation}\ni.e. $C$ is the set of grid points in $\\R^p$ of width $\\tfrac{\\epsilon}{\\sqrt{p}}$ that are contained in $S$. See Figure \\ref{lec5:fig:ecover} for an illustration. \n\\begin{figure}[ht]\n\\centerline{\\includegraphics[width=3in]{figures/ECover.png}}\n\\caption[lec5:fig:ecover]{The $\\epsilon$-cover (shown in red) of $S$ that we construct in the proof of Lemma~\\ref{lec4:lem:ECSize}. For $x \\in S$, we choose the grid point $x'$ such that $\\norm{x-x'}_2 \\le \\epsilon$.}\n\\label{lec5:fig:ecover}\n\\end{figure}\n\nWe claim that $C$ is an $\\epsilon$-cover of $S$ with respect to the $\\ell_2$-norm: $\\forall x \\in S$, there exists a grid point $x' \\in C$ such that $|x_i-x_i'| \\le \\tfrac{\\epsilon}{\\sqrt{p}}$ for each $i$. Therefore,\n$$\\norm{x-x'}_2 = \\sqrt{\\sum_{i = 1}^p |x_i - x_i'|^2} \\leq \\sqrt{p\\cdot \\frac{\\epsilon^2}{p}} = \\epsilon.$$\n\nWe now bound the size of $C$. 
Since each $k_i$ in the definition of $C$ has at most $2\\tfrac{B\\sqrt{p}}{\\epsilon}+1$ choices, we have \n\\begin{equation}\n|C| \\le \\left( \\frac{2B\\sqrt{p}}{\\epsilon} +1\\right)^p \\le \\left(\\frac{3B\\sqrt{p}}{\\epsilon}\\right)^p.\n\\end{equation}\n\\end{proof}\n\n\\begin{remark}\\label{lec4:rem:enet}\nWe can actually prove a stronger version of Lemma \\ref{lec4:lem:ECSize}: there exists an $\\epsilon$-cover of $S$ with at most $\\left(\\frac{3B}{\\epsilon}\\right)^p$ elements. We will be using this version of the lemma in the proof below. (We will leave the proof of this stronger version as a homework exercise.)\n\\end{remark}\n\n\\subsec{Uniform convergence bound for infinite \\texorpdfstring{$\\cH$}{H}}\n\n\\begin{definition}[$\\kappa$-Lipschitz functions]\nLet $\\kappa \\ge 0$ and $\\norm{\\cdot}$ be a norm on the domain $D$. A function $L:D \\to \\R$ is said to be \\emph{$\\kappa$-Lipschitz} with respect to $\\norm{\\cdot}$ if for all $\\theta, \\theta' \\in D$, we have\n$$\n    |L(\\theta)-L(\\theta')| \\le \\kappa \\norm{\\theta-\\theta'}.\n$$\n\\end{definition}\n\nAssume that our infinite hypothesis class $\\cH$ can be parameterized by $\\cH = \\{h_{\\theta} : \\theta \\in \\mathbb{R}, \\Vert \\theta \\Vert_2 \\leq B\\}$. We have the following uniform convergence theorem for our infinite hypothesis class $\\cH$:\n\n\\begin{theorem}\\label{lec4:thm:main}\nSuppose $\\ell((x,y), \\theta) \\in [0,1]$, and $\\ell((x,y), \\theta)$ is $\\kappa$-Lipschitz in $\\theta$ with respect to the $\\ell_2$-norm for all $(x, y)$. Then, with probability  at least $1-O(\\exp(-\\Omega(p)))$, we have\n\\begin{equation}\n    \\forall \\theta, \\quad |\\hat L(\\theta)- L(\\theta)| \\leq  O\\left(\\sqrt{\\frac{p \\max(\\ln{(\\kappa Bn), 1)}}{n}}\\right).\n\\end{equation}\n\\end{theorem}\n\n\\begin{proof}[Proof of Theorem \\ref{lec4:thm:main}]\nFix parameters $\\delta, \\epsilon>0$ (we will specify their values later). 
Let $C$ be the $\\epsilon$-cover of our parameter space $S$ with respect to the $\\ell_2$-norm constructed in Lemma \\ref{lec4:lem:ECSize}. Define event $E = \\left\\{ \\forall \\theta \\in C, \\; |\\hat L(\\theta) - L(\\theta)| \\le \\delta \\right\\}$. By Theorem 4.1, we have $\\Pr (E) \\ge 1 - 2|C|\\exp(-2n\\delta^2)$.\n\nNow for any $\\theta \\in S$, we can pick some $\\theta_0 \\in C$ such that $\\norm{\\theta-\\theta_0}_2 \\le \\epsilon$. Since $L$ and $\\hatL$ are $\\kappa$-Lipschitz functions (this follows from the Lipschitzness of $\\ell$), we have\n\\begin{align}\n|L(\\theta) - L(\\theta_0)| &\\le \\kappa \\norm{\\theta-\\theta_0}_2 \\le \\kappa \\epsilon, \\text{ and} \\\\\n|\\hat L(\\theta) - \\hat L(\\theta_0)| &\\le \\kappa \\norm{\\theta-\\theta_0}_2 \\le \\kappa \\epsilon.\n\\end{align}\n\nTherefore, conditional on $E$, we have\n\\begin{equation}\n    |\\hat L(\\theta) -  L(\\theta)| \\le |\\hat L(\\theta)-\\hat L(\\theta_0)| + |\\hat L(\\theta_0) -  L(\\theta_0)| + | L(\\theta_0) - L(\\theta)| \\le 2 \\kappa\\epsilon+\\delta.\n\\end{equation}\n\nIt remains to choose suitable parameters $\\delta$ and $\\epsilon$ to get the desired bound in Theorem \\ref{lec4:thm:main} while making the failure probability small. First, set $\\epsilon = \\delta / (2 \\kappa)$ so that conditional on $E$,\n\\begin{equation} \\label{lec4:eqn:triangle}\n    |\\hat L(\\theta) -  L(\\theta)| \\le 2\\delta.\n\\end{equation}\n\nTo choose the correct $\\delta$, we must reason about the probability of $E$ under different choices of the parameter. The event $E$ happens with probability $1 - 2|C|\\exp(-2n\\delta^2) = 1 - 2 \\exp(\\ln{|C|} - 2n\\delta^2)$. From Remark \\ref{lec4:rem:enet}, we know that $\\ln{|C|} \\leq p \\ln{ (3B / (\\delta / 2)) }$. 
If we ignore the log term and assume $\\ln{|C|} \\leq p$, then choosing $\\delta = \\sqrt{p/n}$ would give us the high probability bound we want:\n\\al{\n   2|C| \\exp(-2n\\delta^2)  = 2\\exp(\\ln{\\vert C \\vert} - 2n\\delta^2) \\leq 2\\exp(p - 2p) = 2\\exp(-p).\n}\n(At the same time, we see from \\eqref{lec4:eqn:triangle} that this choice of $\\delta$ gives $|\\hat L(\\theta)- L(\\theta)| \\le 2 \\sqrt{\\frac{p}{n}}$, which is roughly the bound we want.)\n\nSince we cannot actually drop the log term in the inequality $\\ln{|C|} \\leq p \\ln{ (6B\\kappa / \\delta) }$, we need to make $\\delta$ a little bit bigger.
It is not a problem because we can always choose $\\epsilon$ small enough without worrying about the growth of the first term $\\sqrt{\\frac{\\ln{\\vert C \\vert}}{n}}$. This in turn is because $\\ln{\\vert C \\vert} \\approx p\\ln{\\epsilon^{-1}}$, which is very insensitive to $\\epsilon$, even if we let $\\epsilon = \\frac{1}{poly(n)}$. We also observe that both $\\sqrt{\\frac{\\ln{\\vert C \\vert}}{n}}$ and $\\sqrt{\\frac{p}{n}}$ are bounds that depend on the ``size\" of our hypothesis class, in terms of either its total size or dimensionality. This possibly explains why one may need more training samples when the hypothesis class is larger.\n\\end{remark}\n"
  },
  {
    "path": "tex/collection/04-02-uniform.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{5}{Will Song}{Jan 27th, 2021}\n\n\\sec{Rademacher complexity}\n\n\\subsec{Motivation for a new complexity measure}\n\nRecall that our goal is to bound the \\textit{excess risk} $L(\\hat{h}) - L(h^*)$, where $L$ is the expected loss (or population loss), $\\hat{h}$ is our estimated hypothesis and $h^*$ is the hypothesis in the hypothesis class $\\cH$ which minimizes the expected loss. We previously showed that to do so, it suffices to upper bound $\\sup_{h\\in \\cH} (L (h) - \\hatL(h))$. (Note: we often call $L(\\hat{h}) - \\hatL(\\hat{h})$ the \\textit{generalization gap} or \\textit{generalization error}.)\n\nIn the previous sections, we derived bounds for the generalization gap in two cases:\n\\begin{enumerate}\n\t\\item If the hypothesis class $\\cH$ is finite,\n\t\\begin{equation}\\label{lec5:eqn:bound-finite}\n\tL(\\hat h) - \\hat L(\\hat h) \\leq \\tilde O \\l( \\sqrt{\\frac{\\log |\\cH|}{n}} \\r).\n\t\\end{equation}\n\t\\item If the hypothesis class $\\cH$ is $p$-dimensional,\n\t\\begin{equation}\\label{lec5:eqn:bound-p}\n\tL(\\hat h) - \\hat L(\\hat h) \\leq \\tilde O \\l( \\sqrt{\\frac{p}{n}} \\r).\n\t\\end{equation}\n\\end{enumerate} \nBoth of these bounds have a $\\frac{1}{\\sqrt{n}}$-dependency on $n$, which is known as the ``slow rate\". The terms in the numerator ($\\log |\\cH|$ and $p$ resp.) can be thought of as complexity measures of $\\cH$.\n\nThe bound \\eqref{lec5:eqn:bound-p} is not precise enough: it depends solely on $p$ and is not always optimal. For example, this would be a poor bound if the hypothesis class $\\cH$ has very high dimension but small norm. 
One specific example is for the following two hypothesis classes:\n$$ \\{\\theta : \\|\\theta\\|_1 \\leq B\\} \\qquad \\text{vs.} \\qquad \\{\\theta : \\|\\theta\\|_2 \\leq B\\},$$\n\\eqref{lec5:eqn:bound-p} would give both hypothesis classes the same bound of $\\tilde O \\l( \\sqrt{\\frac{p}{n}} \\r)$. Intuitively, we should take into account the norms to prove a better bound.\n\nWith the complexity measure to be introduced, we will prove a bound of the form\n\\begin{align}\n    L(\\hat h) - \\hat L(\\hat h) \\leq \\tilde O\\l(\\sqrt{\\frac{\\text{Complexity}(\\Theta)}{n}}\\r).\n\\end{align}\n\nThis complexity measure will depend on the distribution $P$ over $\\cX \\times \\cY$ (the input and output spaces), and hence takes into account how easy it is to learn $P$. If $P$ is easy to learn, then this complexity measure will be small even if the hypothesis space is big.\n\nOne of the practical implications of having such a complexity measure is that we can restrict the hypothesis space by regularizing the complexity measure (assuming it is something we can evaluate and train with). If we successfully find a low complexity model, then this generalization bound guarantees that we have not overfit.\n\n\\subsec{Definitions}\n\nIn uniform convergence, we sought a high probability bound for $\\sup_{h \\in H}(L(h) - \\hat L (h))$. Here we have a weaker goal: we try to obtain an upper bound for its expectation instead, i.e.\n\\begin{equation}\n\\Exp\\l[ \\sup_{h \\in H}(L(h) - \\hat L (h)) \\r] \\leq \\text{ upper bound}. \\label{lec5:eq:generror}\n\\end{equation}\nThe expectation is over the randomness in the training data $\\{(x^{(i)}, y^{(i)})\\}_{i=1}^n$.\\footnote{Though we might like to pull the $\\sup$ outside of the $\\Exp$ operator, and bound the expectation of the excess risk (a far simpler quantity to deal with!), in general, the $\\sup$ and $\\Exp$ operators do not commute. 
In particular, $\\Exp\\left [\\sup_{h \\in \\cH} (L(h) - \\hat{L}(h)) \\right ] \\geq \\sup_{h \\in \\cH} \\Exp \\left[ L(h) - \\hat{L} (h) \\right]$.}\n\nTo do so, we first define \\textit{Rademacher complexity}.\n\n\\begin{definition}[Rademacher complexity] \\label{lec5:dfn:rc}\nLet $\\cF$ be a family of functions mapping $Z \\mapsto \\bbR$, and let $P$ be a distribution over $Z$. The \\textit{(average) Rademacher complexity} of $\\cF$ is defined as \n\\begin{align}\n    R_n(F) \\triangleq \\Exp_{z_1, \\dots, z_n \\iid P} \\l[ \n    \\Exp_{\\sigma_1, \\dots, \\sigma_n \\iid\\{ \\pm 1 \\}} \\l[ \\sup_{f\\in F} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i) \\r] \\r], \\label{lec5:eqn:Rn}\n\\end{align}\nwhere $\\sigma_1, \\dots, \\sigma_n$ are independent \\textit{Rademacher random variables}, i.e. each taking on the value of $1$ or $-1$ with probability $1/2$.\n\\end{definition}\n\n\\begin{remark}\nFor applications to empirical risk minimization, we will take $\\cZ = \\cX \\times \\cY$. However, Definition \\ref{lec5:dfn:rc} holds for abstract input spaces $\\cZ$ as well.\n\\end{remark}\n\n\\begin{remark}\nNote that $R_n(\\cF)$ is also dependent on the measure $P$ of the space, so technically it should be $R_{n,P}(\\cF)$, but for brevity, we refer to it as $R_n(\\cF)$.\n\\end{remark}\n\nAn interpretation is that $R_n(\\cF)$ is the maximal possible correlation between outputs of some $f \\in \\cF$ (on points $f(z_1), \\dots, f(z_n)$) and random Rademacher variables $ (\\sigma_1, \\dots, \\sigma_n).$ Essentially, functions with more random sign outputs will better match random patterns of Rademacher variables and have higher complexity (greater ability to mimic or express randomness).\n\nThe following theorem is the main theorem involving Rademacher complexity:\n\n\\begin{theorem} \\label{lec5:thm:thm1}\n    \\begin{align}\n       \\Exp_{z_1, \\dots, z_n \\iid P} \\l[ \\sup_{f\\in F} \\l[ \\frac{1}{n} \\sum^n_{i=1} f(z_i) -  \\Exp_{z\\sim P} [f(z)] \\r]\\r] \\leq 2 
R_n(\\cF). \\label{lec5:eqn:thm1}\n    \\end{align}\n\\end{theorem}\n\n\\begin{remark}\nWe can think of $\\frac{1}{n} \\sum^n_{i=1} f(z_i)$ as an empirical average and $\\Exp_{z\\sim P} [f(z)]$ as a population average.\n\\end{remark}\n\\noindent\\textit{Why is Theorem \\ref{lec5:thm:thm1} useful to us?} We can set $\\cF$ to be the family of loss functions, i.e.\n\\begin{equation}\n\\cF = \\l\\{ z = (x,y) \\in \\cZ \\mapsto \\ell((x,y),h) \\in \\bbR : h \\in \\cH \\r\\}.\n\\end{equation} \nThis is the family of losses induced by the hypothesis functions in $\\cH$. We also define the function class $-\\cF$ as $\\{-f : f \\in \\cF\\}$. It should be obvious from this definition that $R_n(\\cF) = R_n(-\\cF)$ since $\\sigma_i \\stackrel{d}{=} -\\sigma_i$ for all $i$. Then, letting $z_i = (x^{(i)}, y^{(i)})$,\n\\begin{align}\n    \\Exp\\l[ \\sup_{h \\in \\cH}\\l( L(h) - \\hat L (h) \\r) \\r] &= \\Exp_{\\{(x^{(i)}, y^{(i)})\\}} \\l[ \\sup_{h \\in \\cH} \\l[L(h) - \\frac{1}{n} \\sum^n_{i=1} \\ell((x^{(i)}, y^{(i)}), h) \\r] \\r] \\\\\n    &= \\Exp_{\\{z_i\\}} \\l[\\sup_{f \\in \\cF} \\l(\\Exp[f(z)] - \\frac{1}{n} \\sum^n_{i=1} f(z_i) \\r)\\r] \\\\\n    &= \\Exp_{\\{z_i\\}} \\l[\\sup_{f \\in -\\cF} \\l(\\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp[f(z)] \\r)\\r] \\\\\n    &\\leq 2 R_n(-\\cF) = 2R_n(\\cF)\n\\end{align}\nwhere the last step follows by Theorem \\ref{lec5:thm:thm1}. \n\nThus, $2R_n(\\cF)$ is an upper bound for the generalization error. In this context, $R_n(\\cF)$ can be interpreted as how well the loss sequence $\\ell((x^{(1)}, y^{(1)}), h), \\dots \\ell((x^{(n)}, y^{(n)}), h)$ correlates with $\\sigma_1, \\dots, \\sigma_n$.\n\\begin{example}\nConsider the binary classification setting where $y \\in \\{\\pm 1\\}$. Let $\\ell_{0-1}$ denote the zero-one loss function. 
Note that\n\\begin{equation}\\label{lec5:eqn:01}\n    \\ell_{0-1}((x,y), h) = \\mathbf{1}\\{h(x) \\neq y\\} = \\frac{1-yh(x)}{2}.\n\\end{equation}\n\nHence,\n\\begin{align}\n    R_n(\\cF) &= \\Exp_{\\{(x^{(i)}, y^{(i)})\\}, \\sigma_i} \\l[ \\sup_{h \\in \\cH} \\frac{1}{n}\\sum^n_{i=1} \\ell_{0-1}((x^{(i)}, y^{(i)}),h)\\sigma_i \\r] &(\\text{by definition}) \\\\\n    &= \\Exp_{\\{(x^{(i)}, y^{(i)})\\}, \\sigma_i} \\l[ \\sup_{h \\in \\cH} \\frac{1}{n}\\sum^n_{i=1} \\l(\\frac{-h(x^{(i)})y^{(i)}+1}{2}\\r)\\sigma_i \\r] &(\\text{by } \\eqref{lec5:eqn:01}) \\\\\n    &= \\frac{1}{2} \\Exp_{\\{(x^{(i)}, y^{(i)})\\}, \\sigma_i} \\l[ \\frac{1}{n}\\sum^n_{i=1}\\sigma_i + \\sup_{h \\in \\cH} \\frac{1}{n}\\sum^n_{i=1} -h(x^{(i)})y^{(i)}\\sigma_i \\r] &(\\sup \\text{only over } \\cH) \\\\\n    &= \\frac{1}{2} \\Exp_{\\{(x^{(i)}, y^{(i)})\\}, \\sigma_i} \\l[\\sup_{h \\in \\cH} \\frac{1}{n}\\sum^n_{i=1} -h(x^{(i)})y^{(i)}\\sigma_i \\r] &(\\Exp [\\sigma_i] = 0) \\\\\n    &=\\frac{1}{2} \\Exp_{\\{(x^{(i)}, y^{(i)})\\}, \\sigma_i} \\l[\\sup_{h \\in \\cH} \\frac{1}{n}\\sum^n_{i=1} h(x^{(i)})\\sigma_i \\r] &(-y_i \\sigma_i \\stackrel{d}{=} \\sigma_i) \\\\\n    &= \\frac{1}{2}R_n(\\cH). &(\\text{by definition})\n\\end{align}\n\nIn this setting, $R_n(\\cF)$ and $R_n(\\cH)$ are the same (except for the factor of 2). $R_n(\\cH)$ has a slightly more intuitive interpretation: it represents how well $h \\in \\cH$ can fit random patterns.\n\n\\textbf{Warning!} $R_n(\\cF)$ is not always the same as $R_n(\\cH)$ in other problems.\n\\end{example}\n\n\\begin{remark}\nRademacher complexity is invariant to translation. 
This property manifests in the previous example when the $+1$ in the $\\l(\\frac{-h(x^{(i)})y^{(i)}+1}{2}\\r)$ term essentially vanishes in the computation.\n\\end{remark}\n\nLet us now prove Theorem \\ref{lec5:thm:thm1}.\n\n\\begin{proof}[Proof of Theorem \\ref{lec5:thm:thm1}]\nWe use a technique called \\textit{symmetrization}, which is a very important technique in probability theory. We first fix $z_1, \\dots, z_n$and draw $ z_1', \\dots z_n' \\iid P$. Then we can rewrite the term in the expectation on the LHS of \\eqref{lec5:eqn:thm1}:\n\\begin{align}\n    \\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp[f] \\r) &= \\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp_{z_1',\\dots, z_n'} \\l[ \\frac{1}{n} \\sum^n_{i=1} f(z_i')\\r] \\r) \\\\\n    &= \\sup_{f \\in \\cF} \\l( \\Exp_{z_1',\\dots, z_n'} \\l[\\frac{1}{n} \\sum^n_{i=1} f(z_i) -  \\frac{1}{n} \\sum^n_{i=1} f(z_i')\\r] \\r)\\\\\n    &\\leq \\Exp_{z_1',\\dots, z_n'} \\l[\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} f(z_i) -  \\frac{1}{n} \\sum^n_{i=1} f(z_i')\\r)\\r]. 
\\label{lec5:eqn:thm1-pf1}\n\\end{align}\n\nThe last inequality is because in general,\n\\begin{align}\n    \\sup_u \\l(\\Exp_v[g(u,v)]\\r) \\leq \\sup_u \\l( \\Exp_v\\l[\\sup_{u'}(g(u',v))\\r]\\r) = \\Exp_v \\l[\\sup_u (g(u,v))\\r]\n\\end{align}\nsince the $\\sup$ over $u$ becomes vacuous after we replace $u$ with $u'$.\n\nNow, if we take the expectation over $z_1, \\dots, z_n$ for both sides of \\eqref{lec5:eqn:thm1-pf1},\n\\begin{align}\n    \\Exp_{z_1, \\dots, z_n} \\l[\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp[f] \\r) \\r] \n    &\\leq \\Exp_{z_i} \\l[ \\Exp_{z_i'} \\l[\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} \\l(f(z_i) -  f(z_i')\\r)\\r)\\r]\\r]\\\\\n    &= \\Exp_{z_i,z_i'} \\l[ \\Exp_{\\sigma_i} \\l[\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} \\sigma_i\\l(f(z_i) -  f(z_i')\\r)\\r)\\r]\\r] \\label{lec5:eqn:thm1-pf2} \\\\\n &\\leq \\Exp_{z_i,z_i', \\sigma_i} \\l[\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i)\\r)+\\sup_{f \\in \\cF} \\l( \\frac{1}{n} \\sum^n_{i=1} -\\sigma_i f(z_i')\\r)\\r] \\\\\n    &= 2R_n(\\cF),\n\\end{align}\nwhere \\eqref{lec5:eqn:thm1-pf2} is because $\\sigma_i(f(z_i) - f(z_i')) \\stackrel{d}{=} f(z_i) - f(z_i')$ since $f(z_i) - f(z_i')$ has a symmetric distribution. The last equality holds since $-\\sigma_i \\overset{d}{=} \\sigma_i$ and $z_i, z_i'$ are drawn iid from the same distribution. \n\\end{proof}\n\nHere is an intuitive understanding of what Theorem \\ref{lec5:thm:thm1} achieves. Consider the quantities on the LHS and RHS of \\eqref{lec5:eqn:thm1}:\n\\begin{align*}\n    \\sup_{f\\in \\cF} \\l(\\frac{1}{n} \\sum_{i=1}^n f(z_i) - \\Exp[f(z)]\\r) \\qquad \\text{vs.} \\qquad \\sup_{f\\in \\cF} \\l(\\frac{1}{n} \\sum_{i=1}^n \\sigma_i f(z_i)\\r).\n\\end{align*}\nFirst, we removed $\\Exp[f(z)]$, which is hard to control quantitatively since it is deterministic. Second, we added more randomness in the form of Rademacher variables. 
This will allow us to shift our focus from the randomness in the $z_i$'s to the randomness in the $\\sigma_i$'s. In the future, our bounds on the Rademacher complexity will typically only depend on the randomness from the $\\sigma_i$'s.\n\n\\subsec{Dependence of Rademacher complexity on \\texorpdfstring{$P$}{P}}\nFor intuition on how Rademacher complexity depends on the distribution $P$, consider the extreme example where $P$ is a point mass, i.e. $z = z_0$ almost surely. Assume that $-1 \\leq f(z_0) \\leq 1$ for all $f \\in \\cF$. Then\n\\begin{align}\n    \\Exp_{z_1, \\dots, z_n \\sim P} \\l[ \\sup_{f \\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i)\\r]\n    &= \\Exp_{\\sigma_1, \\dots, \\sigma_n} \\l[ \\sup_{f \\in \\cF} \\frac{1}{n}f(z_0) \\sum^n_{i=1} \\sigma_i \\r] \\\\\n    &\\leq \\Exp_{\\sigma_1, \\dots, \\sigma_n} \\l[ \\l| \\frac{1}{n} \\sum^n_{i=1} \\sigma_i \\r|\\r] &(\\text{since } f(z_0) \\in [-1,1]) \\\\\n    &\\leq \\Exp_{\\sigma_i} \\l[ \\l( \\frac{1}{n} \\sum^n_{i=1} \\sigma_i \\r)^2\\r]^\\frac{1}{2} &(\\text{Jensen's Inequality}) \\\\\n    &= \\frac{1}{n}\\l( \\Exp_{\\sigma_i, \\sigma_j} \\l[ \\sum^n_{i, j=1} \\sigma_i\\sigma_j \\r] \\r)^\\frac{1}{2}\\\\\n    &= \\frac{1}{n}\\l( \\Exp_{\\sigma_i} \\l[ \\sum^n_{i=1} \\sigma_i^2 \\r] \\r)^\\frac{1}{2} \\\\\n    &= \\frac{1}{n} \\cdot \\sqrt{n} = \\frac{1}{\\sqrt{n}}.\n\\end{align}\nThis bound does not depend on $\\cF$ (except on the fact that $f \\in \\cF$ is bounded). This example illustrates that a bound on the Rademacher complexity can sometimes depend only on the (known) distribution of the Rademacher random variables.\n\n\\sec{Empirical Rademacher complexity}\n\nIn the previous section, we bounded the expectation of $\\sup_{f\\in F} \\l[ \\frac{1}{n} \\sum^n_{i=1} f(z_i) -  \\Exp_{z\\sim P} [f(z)] \\r]$. This expectation is taken over the training examples $z_1, \\dots, z_n$. In many instances we only have one training set, and do not have access to many training sets. 
Thus, the bound on the expectation does not give a guarantee for the one training set that we have. In this section, we seek to bound the quantity itself with high probability.\n\n\\begin{definition}[Empirical Rademacher complexity]\nGiven a dataset $S = \\{z_1, \\dots, z_n\\}$, the \\textit{empirical Rademacher complexity} is defined as\n\\begin{equation}\nR_S(\\cF) \\overset{\\Delta}{=} \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i) \\r].\n\\end{equation}\n$R_S(\\cF)$ is a function of both the function class $\\cF$ and the dataset $S$.\n\\end{definition}\n\nAs the name suggests, the expectation of the empirical Rademacher complexity is the Rademacher complexity:\n\\begin{align}\n    R_n(\\cF) = \\underset{S=\\{z_1,\\dots, z_n\\}}{\\underset{z_1, \\dots, z_n \\iid P}\\Exp}\\l[ R_S(\\cF) \\r].\n\\end{align}\n\nHere is the theorem involving empirical Rademacher complexity:\n\\begin{theorem}\\label{lec5:thm:thm2}\n    Suppose for all $f \\in \\cF$, $0 \\leq f(z) \\leq 1$. Then, with probability at least $1-\\delta$,\n    \\begin{align}\n        \\sup_{f\\in \\cF} \\l[ \\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp[f(z)] \\r] \\leq 2 R_S(\\cF) + 3\\sqrt{\\frac{\\log{(2/\\delta)}}{2n}}.\n    \\end{align}\n\\end{theorem}\n\n\\begin{proof}\nFor conciseness, define\n\\begin{equation} g(z_1, \\dots, z_n) \\triangleq \\sup_{f\\in F} \\l[ \\frac{1}{n} \\sum^{n}_{i=1} f(z_i) - \\Exp[f(z)]\\r]. \\end{equation}\n\nWe prove the theorem in 4 steps.\n\n\\textbf{Step 1:} We bound $g$ using McDiarmid's Inequality. 
To use McDiarmid's Inequality, we check that the bounded difference condition holds:\n\\begin{align}\n    g(z_1, \\dots, z_n) - g(z_1, \\dots, z_i', \\dots, z_n)\n    &\\leq \\sup_{f\\in \\cF} \\l[ \\frac{1}{n} \\sum^{n}_{j=1} f(z_j) \\r] - \\sup_{f\\in \\cF} \\l[ \\l(\\frac{1}{n} \\sum^{n}_{j=1, j \\neq i} f(z_j)\\r) + \\frac{f(z_i')}{n} \\r]  \\\\\n    &\\leq \\sup_{f\\in \\cF} \\l[ \\frac{1}{n} \\sum^{n}_{j=1} f(z_j) - \\l(\\frac{1}{n} \\sum^{n}_{j=1, j \\neq i} f(z_j)\\r) - \\frac{f(z_i')}{n} \\r] \\label{lec5:eqn:thm2-pf1} \\\\\n    &= \\sup_{f\\in \\cF}\\l[ \\frac{1}{n} \\l( f(z_i) - f(z_i') \\r) \\r] \\\\\n    &\\leq \\frac{1}{n}. \\label{lec5:eqn:thm2-pf2}\n\\end{align}\n\\eqref{lec5:eqn:thm2-pf1} holds because in general, $\\sup_f A(f) - \\sup_f B(f) \\leq \\sup_f [A(f) - B(f)]$, and \\eqref{lec5:eqn:thm2-pf2} holds since $f$ is bounded by $[0,1]$. We can thus apply McDiarmid's Inequality with parameters $c_1 = \\dots = c_n = 1/n$:\n\\begin{align}\n    \\Pr\\l[ g(z_1, \\dots, z_n) \\geq \\Exp_{z_1,\\dots, z_n \\iid P}[g] + \\epsilon \\r] \\leq \\exp{\\l( \\frac{-2\\epsilon^2}{\\sum^n_{i=1} c_i^2 }\\r)} = \\exp(-2n\\epsilon^2).\n\\end{align}\n\n\\textbf{Step 2:} We apply Theorem \\ref{lec5:thm:thm1} to get \n\\begin{align}\n \\Exp_{z_1,\\dots, z_n \\iid P}[g] \\leq 2R_n(\\cF).\n\\end{align}\n\n\\textbf{Step 3:} Define\n\\begin{equation} \\tilde g (z_1, \\dots, z_n) = R_S(\\cF) \\triangleq \\Exp_{\\sigma_i}\\l[\\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i)\\r]. 
\\end{equation}\n\nUsing a similar argument to that of Step 1, we show that $\\tilde g$ satisfies the bounded difference condition:\n\\begin{align}\n    &\\tilde g(z_1, \\dots, z_n) - \\tilde g(z_1, \\dots, z_i', \\dots, z_n) \\nonumber \\\\\n    &\\leq \\Exp_{\\sigma_i} \\l[\\sup_{f\\in F} \\l[ \\frac{1}{n} \\sum^{n}_{j=1} \\sigma_j f(z_j) \\r] - \\sup_{f\\in F} \\l[ \\l(\\frac{1}{n} \\sum^{n}_{j=1, j \\neq i} \\sigma_j f(z_j)\\r) + \\frac{1}{n} \\sigma_if(z_i')\\r]\\r]\\\\\n    &\\leq \\Exp_{\\sigma_i}\\l[\\sup_{f\\in F} \\l(\\frac{1}{n} \\sigma_i(f(z_i) - f(z_i'))\\r)\\r] \\\\\n    &\\leq \\frac{1}{n},\n\\end{align}\nsince the term inside the $\\sup$ is always upper bounded by 1. We can thus apply McDiarmid's Inequality with parameters $c_1 = \\dots = c_n = 1/n$:\n\\begin{align}\n    \\Pr\\l[ \\tilde g - \\Exp[\\tilde g] \\geq \\epsilon \\r] \\leq \\exp(-2n \\epsilon^2), \\quad\\text{and}\\quad\n    \\Pr\\l[ \\tilde g - \\Exp[\\tilde g] \\leq -\\epsilon \\r] \\leq \\exp(-2n \\epsilon^2).\n\\end{align}\n\n\\textbf{Step 4:} We set $\\delta$ such that $\\exp(-2n \\epsilon^2) = \\delta/2$. (This implies that $\\epsilon = \\sqrt{\\frac{\\log(2/\\delta)}{2n}}$.) Then, with probability $\\geq 1 - \\delta$,\n\\begin{align}\n    \\sup_{f\\in \\cF} \\l[ \\frac{1}{n} \\sum^n_{i=1} f(z_i) - \\Exp[f]\\r] = g &\\leq \\Exp[g] + \\epsilon &\\text{(Step 1)} \\\\\n    &\\leq 2R_n(\\cF) + \\epsilon &\\text{(Step 2)} \\\\\n    &\\leq 2(R_S(\\cF) + \\epsilon) + \\epsilon &\\text{(Step 3)} \\\\\n    &= 2R_S(\\cF) + 3\\epsilon,\n\\end{align}\nas required.\n\\end{proof}\n\nSetting $\\cF$ to be a family of loss functions bounded by $[0,1]$ in Theorem \\ref{lec5:thm:thm2} gives the following corollary:\n\\begin{corollary}\\label{lec6:cor:ggap-rsbound}\nLet $\\cF$ be a family of loss functions $\\cF = \\l\\{ (x,y) \\mapsto \\ell((x,y),h): h \\in \\cH \\r\\}$ with $\\ell((x,y), h) \\in [0,1]$ for all $\\ell$, $(x,y)$ and $h$. 
Then, with probability $1-\\delta,$ the generalization gap is\n    \\begin{equation}\\label{lec6:eqn:ggap-rsbound}\n        \\hat{L}(h) - L(h) \\leq 2R_S(\\cF) + 3\\sqrt{\\frac{\\log(2/\\delta)}{2n}} \\quad \\text{for all } h\\in \\cH.\n    \\end{equation}\n\\end{corollary}\n\n\\begin{remark}\nIf we want to bound the generalization gap by the average Rademacher complexity instead, we can replace the RHS of \\eqref{lec6:eqn:ggap-rsbound} with $2R_n(\\cF) + \\sqrt{\\frac{\\log(2/\\delta)}{2n}}$.\n\\end{remark}\n\n\\paragraph{Interpretation of  Corollary \\ref{lec6:cor:ggap-rsbound}.}\n\\sloppy It is typically the case that $O\\l(\\sqrt{\\frac{\\log (2/\\delta)}{n}}\\r) \\ll R_S(\\cF)$ and $O\\l(\\sqrt{\\frac{\\log (2/\\delta)}{n}}\\r) \\ll R_n(\\cF)$. This is the case because $R_S(\\cF)$ and $R_n(\\cF)$ often take the form $\\frac{c}{\\sqrt{n}}$ where $c$ is a big constant depending on the complexity of $\\cF$, whereas we only have a logarithmic term in the numerator of $O\\l(\\sqrt{\\frac{\\log (2/\\delta)}{n}}\\r)$. As a result, we can view the $3\\sqrt{\\frac{\\log (2/\\delta)}{n}}$ term in the RHS of Corollary \\ref{lec6:cor:ggap-rsbound} as negligible. Another way of seeing this is noting that a $\\tilO \\left( \\frac{1}{\\sqrt{n}} \\right)$ term is necessary even for the concentration bound of a single function $h\\in\\cH$. Previously, we bounded $L(h)-\\hat{L}(h)$ using a union bound over $h\\in\\cH$, which necessarily needs to be larger than $\\tilO \\left(\\frac{1}{\\sqrt{n}} \\right)$. As a result, the $O\\l(\\sqrt{\\frac{\\log (2/\\delta)}{n}}\\r)$ term is not significant.\n\n%\\subsec{Empirical Rademacher complexity viewed in the output/function space}\n%Assume we have a fixed dataset $S = \\{z_1, \\dots, z_n\\}$. Since $z_1,\\dots, z_n$ is fixed, each function $f\\in\\cF$ corresponds to a single output $(f(z_1),\\dots,f(z_n))\\in \\R^n$. 
Hence, we can express the set of outputs for every function $f\\in\\cF$ as\n%\\begin{align}\n%    Q_\\cF = \\left\\{ \\begin{pmatrix}f(z_1), \\dots, f(z_n) \\end{pmatrix} \\mid f\\in\\cF \\right\\}.\n%\\end{align}\n%\n%Now we can mathematically re-express the empirical Rademacher complexity as an inner product:\n%\\begin{align}\n%R_S(\\cF) &= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i) \\r] \\\\\n%&= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{v\\in Q} \\frac{1}{n}\\langle\\sigma, v\\rangle \\r],\n%\\end{align}\n%where $\\sigma=(\\sigma_1,\\dots,\\sigma_n)$. (See Figure \\ref{lec6:fig:rs-innerprod} for an illustration of this idea.) This perspective will be helpful later on when proving bounds on the empirical Rademacher complexity.\n\n\n\n\n\\subsec{Rademacher complexity is translation invariant}\nA useful fact is that both empirical Rademacher complexity and average Rademacher complexity are translation invariant. (This is not obvious when thinking of how translation affects the picture in Figure \\ref{lec6:fig:rs-innerprod}.)\n\n\\begin{proposition}\nLet $\\cF$ be a family of functions mapping $Z \\mapsto \\R$ and define $\\cF' = \\{f'(z) = f(z) + c_0\\mid f\\in \\cF\\}$ for some $c_0\\in\\R$. 
Then $R_S(\\cF) = R_S(\\cF')$ and $R_n(\\cF) = R_n(\\cF')$.\n\\end{proposition}\n\n\\begin{proof}\nWe will prove here that empirical Rademacher complexity is translation invariant.\n\\begin{align}\nR_S(\\cF') &= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f'\\in \\cF'} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f'(z_i) \\r] \\\\\n&= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i (f(z_i)+c_0) \\r] \\\\\n&= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\frac{1}{n} \\sum^n_{i=1} \\sigma_i c_0 + \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i) \\r] \\\\\n&= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[\\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i f(z_i) \\r] = R_S(\\cF), \\label{lec6:eqn:rs-translation}\n\\end{align}\nwhere \\eqref{lec6:eqn:rs-translation} follows because $\\Exp_{\\sigma_1,\\dots,\\sigma_n} \\frac{1}{n}\\sum_{i=1}^n \\sigma_i c_0 = 0$, since the $\\sigma_i$'s are Rademacher random variables.\n\\end{proof}\n\n"
  },
  {
    "path": "tex/collection/04-03-uniform.tex",
    "content": " % reset section counter\n%\\setcounter{section}{0}\n\\metadata{8}{David Lin and Jinhui Wang}{Feb.~8th, 2021}\n\n\\sec{Covering number upper bounds Rademacher complexity}\nIn Chapter \\ref{chap:gen-bounds}, we will prove Rademacher complexity bounds that hinge on elegant, ad-hoc algebraic manipulations that may not extend to more general settings. Here, we consider a more fundamental approach for proving empirical Rademacher complexity bounds based on coverings of the output space. The trade-off is generally more tedium.\n\nThe first important observation is that for purposes of computing the \\textbf{empirical} Rademacher complexity on samples $z_1, \\dots, z_n$, \n\\al{\n    R_S(\\cF) = \\Exp_\\sigma \\sbr{\\sup_{f \\in \\cF} \\frac 1 n \\sum_{i=1}^n \\sigma_i f(z_i)},\n}\nwe only care about the output of function $f \\in \\cF$, and not the function itself (i.e. it is sufficient for our purposes to know $f(z_1),\\dots, f(z_n)$, but not know $f$). In other words, we can characterize $f \\in \\cF$ by $f(z_1),\\dots, f(z_n)$. In the sequel, we will take advantage of this simplification from the (potentially large) space of all functions $\\cF$ to the \\textit{output space},\n\\begin{equation}\n\\cQ \\triangleq \\cbr{ \\begin{pmatrix} f(z_1), \\dots, f(z_n) \\end{pmatrix}^\\top: f\\in \\cF} \\subseteq \\R^n, \\label{lec6:eqn:shattercoef}\n\\end{equation}\nwhich may be drastically smaller than $\\cF$. Correspondingly, the empirical Rademacher complexity can be rewritten as a maximization over the output space $\\cQ$ instead of the function space $\\cF$: \n\\al{\n    R_S(\\cF) &= \\Exp_\\sigma \\sbr{\\sup_{v\\in \\cQ} \\frac 1 n \\inprod{\\sigma, v}}.\n}\nIn other words, the complexity of $\\cF$ can be also interpreted as how much the vectors in $Q$ can be correlated with a random vector $\\sigma.$ See Figure \\ref{lec6:fig:rs-innerprod} for an illustration of this idea. 
One can also view $\\Exp_\\sigma \\sbr{\\sup_{v\\in \\cQ} \\frac 1 n \\inprod{\\sigma, v}}$ as a complexity measure for the set $Q$. If we replace $\\sigma$ by a Gaussian vector with spherical covariance, then the corresponding quantity (without the $\\frac 1 n$ scaling), $\\Exp_{g\\sim N(0,I)} \\sbr{\\sup_{v\\in \\cQ} \\inprod{g, v}}$, is often referred to as the Gaussian complexity of the set $Q$. (It turns out that Gaussian complexity and Rademacher complexity are closely related.)\n\nAnother corollary of this is that the empirical Rademacher complexity only depends on the functionality of $\\cF$ but not on the exact parameterization of $\\cF$. For example, suppose we have two parameterizations $\\cF = \\left\\{f(x)=\\sum \\theta_{i} x_{i} \\mid \\theta \\in \\mathbb{R}^{d}\\right\\}$ and $\\cF' = \\left\\{f(x)=\\sum \\theta_{i}^{3} \\cdot w_{i} x_{i} \\mid \\theta \\in \\R^{d}, w \\in \\mathbb{R}^{d}\\right\\}$. Since $Q_\\cF$ and $Q_{\\cF'}$ are the same, we see that $R_S(\\cF) = R_S(\\cF')$ since our earlier expression for $R_S(\\cF)$ only depends on $\\cF$ through $Q_\\cF$. \n\n\\begin{figure}[ht!]\n\t\\begin{center}\n\t\t\\includegraphics[width=.5\\textwidth]{figures/remark2.png}\n\t\\end{center}\n\t\\caption{We can view empirical Rademacher complexity as the expectation of the maximum inner product between $\\sigma$ and $v\\in Q$.}\n\t\\label{lec6:fig:rs-innerprod}\n\\end{figure}\n\n\\paragraph{Rademacher complexity of finite hypothesis classes.} In practice, we cannot directly evaluate the Rademacher complexity, so we instead bound its value using quantities that are computable. Given finite $|\\cQ|$, we often rely on the following bound, which is also known as Massart's finite lemma: \n\\begin{proposition}\n    Let $\\cF$ be a collection of functions mapping $Z \\mapsto \\mathbb{R}$ and let $\\cQ$ be defined as in \\eqref{lec6:eqn:shattercoef}. Assume that $\\frac{1}{\\sqrt{n}} \\norm{v}_2 \\le M < \\infty$ for all $v \\in \\cQ$. 
Then,\n    \\begin{align}\n        R_S(\\cF) \\leq \\sqrt{\\frac{2 M^2 \\log |\\cQ|}{n}}\n    \\end{align}\n    \\label{lec6:prop:massartlemma}\n\\end{proposition}\nWe prove a (slightly) simplified version of this result in Problem 3(c) of Homework 2, so we omit the proof of Massart's lemma here. Using Massart's lemma, we can also bound the Rademacher complexity in terms of $\\cF$. Restating the assumption accordingly, \n\n\\begin{corollary}\n    Let $\\cF$ be a collection of functions mapping $Z \\mapsto \\mathbb{R}$. If $\\sqrt{\\frac{1}{n}\\sum_{i=1}^n f(z_i)^2} \\le M$ for all $f \\in \\cF$, then \n    \\begin{align}\n        R_S(\\cF) \\le \\sqrt{\\frac{2M^2\\log \\abs{\\cF}}{n}}.\n    \\end{align}\n    \\label{lec6:cor:massartlemmafunc}\n\\end{corollary}\nNote that Corollary~\\ref{lec6:cor:massartlemmafunc} yields a looser bound than Massart's lemma since $|\\cQ| \\leq |\\cF|$. \n    \nIn practice, we rarely apply Massart's lemma directly since $|\\cQ|$ is typically infinite. In the sequel, we discuss alternative approaches to bounding the Rademacher complexity that are appropriate for this setting.\n\n\\paragraph{Bounding Rademacher complexity using $\\epsilon$-covers.}\nWhen $|\\cQ|$ is infinite, we can apply the same discretization trick that we used to prove the generalization bound for an infinite-hypothesis space. This time, instead of trying to cover the parameter space, we will cover the output space. 
To this end, we first recall a few definitions concerning $\\epsilon$-covers.\n\n\\begin{definition}\n$\\cC$ is an \\emph{$\\epsilon$-cover} of $\\cQ$ with respect to metric $\\rho$ if for all $v' \\in \\cQ$, there exists $v \\in \\cC $ such that $\\rho(v,v')\\le \\epsilon$.\n\\end{definition}\n\n\\begin{definition}\nThe \\emph{covering number} is defined as the minimum size of an $\\epsilon$-cover, or explicitly:\n\\begin{align}\n    N(\\epsilon, \\cQ, \\rho) \\overset \\triangle = (\\text{min size of $\\epsilon$-cover of $\\cQ$ w.r.t.\\ metric $\\rho$}).\n\\end{align}\n\\end{definition}\n\n\\begin{figure}[h]\n\t\\begin{center}\n\t\t\\includegraphics[width=.5\\textwidth]{figures/chaining_1.png}\n\t\\end{center}\n\t\\caption{We can visualize the $\\epsilon$-cover $\\cC$ by depicting a set of $\\epsilon$-balls that cover the output space $\\cQ$. The yellow circles denote the $\\epsilon$-neighborhoods of the covering points $u_i \\in \\cC$.}\n\t\\label{lec9:fig:eps-cover}\n\\end{figure}\n\nIn subsequent derivations, we will use the metric $\\rho(v,v') = \\frac 1 {\\sqrt{n}} \\norm{v-v'}_2$. \n\n\\begin{remark}\nWe normalize the $\\ell_2$ norm in $\\rho$ by $\\frac{1}{\\sqrt{n}}$ to simplify comparisons to the functional analysis view of the Rademacher complexity. In the literature, the $\\epsilon$-cover of $\\cQ$ defined above is also referred to as an $\\epsilon$-cover of the function class $\\cF$ under the $L_2(P_n)$ metric.\\footnote{$P_n$ denotes the empirical distribution, i.e. the uniform distribution over the observations $z_1,\\dots,z_n$. 
More generally the $L_p(Q)$ metric is defined by $\\Exp_Q \\left [\\left (f(z) - f'(z) \\right )^p \\right]^{1/p}$.}\nIn particular, \n\\begin{align}\nL_2(P_n)(f,f') = \\sqrt{ \\frac 1 n \\sum_{i=1}^n (f(z_i) - f'(z_i))^2 }.\n\\end{align}\nRecall we have established the following correspondences between the set of functions $\\cF$ and the output space $\\cQ$:\n\\begin{align}\n    f \\in \\cF \\iff \\begin{pmatrix} f(z_1) \\\\ \\vdots \\\\ f(z_n) \\end{pmatrix} \\in \\cQ\n\\end{align}\n\nWe can write a trivial correspondence between both the output and function class points of view as follows:\n\\begin{align}\nN(\\epsilon, \\cF, L_2(P_n)) = N\\left (\\epsilon, \\cQ, \\frac{1}{\\sqrt{n}} || \\cdot ||_2 \\right )\n\\end{align}\nThe results below will be stated in the function-space notation, but in the proofs we will shift to the $\\cQ$-formulation for the sake of clarity.\nIn general, we prefer to reason about covering numbers on $\\cQ$ as it is more natural to analyze vector spaces compared to function spaces.\n\\label{lec8:rmk:l2pncover}\n\\end{remark}\n\nEquipped with the definition of minimal $\\epsilon$-covers, we can prove the following Rademacher complexity bound:\n\n\\begin{theorem}\\label{lec8:thm:rc-covering-bd}\nLet $\\cF$ be a family of functions $Z \\mapsto [-1,1]$. Then\n\\begin{equation}\nR_S(\\cF) \\le \\inf_{\\epsilon > 0} \\rbr{ \\epsilon + \\sqrt{ \\frac {2\\log N(\\epsilon, \\cF, L_2(P_n))} n } }. \\label{lec8:eqn:rc-covering-bd}\n\\end{equation}\n\\end{theorem}\n\nThe $\\epsilon$ term can be thought of as the discretization error, while the second term is the Rademacher complexity of the finite $\\epsilon$-cover. The precise form of this complexity bound follows from Proposition~\\ref{lec6:prop:massartlemma}.\n\n\\begin{proof}\nFix any $\\epsilon > 0$. Let $\\cC$ be the minimal $\\epsilon$-cover of $\\cQ$ with respect to the metric $\\rho(v,v') = \\frac 1 {\\sqrt{n}} \\norm{v-v'}_2$. 
Note that $\\abs{\\cC} = N(\\epsilon, \\cQ, \\frac{1}{\\sqrt{n}} \\norm{\\cdot}_2) = N(\\epsilon, \\cF, L_2(P_n))$.\n\nWe aim to bound $R_S(\\cF) = \\Exp_\\sigma[\\sup_{v \\in \\cQ} \\frac{1}{n} \\inprod{v, \\sigma}]$ by approximating $v$ with $v' \\in \\cC$. In particular, for every point $v \\in \\cQ$, choose $v' \\in \\cC$ such that $\\rho(v, v') \\leq \\epsilon$ and $z$ is small (specifically, $\\frac 1 {\\sqrt{n}} \\norm{z}_2 \\le \\epsilon$). This gives\n\\al{\n    \\frac 1 n \\inprod{v, \\sigma} &= \\frac 1 n \\inprod{v',\\sigma} + \\frac 1 n \\inprod{v - v', \\sigma}\\\\\n    &\\le \\frac 1 n \\inprod{v', \\sigma} + \\frac 1 n \\norm{z}_2 \\norm{\\sigma}_2 \n        &&\\text{($z \\defeq v - v'$, Cauchy-Schwarz)} \\label{lec8:eqn:cs-step}\\\\\n    &\\le \\frac 1 n \\inprod{v', \\sigma} + \\epsilon.\n        &&\\text{(since $\\norm{z}_2\\le \\sqrt{n}\\epsilon$ and $\\norm{\\sigma}_2 \\le \\sqrt{n}$)}\n}\nTaking the expectation of the supremum on both sides of this inequality gives\n\\al{\n    R_S(\\cF) &= \\Exp_\\sigma \\sbr{\\sup_{v\\in \\cQ} \\frac 1 n \\inprod{v,\\sigma} }\\\\\n    &\\le \\Exp_\\sigma \\sbr{\\sup_{v'\\in \\cC} \\rbr{\\frac 1 n \\inprod{v',\\sigma} + \\epsilon}}\\\\ \n    &= \\epsilon + \\Exp_\\sigma \\sbr{\\sup_{v'\\in \\cC} \\rbr{\\frac 1 n \\inprod{v',\\sigma}}} \\\\\n    &\\le \\epsilon + \\sqrt{ \\frac {2\\log \\abs{\\cC}} n } &\\text{(Proposition~\\ref{lec6:prop:massartlemma})} \\\\\n    &= \\epsilon + \\sqrt{ \\frac {2\\log N(\\epsilon, \\cQ , \\rho)} n } \\\\\n    &= \\epsilon + \\sqrt{ \\frac {2\\log N(\\epsilon, \\cF , L_2(P_n))} n } &\\text{(Remark~\\ref{lec8:rmk:l2pncover})}\n}\nSince the argument above holds for any $\\epsilon > 0$, we can take the infimum over all $\\epsilon$ to arrive at Equation \\eqref{lec8:eqn:rc-covering-bd}.\n\n\\end{proof}\n\n\\subsec{Chaining and Dudley's theorem}\n\nWhile Theorem \\ref{lec8:thm:rc-covering-bd} is useful, the bound in \\eqref{lec8:eqn:cs-step} is rarely tight as $z$ might not be 
perfectly correlated with $\\sigma$. It is possible to obtain a stronger theorem by constructing a chained $\\epsilon$-covering scheme. Specifically, when we decompose $v=v'+z$, we can construct a finer-grained covering of the ball $B(v',\\epsilon)$, and then we can decompose $z$ into smaller components and so on (see Figure \\ref{lec9:fig:chaining_diag} for an illustration).\n\nUsing this method of chaining, we can obtain the following (stronger) result:\n\n\\begin{theorem}[Dudley's Theorem]\nIf $\\cF$ is a function class from $Z \\mapsto \\R$, then\n\\begin{equation}\n    R_S(\\mathcal{F})\\leq 12\\int_{0}^{\\infty}\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon. \\label{lec9:eqn:dudley}\n\\end{equation}\n\\end{theorem}\n\nNote that unlike in Theorem~\\ref{lec8:thm:rc-covering-bd}, we do not require $f \\in \\cF$ to be bounded.\n\nIt is not obvious how \\eqref{lec9:eqn:dudley} improves upon the one-step discretization bound given by \\eqref{lec8:eqn:rc-covering-bd}. At a high level, we can interpret this bound as removing the discretization error term by averaging over different scales of $\\epsilon$.  But before we can explicitly prove this claim, we motivate our approach. In the proof of Theorem~\\ref{lec8:thm:rc-covering-bd}, we approximated $v$ with $v' + z$ where $v'$ is the closest point to $v$ in the minimal $\\epsilon$-cover of $\\cQ$, and $z$ is the vector between $v'$ and $v$. In particular,\n\\begin{equation}\n    \\frac 1 n \\inprod{v, \\sigma} = \\frac 1 n \\inprod{v', \\sigma} + \\frac 1 n \\inprod{z, \\sigma} \\label{lec9:eqn:disc_decomp}\n\\end{equation}\nThen, to obtain a bound, we take a $\\sup$ of both sides, but apply the $\\sup$ separately to each term on the right hand side. 
Namely, we show that:\n\\begin{align}\n    \\Exp \\left[\\sup_v \\frac{1}{n} \\inprod{v, \\sigma} \\right] \\leq \\Exp \\left[\\sup_{v' \\in \\cC} \\frac 1 n \\inprod{v', \\sigma} \\right ] + \\Exp \\left[ \\sup_{z \\in B_{v'}} \\frac{1}{n} \\inprod{z, \\sigma} \\right]\n\\end{align}\nThis bound follows by observing that $\\Exp[\\sup (A + B)] \\leq \\Exp[\\sup A] + \\Exp[\\sup B]$ since the $\\sup$ on the RHS is taken separately over both terms. The difficult term to tightly bound is the last one, $\\frac 1 n \\inprod{z, \\sigma}$. In the previous derivation, we naively upper bounded $\\inprod{z, \\sigma}$ using Cauchy-Schwarz,\n\\begin{equation}\n    \\frac 1 n \\inprod{z,\\sigma} \\le \\frac{\\norm{z}_2\\cdot \\norm{\\sigma}_2} n,\n\\end{equation}\nbut this bound is only tight if there exists $z \\in B_{v'}$ that is perfectly correlated with $\\sigma$. We claim that such perfect correlation is unlikely. Recall that the output space is defined by possible outputs of $f \\in \\cF$ given $n$ inputs. Unless our function class is extremely expressive, the set of radius $\\epsilon$ around $v'$ contained in $\\cQ$ will only be a small subset of the $\\epsilon$-ball centered at $v'$; thus, $\\sup_{z} \\frac{1}{n} \\inprod{z, \\sigma} \\ll \\frac{\\norm{z}_2 \\cdot \\norm{\\sigma}_2}{n}$.\n\nTo precisely set up our approach, we observe that $\\Exp[\\sup_{z \\in B_{v'}} \\frac 1 n \\inprod{z, \\sigma}]$ is itself a Rademacher complexity: $R_S(B_{v'} \\cap \\cQ)$. To more tightly bound $\\Exp \\l[ \\sup_{z \\in B_{v'}} \\frac 1 n \\inprod{z,\\sigma}\\r]$, we then repeat the $\\epsilon$-covering argument again with a smaller choice of $\\epsilon$. Intuitively, this procedure amounts to decomposing $\\inprod{z, \\sigma}$ from \\eqref{lec9:eqn:disc_decomp} into another pair of terms corresponding to the new $\\epsilon$-cover and the discretization error. ``Chaining'' then repeats this decomposition countably many times. 
This procedure is illustrated visually by Figure~\\ref{lec9:fig:chaining_diag}, and we formalize this argument in the sequel.\n\n\\begin{figure}[ht!]\n    \\centering\n    \\begin{subfigure}[t]{0.45\\textwidth}\n        \\includegraphics[width=\\textwidth]{figures/chaining_1.png}\n        \\caption{}\n        \\label{lec9:fig:chaining_1}\n    \\end{subfigure}\n    \\hfill\n    \\begin{subfigure}[t]{0.45\\textwidth}\n        \\includegraphics[width=\\textwidth]{figures/chaining_2.png}\n        \\caption{}\n        \\label{lec9:fig:chaining_2}\n    \\end{subfigure}\n    \\hfill\n    \\begin{subfigure}[t]{0.45\\textwidth}\n        \\includegraphics[width=\\textwidth]{figures/chaining_3.png}\n        \\caption{}\n        \\label{lec9:fig:chaining_3}\n    \\end{subfigure}\n    \\caption{We depict how the chaining procedure approximates $v$ using a sequence of progressively finer discretizations. Figure~\\ref{lec9:fig:chaining_1} illustrates how we first approximate $v$ using the nearest covering point $u_1$, while Figures~\\ref{lec9:fig:chaining_2} and \\ref{lec9:fig:chaining_3} describe how we refine this approximation using two finer covers, whose nearest points are denoted by $u_2$ and $u_3$, respectively.}\n    \\label{lec9:fig:chaining_diag}\n\\end{figure}\n\n\\begin{proof} \n    Let $\\epsilon_0 = \\sup_{f\\in \\cF} \\max_i \\abs{f(z_i)}$, so that for all $v \\in \\cQ$,\n    \\begin{equation}\n        \\epsilon_0 \\ge \\sqrt{\\frac 1 n \\sum_{i=1}^nf(z_i)^2}  = \\sqrt{\\frac 1 n \\norm{v}_2^2}.\n    \\end{equation}\n    \n    Define $\\epsilon_j = 2^{-j}\\epsilon_0$ and let $\\cC_j$ be an $\\epsilon_j$-cover of $\\cQ$. Then, $\\cC_0$ is the coarsest cover of $\\cQ$, and as $j$ increases, we obtain progressively more fine-grained covers $\\cC_j$. We can intuitively think of these covers as nested, but this is not necessary for the proof to hold. 
We next use this sequence of covers to define a telescoping series that equals $v$; the terms in this series can then be analyzed using the tools that we have developed in the prequel. \n    \n    For $v \\in \\cQ$, let $u_i$ denote the nearest neighbor of $v$ in $\\cC_i$. Note that by definition $\\rho(u_j, v) \\leq \\epsilon_j$. Taking $u_0 = 0$, it follows from our definition of $\\cC_i$ that as $j \\to \\infty$, $\\epsilon_j \\to 0$ and $u_j \\to v$. Leveraging these observations, we can express $v$ using the following series:\n    \\begin{align}\n        v &= u_1 + (u_2 - u_1) + (u_3 - u_2) + \\cdots \\\\\n        &= (u_1 - u_0) + (u_2 - u_1) + (u_3 - u_2) + \\cdots \\\\\n        &= \\sum_{i = 1}^\\infty (u_i - u_{i - 1}). \\label{lec9:eqn:telescope_chain}\n    \\end{align}\n    \n    Substituting \\eqref{lec9:eqn:telescope_chain} in the Rademacher complexity we aim to bound, we obtain\n    \\al{\n        \\Exp\\l[\\sup_{v \\in \\cQ} \\frac 1 n \\inprod{v, \\sigma}\\r]&= \\Exp\\l[\\sup_{v \\in \\cQ} \\frac 1 n \\sum_{i=1}^\\infty \\inprod{u_i - u_{i - 1}, \\sigma}\\r]\\\\\n        &\\le \\Exp\\l[\\sum_{i=1}^\\infty \\sup_{u_i \\in \\cC_i, u_{i - 1} \\in \\cC_{i - 1}} \\frac 1 n\\inprod{u_i - u_{i - 1}, \\sigma}\\r]\\\\\n        &= \\sum_{i=1}^\\infty \\Exp\\l[ \\sup_{u_i \\in \\cC_i, u_{i - 1} \\in \\cC_{i - 1}} \\frac 1 n\\inprod{u_i - u_{i - 1}, \\sigma}\\r]. \\label{lec9:eqn:chaining_expansion}\n    }\n    Observe that\n    \\begin{equation}\n        \\Exp\\l[ \\sup_{u_i \\in \\cC_i, u_{i - 1} \\in \\cC_{i - 1}} \\frac 1 n\\inprod{u_i-u_{i - 1}, \\sigma}\\r]\n    \\end{equation}\n    is a Rademacher complexity defined over the \\emph{finite} space $\\cC_i \\times \\cC_{i - 1}$, so we can use Proposition~\\ref{lec6:prop:massartlemma} (Massart's lemma) to obtain a tractable upper bound. 
To do so, we must first compute an upper bound on $\\frac{1}{\\sqrt{n}} \\norm{u_i - u_{i - 1}}_2$:\n    \\al{\n        \\frac 1 {\\sqrt n} \\norm{u_i - u_{i - 1}}_2 &= \\frac 1 {\\sqrt{n}} \\norm{(u_i - v) - (u_{i - 1} - v)}_2\\\\\n        &\\le \\frac 1 {\\sqrt{n}} \\l(\\norm{u_i - v}_2 + \\norm{u_{i - 1} - v}_2 \\r)\\\\\n        &\\le \\epsilon_i + \\epsilon_{i - 1} \\\\\n        &= 3 \\epsilon_i & \\text{($\\epsilon_{i - 1} \\defeq 2 \\epsilon_i$)}\n    }\n    Now we apply Proposition~\\ref{lec6:prop:massartlemma} with $M = 3 \\epsilon_i$ and $\\abs{\\cQ} = \\abs{\\cC_i \\times \\cC_{i - 1}} \\leq \\abs{\\cC_i} \\cdot \\abs{\\cC_{i - 1}}$.\n    \\al{\n        \\Exp\\l[\\sup_{u_i \\in \\cC_i, u_{i - 1} \\in \\cC_{i - 1}} \\frac 1 n \\inprod{u_i - u_{i - 1}, \\sigma} \\r] & \\le \\sqrt{\\frac{2(3 \\epsilon_i)^2\\log (\\abs{\\cC_i}\\cdot \\abs{\\cC_{i-1}})}{n}}\\\\\n        &= \\frac{3 \\epsilon_i}{\\sqrt{n}}\\sqrt{2(\\log \\abs{\\cC_i} + \\log \\abs{\\cC_{i-1}})}\\\\\n        &\\le \\frac{6 \\epsilon_i}{\\sqrt{n}}\\sqrt{\\log \\abs{\\cC_i}} & (\\abs{\\cC_i} \\ge \\abs{\\cC_{i - 1}}) \\label{lec9:eqn:massartbound}\n    }\n    \n    Applying \\eqref{lec9:eqn:massartbound} to each term in \\eqref{lec9:eqn:chaining_expansion} and substituting the covering number $N(\\epsilon_i, \\cF, L_2(P_n))$ for $|\\cC_i|$, we obtain the following upper bound on the Rademacher complexity:\n    \\al{\n        \\Exp\\l[\\sup_{v \\in \\cQ} \\frac 1 n \\inprod{v, \\sigma} \\r] & \\le \\sum_{i = 1}^\\infty \\frac{6 \\epsilon_i}{\\sqrt{n}}\\sqrt{\\log N(\\epsilon_i, \\cF, L_2(P_n))}. \\label{lec9:eqn:dudley_sumbound}\n    }\n\n    Finally, we must relate \\eqref{lec9:eqn:dudley_sumbound} to the target upper bound of $12 \\int \\frac{1}{\\sqrt{n}} \\sqrt{\\log N(\\epsilon, \\cF, L_2(P_n))} d\\epsilon$. Examining Figure~\\ref{lec9:fig:chaining_riemann}, we can make two crucial observations. 
First, for sufficiently large $\\epsilon$, $\\log N(\\epsilon, \\cF, L_2(P_n)) = 0$ since one point is sufficient to construct a cover. Second, we observe that \n    \\begin{align}\n        (\\epsilon_i - \\epsilon_{i + 1}) \\sqrt{\\log \\abs{\\cC_i}} \\leq \\int_{\\epsilon_{i + 1}}^{\\epsilon_i} \\sqrt{\\log N(\\epsilon, \\cF, L_2(P_n))} d\\epsilon \\label{lec9:eqn:riemann_term}\n    \\end{align}\n    since the LHS of \\eqref{lec9:eqn:riemann_term} is the area of the dotted rectangle illustrated in Figure~\\ref{lec9:fig:chaining_riemann} while the RHS is the area under the curve for that interval. Formally, this result is equivalent to observing that the right Riemann sum underestimates the integral for monotone decreasing functions $f$.\n\n    \\begin{figure}[ht!]\n        \\begin{center}\n            \\includegraphics[width=.7\\textwidth]{figures/chaining_riemann.png}\n        \\end{center}\n        \\caption{We observe that $\\log N(\\epsilon, \\cF, L_2(P_n))$ is monotone decreasing in $\\epsilon$. The area of the dotted rectangle formed by the vertical lines at $\\epsilon_{i + 1}$ and $\\epsilon_i$ equals (up to a constant factor) the $i-$th term of the infinite sum derived in our proof of Dudley's theorem \\eqref{lec9:eqn:dudley_sumbound}. The figure shows that the area of this rectangle is no larger than the integral of $\\log N(\\epsilon, \\cF, L_2(P_n))$ over this same interval.}\n        \\label{lec9:fig:chaining_riemann}\n    \\end{figure}  \n\n    \n    Recognizing that $\\epsilon_i - \\epsilon_{i + 1} = \\frac{\\epsilon_i}{2}$, we note that the LHS of \\eqref{lec9:eqn:riemann_term} is equal (up to a constant factor) to the $i$-th term of \\eqref{lec9:eqn:dudley_sumbound}. 
Thus,\n    \\begin{align}\n        \\sum_{i = 1}^\\infty \\frac{6 \\epsilon_i}{\\sqrt{n}}\\sqrt{\\log N(\\epsilon_i, \\cF, L_2(P_n))} &= \\frac{12}{\\sqrt{n}} \\sum_{i = 1}^\\infty (\\epsilon_i - \\epsilon_{i + 1}) \\sqrt{\\log N(\\epsilon_i, \\cF, L_2(P_n))} \\label{lec9:eqn:dudley_rriemann} \\\\\n        &\\leq \\frac{12}{\\sqrt{n}} \\sum_{i = 1}^\\infty \\int_{\\epsilon_{i + 1}}^{\\epsilon_i} \\sqrt{\\log N(\\epsilon, \\cF, L_2(P_n))} d\\epsilon \\\\\n        &= \\frac{12}{\\sqrt{n}} \\int_{0}^{\\epsilon_0} \\sqrt{\\log N(\\epsilon, \\cF, L_2(P_n))} d\\epsilon. \\label{lec9:eqn:dudley_almost}\n    \\end{align} \n\n    To complete the proof, observe that $\\log N(\\epsilon, \\cF, L_2(P_n)) = 0$ for all $\\epsilon > \\epsilon_0$. This allows us to extend the upper limit of the integral given by \\eqref{lec9:eqn:dudley_almost} to $\\infty$ and yields the desired result:\n    \\begin{align}\n        \\Exp\\l[\\sup_{v \\in \\cQ} \\frac 1 n \\inprod{v, \\sigma} \\r] & \\le \\frac{12}{\\sqrt{n}} \\int_{0}^\\infty \\sqrt{\\log N(\\epsilon, \\cF, L_2(P_n))} d\\epsilon.\n    \\end{align}\n\\end{proof}\n\n\\begin{remark}\nIf $\\mathcal{F}$ consists of functions bounded in $[-1,1]$, then we have that for all $\\epsilon > 1, N(\\epsilon, \\mathcal{F}, L_2(P_n))=1$. To see this, choose $\\{f\\equiv 0\\}$, which is a complete cover for $\\epsilon>1$. Hence, the limits of integration in \\eqref{lec9:eqn:dudley} can be truncated to $[0,1]$:\n\\begin{equation}\n    R_S(\\mathcal{F})\\leq 12\\int_{0}^{1}\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon,\n\\end{equation}\n    since $\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))=0$ for $\\epsilon >1$.\n\\end{remark}\n\n\\subsec{Translating Covering Number Bounds to Rademacher Complexity} \\label{lec9:sec:cover_to_radem}\n\nOf course, the bound in \\eqref{lec9:eqn:dudley} is only useful if the integral on the RHS is finite. 
Here are some setups where this is the case (we continue to assume that the functions in $\\cF$ are bounded in $[-1, 1]$):\n\n\\begin{enumerate}\n\\item If after ignoring multiplicative and additive constants,\n\\begin{equation}\n    N(\\epsilon, \\mathcal{F}, L_2(P_n))\\approx (1 / \\epsilon)^R,\n\\end{equation}\nthen we have $\\log N(\\epsilon, \\mathcal{F}, L_2(P_n)) \\approx  R\\log (1/\\epsilon)$. We can plug this into the RHS of \\eqref{lec9:eqn:dudley} to get\n\\begin{equation}\n\\int_{0}^{1}\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon = \\int_{0}^1\\sqrt{\\frac{R\\log(1/\\epsilon)}{n}}d\\epsilon \\approx \\sqrt{\\frac{R}{n}}.\n\\end{equation}\n            \n\\item If after ignoring multiplicative and additive constants, for some $a$,\n\\begin{equation}\n    N(\\epsilon, \\mathcal{F}, L_2(P_n))\\approx a^{R/\\epsilon},\n\\end{equation}\nthen we have $\\log N(\\epsilon, \\mathcal{F}, L_2(P_n)) \\approx \\frac{R}{\\epsilon}\\log a$. The bound in \\eqref{lec9:eqn:dudley} becomes\n        \n\\begin{align}\n\\int_0^1\\!\\!\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon &\\approx \\int_0^1\\!\\!\\sqrt{\\frac{R}{n\\epsilon}\\log a}\\, d\\epsilon \\\\\n&= \\sqrt{\\frac{R}{n}\\log a} \\int_0^1\\!\\!\\sqrt{\\frac{1}{\\epsilon}}d\\epsilon \\\\\n&= \\tilO \\l(\\sqrt{\\frac{R}{n}}\\r).\n\\end{align}\n        \n\\item If the covering number has the form $N(\\epsilon, \\mathcal{F}, L_2(P_n))\\approx a^{R/\\epsilon^2}$, then $\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))\\approx \\frac{R}{\\epsilon^2}\\log a$. In this case we have:\n        \n\\begin{equation}\\int_0^1\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon \\approx \\sqrt{\\frac{R}{n}\\log a} \\underbrace{\\int_0^1\\frac{1}{\\epsilon}d\\epsilon}_{=\\infty}=\\infty,\n\\end{equation}\n\ni.e. the bound in \\eqref{lec9:eqn:dudley} is vacuous. 
This is because of the behavior of $\\epsilon \\mapsto 1/\\epsilon^2$ near 0: the function goes to infinity too quickly for us to upper bound its integral. Fortunately, there is an ``improved'' version of Dudley's theorem that is applicable here:\n        \n\\begin{theorem}[Localized Dudley's Theorem]\\label{lec9:thm:better-dudley}\nIf $\\cF$ is a function class from $Z \\mapsto \\R$, then for any fixed cutoff $\\alpha \\geq 0$ we have the bound\n\\begin{equation}\\label{lec9:eqn:better-dudley}\nR_S(\\mathcal{F})\\leq 4\\alpha + 12\\int_{\\alpha}^{\\infty}\\sqrt{\\frac{\\log N(\\epsilon, \\mathcal{F}, L_2(P_n))}{n}}d\\epsilon.      \n\\end{equation}\n\\end{theorem}\nThe proof of this theorem is similar to the proof of the original Dudley's theorem, except that the iterative covering procedure is stopped at the threshold $\\epsilon = \\alpha$ at the cost of the extra $4\\alpha$ term above.\n        \nTheorem \\ref{lec9:thm:better-dudley} allows us to avoid the problematic region around $\\epsilon=0$ in the integral in \\eqref{lec9:eqn:dudley}. If we let $\\alpha = 1/\\mathsf{poly}(n)$, where $\\mathsf{poly}(n)$ denotes some polynomial function of $n$, the bound in \\eqref{lec9:eqn:better-dudley} becomes\n\\begin{align}\nR_S(\\mathcal{F}) &\\leq \\frac{1}{\\mathsf{poly}(n)} + \\frac{\\sqrt{R\\log a}}{\\sqrt{n}}\\int_{\\alpha}^1\\frac{1}{\\epsilon}d\\epsilon \\\\\n&= \\frac{1}{\\mathsf{poly}(n)}  + \\frac{\\sqrt{R\\log a}}{\\sqrt{n}} \\log(1/\\alpha) \\\\\n&= \\tilO \\l(\\sqrt{\\frac{R}{n}}\\r). \\label{lec9:eqn:rademacherbound_three}\n\\end{align}\n\\end{enumerate}\nThe last line follows by observing that $\\log(1/\\alpha) = \\log \\mathsf{poly}(n)$.\n\nIn summary, we have that $R_S(\\mathcal{F}) \\leq \\tilO\\l(\\sqrt{\\frac{R}{n}}\\r)$ for covering numbers of the form $R\\log (1/\\epsilon)$,$\\frac{R}{\\epsilon} \\log a$, or $\\frac{R}{\\epsilon^2} \\log a$ for some $a$. 
Note that if the dependence on $\\epsilon$ is $1/\\epsilon^c$ for $c > 2$, then even the improved Dudley's theorem does not help us. This is because the $\\log(1/\\alpha)$ term above becomes $\\alpha^{1-c/2}$; then, for $\\alpha = 1/\\mathsf{poly}(n)$, the second term in Dudley's integral is no longer $\\tilO \\l(\\sqrt{\\frac{R}{n}}\\r )$.\n\n\\subsec{Lipschitz composition}\nCovering numbers also interact nicely with composition by Lipschitz functions. The following result is the analog of Talagrand's lemma for Rademacher complexity (Lemma~\\ref{lec6:lem:talagrand_lemma}), but its proof is much more elementary as given below. We will use this Lemma in Section~\\ref{sec:deep_nets} when bounding the covering number of deep nets. \n\\begin{lemma} \\label{lec9:lma:talagrand}\n\tSuppose $\\phi$ is $\\kappa$-Lipschitz, and $\\rho = L_2(P_n)$. Then,\n\t\\begin{align}\n\t\\log N(\\epsilon, \\phi \\circ \\cF, \\rho) \\le \\log N(\\epsilon / \\kappa, \\cF, \\rho) \\label{lec9:eqn:covering-num-lipschitz}\n\t\\end{align}\n\\end{lemma}\n\\begin{proof}\n\tLet $\\cC$ denote an $\\epsilon/\\kappa$-cover for $\\cF$. Then $\\phi \\circ \\cC$ is an $\\epsilon$-cover of $\\phi \\circ \\cF$.\n\t\\begin{align}\n\t\\rho(\\phi \\circ f', \\phi \\circ f) &= \\sqrt{\\frac{1}{n} \\sum (\\phi(f'(z_i)) - \\phi(f(z_i)))^2} \\\\ \n\t&\\le \\sqrt{\\frac{1}{n} \\cdot \\kappa^2 \\sum(f'(z_i) - f(z_i))^2}\\\\\n\t&\\le \\kappa \\cdot \\frac{\\epsilon}{\\kappa} = \\epsilon\n\t\\end{align}\n\\end{proof}\n\n\\sec{VC dimension and its limitations}\nIn this section, we briefly discuss a classical notion of complexity measure of function class, VC dimension. We will show that VC dimension is an upper bound on the Rademacher complexity. We will focus on classification and will be working within the framework of supervised learning stated in Chapter \\ref{chap:supervised}. 
The labels belong to the output space $\\mathcal{Y} = \\{-1, 1\\}$, each classifier is a function $h:\\mathcal{X}\\to\\R$ for all $h \\in \\cH$, and the prediction is the sign of the output, i.e. $\\hat{y} = \\sgn(h(x))$. We will look at zero-one loss, i.e. $\\err((x,y), h) = \\mathbbm{1}(\\sgn(h(x))\\neq y)$. Note that we can re-express the loss function as\n\\begin{equation}\n\\err((x,y), h) = \\frac{1-\\sgn(h(x))y}{2}.\n\\end{equation}\n\nThe first approach is to reason directly about the Rademacher complexity of $\\err$ loss, i.e. considering the family of functions $\\cF = \\left\\{ z = (x, y) \\mapsto \\err((x, y), h) : h \\in \\cH \\right\\}$. Define $Q$ to be the set of all possible outputs on our dataset: $Q=\\left\\{\\left(\\sgn\\left(h\\left(x^{(1)}\\right)\\right), \\dots, \\sgn \\left(h\\left(x^{(n)}\\right)\\right)\\right)\\mid  h \\in \\cH \\right\\}$. Then, using our earlier remark about viewing the empirical Rademacher complexity as an inner product between $v\\in Q$ and $\\sigma$, we have\n\\begin{align}\nR_S(\\cF) &= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i \\frac{1-\\sgn(h(x^{(i)}))y_i}{2} \\r] \\\\\n&= \\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{f\\in \\cF} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i \\frac{\\sgn(h(x^{(i)}))}{2} \\r] \\\\\n&= \\frac{1}{2}\\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{v\\in Q} \\frac{1}{n} \\langle \\sigma, v\\rangle \\r].\n\\end{align}\n\nNotice that the supremum is now over $Q$ instead of $\\cF$. If $n$ is sufficiently large, then it is typically the case that $|Q|>|\\cF|$. To see why this is the case, note that each function $f$ corresponds to a single element in $Q$. However, as $n$ increases, $|Q|$ increases as well. 
For any particular $v\\in Q$, notice that $\\langle v, \\sigma\\rangle$ is a sum of bounded random variables, so we can use Hoeffding's inequality to obtain\n\\begin{equation}\n\\Pr\\left[\\frac{1}{n}\\langle\\sigma, v\\rangle\\geq t\\right] \\leq \\exp (-n t^2 / 2).\n\\end{equation}\nTaking the union bound over $v\\in Q$, we see that \n\\begin{equation}\n\\Pr\\left[\\exists v\\in Q \\text{ such that } \\frac{1}{n}\\langle\\sigma, v\\rangle \\geq t\\right] \\leq |Q| \\exp (-nt^2 / 2).\n\\end{equation}\nThus, with probability at least $1-\\delta$, it is true that $\\sup _{v \\in Q} \\frac{1}{n}\\langle v, \\sigma \\rangle \\leq \\sqrt{\\frac{2(\\log|Q| + \\log (2/\\delta))}{n}}$. Similarly, we can show that $\\Exp \\left[ \\sup _{v \\in Q} \\frac{1}{n}\\langle v, \\sigma \\rangle \\right] \\leq O\\l(\\sqrt{\\frac{\\log|Q| + \\log (2/\\delta)}{n}}\\r)$ holds.\n\nThe key point to notice here is that the upper bound on $R_S(\\cF)$ depends on $\\log |Q|$. \\textit{VC dimension} is one way that we deal with bounding the size of $Q$. We will not delve into the details of this approach (for those interested, see Section 3.11 of \\cite{percynotes}). VC dimension, however, has a number of limitations. For one, we will always end up with a bound that depends somehow on the dimension. For linear models, we obtain a bound $\\log |Q| \\lesssim d \\log n$, corresponding to a bound on Rademacher complexity that looks like\n\\begin{equation}\nR_S(\\cF) \\leq \\tilO \\left( \\sqrt{\\frac{d}{n}} \\right),\n\\end{equation}\nso we still have a $\\sqrt{d}$ term. This will not be a good bound for high-dimensional models. For general models, we will arrive at a bound of the form \n\\begin{equation}\nR_S(\\cF) \\leq \\tilO \\left( \\sqrt{\\frac{\\text{\\# of parameters}}{n}} \\right).\n\\end{equation}\nThis upper bound only depends on the number of parameters in our model, and does not take into account the scale and norm of the parameters. 
Additionally, this doesn't work with kernel methods since the explicit parameterization is possibly infinite-dimensional, and therefore this upper bound becomes useless.\n\n\nThese limitations motivate the use of margin theory, which does take into account the norm of parameters and provides a theoretical basis for regularization techniques such as $L_1$ and $L_2$ regularization.\n"
  },
  {
    "path": "tex/collection/05-01-concrete-models.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{6}{Daniel Do}{February 1st, 2021}\n\nIn this chapter, we will instantiate Rademacher complexity for two important hypothesis classes: linear models and two-layer neural networks. In the process, we will develop margin theory and use it to bound the generalization gap for binary classifiers.\n\n\\sec{Margin theory for classification problems}\n\n\\subsec{Intuition}\nAssume that we are in the same setting as in the previous section. A fundamental problem we face in this setting is that we do not have a continuous loss: everything is discrete in the output space. We need to find a way to reason about the scale of the output. An example of this is logistic regression: the logistic regression model outputs a probability, and when we compare it to the outcome (0 or 1), its closeness to the true output gives us a measure of how confident we are in the prediction.\n\nFigure \\ref{lec6:fig:margin} gives similar intuition for linear classifiers. Intuitively, the black line is a ``better'' decision boundary than the red line because the minimum distance from any point to the black boundary is greater than the minimum distance from any point to the red boundary. In the next section, we will formalize this intuition by proving that the larger this margin is, the smaller the bound on the generalization gap is.\n\n\\begin{figure}[ht!]\n    \\begin{center}\n  \\includegraphics[width=0.5\\textwidth]{figures/margin.png}\n  \\end{center}\n  \\caption{The red and black lines are two decision boundaries. The X's are positive examples and the O's are negative examples. 
The black line has a larger margin than the red line, and is intuitively a better classifier.}\n  \\label{lec6:fig:margin}\n\\end{figure}\n\n\\subsec{Formalizing margin theory} \\label{sec:formal_margin}\nFirst, assume that the dataset $\\cD = ((x\\sp{1}, y\\sp{1}), \\dots, (x\\sp{n}, y\\sp{n}))$ is \\textit{completely separable}. In other words, there exists some $h_\\theta\\in\\cH$ such that $y^{(i)} = \\sgn(h_\\theta(x^{(i)}))$ holds for all $( x^{(i)},y^{(i)})\\in \\cD$. This is not a necessary condition for our final bound but will make the derivation cleaner.\n\n\\begin{definition}[(Unnormalized) Margin]\nFix the hypothesis $h_\\theta$. The \\textit{(unnormalized) margin} for example $(x, y)$ is defined as $\\margin(x) = yh_\\theta(x)$. Margin is only defined on examples where $\\sgn(h_\\theta(x)) = y$. (Note that $\\margin(x)\\geq 0$ because of our assumption of complete separability.)\n\\end{definition}\n\n\\begin{definition}[Minimum margin] Given a dataset $\\cD = ((x\\sp{1}, y\\sp{1}), \\dots, (x\\sp{n}, y\\sp{n}))$, the \\textit{minimum margin} over the dataset is defined as $\\gamma_{\\min} \\triangleq \\min_{i\\in\\{1,\\dots,|\\cD|\\}} y^{(i)}h_\\theta(x^{(i)})$.\n\\end{definition}\n\nOur final bound will have the form (generalization gap) $\\leq f(\\text{margin},\\text{parameter norm})$. This is very generic since there are many different bounds we could derive based on what margin we use. For this current setting we are using $\\gamma_{\\min}$, which is the minimum margin, but in other settings could use $\\gamma_{\\text{average}}$, which is the average margin of each point in the dataset.\n\nWe will begin by introducing the idea of a \\textit{surrogate loss}, a loss function which approximates zero-one loss but takes the scale of the margin into account. 
The \\textit{margin loss} (also known as \\textit{ramp loss}) is defined as \n\\begin{equation}\n    \\ell_\\gamma(t) = \\begin{cases} \n      0, & t\\geq \\gamma \\\\\n      1, & t\\leq 0 \\\\\n      1-t/\\gamma, & 0\\leq t\\leq \\gamma\n   \\end{cases} \\label{lec6:eqn:ramp_loss}\n\\end{equation}\n\n\\begin{figure}[ht!]\n    \\begin{center}\n  \\includegraphics[width=0.5\\textwidth]{figures/margin_loss.png}\n  \\end{center}\n  \\caption{Plotted margin loss.}\n  \\label{lec6:fig:marginloss}\n\\end{figure}\n\nIt is plotted in Figure \\ref{lec6:fig:marginloss}. For convenience, define $\\ell_\\gamma((x,y), h) \\triangleq \\ell_\\gamma(yh(x))$. We can view $\\ell_\\gamma$ as a continuous version of $\\err$ that is more sensitive to the scale of the margin on $[0,\\gamma]$. Notice that $\\err$ is always less than or equal to the $\\ell_\\gamma$ when $\\gamma\\geq 0$, i.e.\n\\begin{equation}\n    \\err((x,y), h) = \\ind{yh(x) < 0}\\leq \\ell_\\gamma(yh(x)) =\\ell_\\gamma ((x,y), h)\n\\end{equation}\nholds for all $(x,y)\\sim P$. 
Taking the expectation over $(x,y)$ on both sides of this inequality, we see that\n\\begin{equation}\n    L(h) = \\Exp_{(x,y)\\sim P} \\left[ \\err((x,y), h) \\right] \\leq \\Exp_{(x,y)\\sim P} \\left[ \\ell_\\gamma ((x,y), h) \\right].\n\\end{equation}\n\nTherefore, the population loss is bounded by the expectation of the margin loss, and so it is sufficient to bound the expectation of the margin loss in order to bound the population loss.\n\nDefine the population and empirical versions of the margin loss:\n\\begin{equation}\nL_\\gamma(h) = \\Exp_{(x,y)\\sim P}\\l[ \\ell_\\gamma((x,y), h)\\r], \\quad \\hat{L}_\\gamma(h) = \\frac{1}{n}\\sum_{i=1}^n\\l [\\ell_\\gamma((x^{(i)},y^{(i)}), h)\\r].\n\\end{equation}\n\nBy Corollary \\ref{lec6:cor:ggap-rsbound}, we see that with probability at least $1-\\delta$,\n\\begin{equation}\nL_\\gamma(h) - \\hat{L}_\\gamma(h)\\leq 2R_S(\\cF) + 3\\sqrt{\\frac{\\log (2/\\delta)}{2n}},\n\\end{equation}\nwhere $\\cF = \\{(x,y)\\mapsto \\ell_\\gamma((x,y), h)\\mid h\\in\\cH\\}$. Note that if we set $\\gamma\\leq \\gamma_{\\min}$, then $\\hat{L}_{\\gamma}(h) = 0$. This follows because by definition of $\\gamma_{\\min}$, $y^{(i)}h(x^{(i)})\\geq \\gamma_{\\min}$ for any $(x^{(i)}, y^{(i)})\\in \\cD$. As a result, $\\ell_\\gamma((x^{(i)}, y^{(i)}), h) = \\ell_\\gamma(y^{(i)}h(x^{(i)})) = 0$ holds. Therefore, it suffices to bound $R_S(\\cF)$.\n\nWe will now use \\textit{Talagrand's lemma} to bound $R_S(\\cF)$ in terms of $R_S(\\cH)$ to remove any dependence on the loss function from the upper bound. \n \n\\begin{lemma}[Talagrand's lemma] \\label{lec6:lem:talagrand_lemma}\nLet $\\phi:\\R\\to\\R$ be a $\\kappa$-Lipschitz function. Then \\begin{equation}\n    R_S(\\phi\\circ \\cH)\\leq \\kappa R_S(\\cH),\n\\end{equation} \nwhere $\\phi\\circ\\cH = \\{z\\mapsto \\phi(h(z))\\mid h\\in\\cH\\}$.\n\\end{lemma}\n\nWe can use Talagrand's lemma directly with $\\phi(t) = \\ell_\\gamma(t)$, which is $\\frac{1}{\\gamma}$-Lipschitz. 
We can express $\\cF$ as $\\cF=\\ell_\\gamma\\circ\\cH'$ where $\\cH' = \\{(x,y)\\to yh(x)\\mid h\\in\\cH\\}$. Applying Talagrand's lemma, we see that\n\n\\begin{align}\nR_S(\\cF) &\\leq \\frac{1}{\\gamma}R_S(\\cH') \\\\\n&= \\frac{1}{\\gamma}\\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{h\\in \\cH} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i y^{(i)}h(x^{(i)}) \\r] \\\\\n&= \\frac{1}{\\gamma}\\Exp_{\\sigma_1,\\dots, \\sigma_n} \\l[ \\sup_{h\\in \\cH} \\frac{1}{n} \\sum^n_{i=1} \\sigma_i h(x^{(i)})  \\r] \\\\\n&= \\frac{1}{\\gamma}R_S(\\cH).\n\\end{align}\n\nPutting this all together, we have shown that for $\\gamma = \\gamma_{\\min}$,\n\\begin{align}\n\\Err(h) \\leq L_\\gamma(h) &\\leq 0 + O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) + \\tilO \\left( \\sqrt{\\frac{\\log (2 / \\delta)}{2n}} \\right) \\\\\n&= O \\left( \\frac{R_S(\\cH)}{\\min_i y\\sp{i} h(x\\sp{i}) } \\right) + \\tilO \\left( \\sqrt{\\frac{\\log (2 / \\delta)}{2n}} \\right).\n\\end{align}\n\nIn other words, for training data of the form $S = \\{(x\\sp{i},y\\sp{i})\\}_{i=1}^n \\subset \\mathbb{R}^d \\times \\{-1,1\\}$, a hypothesis class~$\\mathcal{H}$ and 0-1 loss, we can derive a bound of the form\n\\begin{equation}\\label{lec7:eqn:generalization_loss}\n    \\text{generalization loss} \\leq \\frac{2R_S(\\mathcal{H})}{\\gamma_{\\mathrm{min}}} + \\text{low-order term},\n\\end{equation}\nwhere $\\gamma_\\mathrm{min}$ is the minimum margin achievable on~$S$ over those hypotheses in $\\cH$ that separate the data, and $R_S(\\cH)$ is the empirical Rademacher complexity of $\\cH$. Such bounds state that simpler models will generalize better beyond the training data, particularly for data that is strongly separable.\n\n\\begin{remark} \\label{lec7:rmk:union_bound_margin}\nNote there is a subtlety here. If we think of the dataset as random, it follows that $\\gamma_{\\min}$ is a random variable. 
Consequently, the $\\gamma$ we choose to define the hypothesis class is random, which is not a valid choice when thinking about Rademacher complexity! Technically we cannot apply Talagrand's lemma with a random $\\kappa$ (which we took to be $1/\\gamma$). Also, when we use concentration inequalities, we implicitly assume that the $\\ell_\\gamma((x\\sp{i}, y\\sp{i}), h)$ are independent of each other. That is not the case if $\\gamma$ is dependent on the data.\n\nWe sketch out how one might address this issue below. The main idea is to do another union bound over $\\gamma$. Choose a family $\\Gamma = \\left\\{ 2^k: k \\in [-B, B] \\right\\}$ for some $B$. Then, for every fixed $\\gamma \\in \\Gamma$, with probability greater than $1 - \\delta$,\n\\begin{align}\n\\Err(h) \\leq \\hatL_\\gamma (h) + O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) + \\tilO \\left( \\sqrt{\\frac{\\log \\frac{1}{\\delta}}{n}} \\right).\n\\end{align}\nTaking a union bound over all $\\gamma \\in \\Gamma$, it further holds that for all $\\gamma \\in (0, B)$, \n\\begin{align}\n    \\Err(h) \\leq \\hatL_\\gamma (h) + O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) + \\tilO \\left( \\sqrt{\\frac{\\log \\frac{1}{\\delta}}{n}}\\right) + \\tilO \\left ( \\sqrt{\\frac{\\log B}{n}} \\right ). \\label{lec7:eqn:unionboundmargin}\n\\end{align}\nLast, choose the largest $\\gamma \\in \\Gamma$ such that $\\gamma \\leq \\gamma_{\\min}$. Then, for this value of $\\gamma$, our desired bound directly follows from the bound in \\eqref{lec7:eqn:unionboundmargin}. Namely, we have that $\\hatL_{\\gamma} (h) = 0$ and $O \\left( \\frac{R_S(\\cH)}{\\gamma} \\right) = O \\left( \\frac{R_S(\\cH)}{\\gamma_{\\min}} \\right)$. The additional term, $\\tilO\\left ( \\sqrt{\\frac{\\log B}{n} }\\right )$, is the price exacted by the uniform convergence argument required to correct the heuristic bound given in \\eqref{lec7:eqn:generalization_loss}.\n\n\\end{remark}\n"
  },
  {
    "path": "tex/collection/05-02-concrete-models.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{7}{Spencer M.~Richards and Thomas Lew}{Feb.~3rd, 2021}\n\n\\sec{Linear models}\\label{lec7:sec:lin_models}\n\n\\subsec{Linear models with weights bounded in \\texorpdfstring{$\\ell_2$}{L2} norm}\nWe begin with the Rademacher complexity of linear models using weights with bounded $\\ell_2$ norm.\n\n\\begin{theorem}\\label{lec7:thm:l2-thm}\n    Let $\\mathcal{H} = \\{x \\mapsto \\inprod{w,x} \\mid w \\in \\R^d, \\Norm{w}_2 \\le B\\}$ for some constant $B > 0$. Moreover, assume $\\Exp_{x \\sim P}\\sbr{\\Norm{x}_2^2} \\leq C^2$, where $P$ is some distribution and $C > 0$ is a constant. Then\n    \\begin{align}\n        R_S(\\mathcal{H}) &\\le \\frac{B}{n} \\sqrt{\\sum_{i=1}^n \\Norm{x\\sp{i}}_2^2},  \\label{lec7:eqn:linear-sample}\n        \\intertext{and}\n        R_n(\\mathcal{H}) &\\le \\frac{BC}{\\sqrt{n}}.  \\label{lec7:eqn:linear}\n    \\end{align}\n\\end{theorem}\n\nGenerally speaking, there are two methods with which we can bound the Rademacher complexity of a model. The first method, which we used in Chapter \\ref{chap:uc}, consists of discretizing the space of possible outputs from our hypothesis class, then using a union bound or covering number argument to bound the Rademacher complexity of the model. While this method is powerful and generally applicable, it yields bounds that depend on the logarithm of the cardinality of this discretized output space, which in turn depends on the number of data points~$n$. In the proof below, we will instead use a more elegant, albeit limited technique which does not rely on discretization of the output space.\n\n\\begin{proof}\nWe start with the proof of \\eqref{lec7:eqn:linear-sample}. 
By definition,\n\\begin{align}\n    R_S(\\mathcal{H}) \n    &= \\Exp_\\sigma\\sbr{ \\sup_{\\Norm{w}_2 \\le B} \\frac{1}{n} \\sum_{i=1}^n\\sigma_i \\inprod{w,x\\sp{i}} }\n    \\\\&= \\frac{1}{n} \\Exp_\\sigma\\sbr{ \\sup_{\\Norm{w}_2 \\le B} \\inprod{w,\\sum_{i=1}^n\\sigma_i x\\sp{i}} }\n    \\\\&= \\frac{B}{n} \\Exp_\\sigma\\sbr{ \\Norm{\\sum_{i=1}^n \\sigma_i  x\\sp{i}}_2 }\n        &&\\text{($\\textstyle\\sup_{\\Norm{w}_2 \\le B} \\langle w,v\\rangle =B\\Norm{v}_2$)}\n    \\\\&\\leq \\frac{B}{n} \\sqrt{ \\Exp_\\sigma\\sbr{\\Norm{ \\sum_{i=1}^n \\sigma_i x\\sp{i} }_2^2} }\n        &&\\text{(Jensen's ineq. for $\\alpha \\mapsto \\alpha^2$)} \n    \\\\&= \\frac{B}{n} \\sqrt{ \\Exp_\\sigma \\sbr{\\sum_{i=1}^n \\rbr{\\sigma_i^2 \\Norm{x\\sp{i}}_2^2 + \\inprod{\\sigma_ix\\sp{i},\\sum_{j \\ne i}^n \\sigma_j x\\sp{j}} }} }\n    \\\\&= \\frac{B}{n} \\sqrt{\\sum_{i=1}^n \\Norm{x\\sp{i}}_2^2}.\n        &&\\text{($\\sigma_i$ indep. and $\\Exp[\\sigma_i]=0$)}\n\\end{align}\nThis completes the proof of \\eqref{lec7:eqn:linear-sample} for the empirical Rademacher complexity. The bound on the average Rademacher complexity in \\eqref{lec7:eqn:linear} follows from taking the expectation of both sides to get\n\\begin{equation}\n    R_n(\\mathcal{H}) = \\Exp\\sbr{ R_S(\\mathcal{H}) }\n    = \\frac{B}{n} \\Exp\\sbr{ \\sqrt{\\sum_{i=1}^n \\Norm{x\\sp{i}}_2^2} }\n    \\le \\frac{B}{n} \\sqrt{ \\sum_{i=1}^n \\Exp\\sbr{\\Norm{x\\sp{i}}_2^2} }\n    \\le \\frac{BC}{\\sqrt{n}},\n\\end{equation}\nwhere the first inequality is another application of Jensen's inequality, and the second follows from the assumption $\\Exp_{x \\sim P}\\sbr{\\Norm{x}_2^2} \\leq C^2$.\n\n\\end{proof}\n\nWe observe that both the empirical and average Rademacher complexities scale with the upper $\\ell_2$-norm bound $\\Norm{w}_2 \\le B$ on the parameters~$w$, which motivates regularizing the model. 
However, smaller weights in the model may reduce the margin $\\gamma_\\mathrm{min}$, which in turn hurts generalization according to \\eqref{lec7:eqn:generalization_loss}.\n\n\\begin{remark}\nNote that if we scale the data by some multiplicative factor, the bound on empirical Rademacher complexity $R_S(\\cH)$ will scale accordingly. However, at the same time, we expect the margin to scale by the same multiplicative factor, so the bound on the generalization gap in \\eqref{lec7:eqn:generalization_loss} does not change. This lines up with our intuition that the bound should not depend on the scaling of the data.\n\\end{remark}\n\n\\subsec{Linear models with weights bounded in \\texorpdfstring{$\\ell_1$}{L1} norm}\nNow, we consider linear models again, except we restrict the $\\ell_1$-norm of the parameters and assume an $\\ell_\\infty$-norm bound on the data.\n\n\\begin{theorem}\\label{lec7:thm:l1-thm}\n    Let $\\mathcal{H} = \\cbr{x \\mapsto \\inprod{w,x} \\mid w \\in \\R^d, \\Norm{w}_1 \\le B}$ for some constant $B > 0$. Moreover, assume $\\Norm{x\\sp{i}}_\\infty \\leq C$ for some constant $C > 0$ and all points in $S = \\{x\\sp{i}\\}_{i=1}^n \\subset \\R^d$. 
Then\n    \\begin{equation}\n        R_S(\\mathcal{H}) \\leq BC\\sqrt{\\frac{2\\log(2d)}{n}}.\n    \\end{equation}\n\\end{theorem}\n\nTo prove the theorem, we will need Massart's lemma, which provides a bound for the Rademacher complexity of a finite hypothesis class.\n\n    \\begin{lemma}[Massart's lemma]\n        Suppose $\\mathcal{Q} \\subset \\R^n$ is finite and contained in the $\\ell_2$-norm ball of radius $M\\sqrt{n}$ for some constant $M > 0$, i.e.,\n        \\begin{equation}\n            \\mathcal{Q} \\subset \\{v \\in \\R^n \\mid \\Norm{v}_2 \\leq M\\sqrt{n} \\}.\n        \\end{equation}\n        Then, for Rademacher variables $\\sigma = (\\sigma_1,\\sigma_2,\\dots,\\sigma_n) \\in \\R^n$,\n        \\begin{equation}\n            \\Exp_\\sigma \\left[ \\sup_{v\\in \\mathcal{Q}} \\frac{1}{n}\\inprod{\\sigma,v} \\right] \\leq M\\sqrt{\\frac{2\\log|\\mathcal{Q}|}{n}}.\n        \\end{equation}\n        As a corollary, if $\\mathcal{F}$ is a set of real-valued functions satisfying\n        \\begin{equation}\n            \\sup_{f\\in\\mathcal{F}} \\frac{1}{n}\\sum_{i=1}^n f(z\\sp{i})^2 \\leq M^2,\n        \\end{equation}\n        over some data $S = \\{z\\sp{i}\\}_{i=1}^n$, then\n        \\begin{align}\n            R_S(\\mathcal{F}) \\leq M\\sqrt{\\frac{2\\log|\\mathcal{F}|}{n}}, \\quad\\text{and}\\quad\n            R_n(\\mathcal{F}) \\leq M\\sqrt{\\frac{2\\log|\\mathcal{F}|}{n}}.\n        \\end{align}\n    \\end{lemma}\n\nWe will not prove Massart's lemma in detail. 
The intuition is to use concentration inequalities to bound $\\frac{1}{n}\\inprod{\\sigma, v}$ for fixed $v$, then to use a union bound over the elements $v \\in \\mathcal{Q}$.\n\nWe will now prove Theorem \\ref{lec7:thm:l1-thm}:\n\n\\begin{proof}[Proof of Theorem \\ref{lec7:thm:l1-thm}]\n    By definition,\n    \\begin{align}\n        R_S(\\mathcal{H}) &= \\Exp_\\sigma\\sbr{ \\sup_{\\Norm{w}_1 \\le B} \\frac{1}{n} \\sum_{i=1}^n\\sigma_i \\inprod{w,x\\sp{i}} } \\\\\n        &= \\frac{1}{n} \\Exp_\\sigma\\sbr{ \\sup_{\\Norm{w}_1\\le B} \\inprod{w,\\sum_{i=1}^n\\sigma_i x\\sp{i}} } \\\\\n        &= \\frac{B}{n} \\Exp_\\sigma\\sbr{ \\Norm{\\sum_{i=1}^n \\sigma_i  x\\sp{i}}_\\infty  },\n    \\end{align}\n    \n    where the last equality is because $\\sup_{\\Norm{w}_1 \\leq B}\\inprod{w,v} = B\\Norm{v}_\\infty$, i.e., the $\\ell_\\infty$-norm is the dual of the $\\ell_1$-norm, which is a consequence of H\\\"older's inequality. However, the $\\ell_\\infty$-norm is difficult to simplify further. Instead, we use the fact that $\\sup_{\\Norm{w}_1 \\leq 1} \\inprod{w,v}$ for any $v \\in \\R^d$ is always attained at one of the vertices $\\mathcal{W} = \\bigcup_{i=1}^d \\{-e_i,e_i\\}$, where $e_i \\in \\R^d$ is the $i$-th coordinate unit vector. Defining the restricted hypothesis class $\\bar{\\mathcal{H}} = \\{x \\mapsto \\inprod{w,x} \\mid w \\in \\mathcal{W}\\} \\subset \\mathcal{H}$, this yields\n    \\begin{align}\n        R_S(\\mathcal{H}) &= \\frac{1}{n} \\Exp_\\sigma\\sbr{ \\sup_{\\Norm{w}_1 \\le B} \\inprod{w,\\sum_{i=1}^n\\sigma_i x\\sp{i}} } \\\\\n        &= \\frac{B}{n} \\Exp_\\sigma\\sbr{ \\max_{w\\in\\mathcal{W}} \\inprod{w,\\sum_{i=1}^n\\sigma_i x\\sp{i}} } \\\\\n        &= BR_S(\\bar{\\mathcal{H}}).\n    \\end{align}\n    \n    In particular, the model class $\\bar{\\mathcal{H}}$ is bounded and finite with cardinality $|\\bar{\\mathcal{H}}| = 2d$. This suggests using Massart's lemma to complete the proof. 
To do so, we need to confirm that $\\mathcal{\\bar{H}}$ is bounded with respect to the $\\ell_2$-metric. Indeed, since the inner product of $x\\sp{i}$ with a coordinate vector $e_j$ just selects the $j$-th coordinate of $x\\sp{i}$, for any $w \\in \\mathcal{W}$ we have\n    \\begin{equation}\n        \\frac{1}{n}\\sum_{i=1}^n \\inprod{w,x\\sp{i}}^2 \\leq \\frac{1}{n}\\sum_{i=1}^n \\Norm{x\\sp{i}}^2_\\infty \\leq \\frac{1}{n}\\sum_{i=1}^n C^2 = C^2,\n    \\end{equation}\n    where the last inequality uses the assumption $\\Norm{x_i}_\\infty \\leq C$. So $\\bar{\\mathcal{H}}$ is bounded in the $\\ell_2$-metric and finite, thus by Massart's Lemma we have\n    \\begin{equation}\n        R_S(\\mathcal{H}) = B R_S(\\bar{\\mathcal{H}}) \\leq BC\\sqrt{\\frac{2\\log|\\bar{\\mathcal{H}}|}{n}} = BC\\sqrt{\\frac{2\\log(2d)}{n}},\n    \\end{equation}\n    which completes the proof.\n\\end{proof}\n\n\\subsec{Comparing the bounds for different \\texorpdfstring{$\\cH$}{H}}\n\nFirst, we note that for this hypothesis class of linear models, it is possible to obtain an upper bound proportional to $\\sqrt{d/n}$ using the VC~dimension, which grows quickly with the data dimension~$d$. Our bound is better since it does not have as strong of a dependence on~$d$, and accounts for the norms of our model parameters and the data.\n\nIn the two subsections above, we considered two different hypothesis classes of linear models, each restricting different norms. In both cases, the bound on the average Rademacher complexity depended on the product of the norm bound on the parameters $w$ and the norm bound on each data point $x$. To determine which choice of hypothesis class is better, consider the bounds\n    \\begin{equation*}\n        \\Norm{w}_2\\Norm{x}_2 \\quad\\text{vs.}\\quad \\Norm{w}_1\\Norm{x}_\\infty\n    \\end{equation*}\n    and see how they compare in different settings. 
We consider 3 settings here:\n    \n    \\begin{itemize}\n    \\item Suppose $w$ and $x$ are random variables with $w_i$ and $x_i$ close to the set of values $\\{-1,1\\}$. Then we have\n    \\begin{equation*}\n        \\sqrt{d}\\cdot \\sqrt{d} \\quad\\text{vs.}\\quad d\\cdot 1.\n    \\end{equation*}\n    In this case, there is no difference in using either linear hypothesis class.\n    \n    \\item If we additionally suppose $w$ is sparse with at most $k$ non-zero entries, then we have\n    \\begin{equation*}\n        \\sqrt{k}\\cdot\\sqrt{d} \\quad\\text{vs.}\\quad k\\cdot 1.\n    \\end{equation*}\n    So for $d \\gg k$, we have $\\sqrt{kd} \\gg k$ and thus $\\ell_1$-norm regularization leads to a better complexity bound when $w$ is suspected to be sparse. Indeed, $\\sqrt{d}\\Norm{x}_\\infty \\approx \\Norm{x}_2$ when the entries of $x$ are somewhat uniformly distributed, and so in the sparse case we have\n    \\begin{equation}\n        \\Norm{w}_2\\Norm{x}_2 \\geq \\sqrt{d}\\Norm{w}_2\\Norm{x}_\\infty \\geq \\Norm{w}_1\\Norm{x}_\\infty. 
\n    \\end{equation}\n    \n    \\item On the other hand, if $w$ is dense in the sense that $\\Norm{w}_2\\approx {\\sqrt{d}}\\Norm{w}_1$ (i.e., if all entries in $w$ are close to each other in magnitude), then\n    \\begin{equation}\n        \\Norm{w}_2\\Norm{x}_2 \\leq \\frac{1}{\\sqrt{d}}\\Norm{w}_1 \\cdot \\sqrt{d} \\Norm{x}_\\infty \\leq \\Norm{w}_1\\Norm{x}_\\infty.\n    \\end{equation}\n    In this case, it makes sense to regularize the $\\ell_2$-norm instead.\n    \\end{itemize}\n    \n    In practice, other multiplicative factors enter the generalization bound, so regularizing both the $\\ell_1$- and $\\ell_2$-norms of the model parameters $w$ is preferable.\n\n    Continuing with this rough style of analysis, for the hypothesis class with restricted $\\ell_2$-norm, we can write the bound on the generalization gap in \\eqref{lec7:eqn:generalization_loss} as\n    \\begin{equation}\n        \\text{generalization loss} \\lesssim \\frac{\\Norm{w}_2\\Norm{x}_2}{\\sqrt{n}\\gamma_{\\mathrm{min}}} + \\text{low-order term}.\n    \\end{equation}\n    The presence of $\\Norm{w}_2/\\gamma_{\\mathrm{min}}$ motivates both the minimum norm and the maximum margin formulations of the Support Vector Machine (SVM) problem as good methods to improve generalization performance of binary classifiers.\n\n%*****************************************************************************\n\\sec{Two-layer neural networks}\nWe now compute a bound for the Rademacher complexity of two-layer neural networks.  Throughout this section, we use the following notation:\n\\begin{itemize}\n    \\item $\\theta = (w, U)$ are the parameters of the model with $w \\in \\R^m$ and $U \\in \\R^{m \\times d}$, where $m$ denotes the number of hidden units. 
We use $u_i\\in\\R^d$ to denote the $i$-th row of $U$ (written as a column vector).\n    \\item $\\phi(z) = \\max(z, 0)$ is the ReLU activation function applied element-wise.\n    \\item $f_\\theta(x) = \\inprod{w,\\phi(Ux)} = w^\\top \\phi(Ux)$ is the model.\n    \\item $\\{ (x\\sp{i}, y\\sp{i}) \\}_{i=1}^n$ is the training set, with $x\\sp{i}\\in\\R^d$ and $y\\sp{i}\\in\\R$.\n\\end{itemize}\nWe start with a somewhat weak bound which introduces the technical tools we need to derive tighter bounds subsequently.\n\n\\begin{theorem}\\label{lec7:thm:thm_3}\n    For some constants $B_w > 0$ and $B_u > 0$, let\n    \\begin{equation}\n        \\mathcal{H} = \\cbr{ f_\\theta \\mid \\Norm{w}_2 \\leq B_w,\\ \\Norm{u_i}_2 \\leq B_u,\\ \\forall i \\in \\{1,2,\\dots,m\\} }, \\label{lec7:eqn:thm_3}\n    \\end{equation}\n    and suppose $\\Exp\\sbr{\\Norm{x}_2^2} \\leq C^2$. Then\n    \\begin{align}\n        R_n(\\mathcal{H}) \\le 2 B_w B_u C\\sqrt{\\frac{m}{n}}.\n    \\end{align}\n\\end{theorem}\n\nThis bound is not ideal as it depends on the number of neurons~$m$. Empirically, it has been found that the generalization error does \\emph{not} increase monotonically with~$m$. As more neurons are added to the model, thereby giving it more expressive power, studies have shown that generalization is improved \\cite{belkin2019}. This contradicts the bound above, which states that more neurons leads to worse generalization. We also note that the theorem can be generalized straightforwardly to the setting where the $w$ and $U$ are jointly constrained in the sense that we set $\\mathcal{H} = \\cbr{ f_\\theta \\mid \\Norm{w}_2\\cdot \\left(\\max_i\\Norm{u_i}_2\\right) \\leq B}$ and obtain the generalization bound $        R_n(\\mathcal{H}) \\le 2 B C\\sqrt{\\frac{m}{n}}.$ However, the $\\sqrt{m}$ dependency still exists under this formulation of $\\cH$. 
\nNevertheless, we now derive this bound.\n\n\\begin{proof}\n    By definition,\n    \\begin{align}\n        R_S(\\mathcal{H}) \n        &= \\Exp_\\sigma\\sbr{ \\sup_\\theta \\frac{1}{n} \\sum_{i=1}^n \\sigma_i \\inprod{w,\\phi(Ux\\sp{i})} }\n        \\\\&= \\frac{1}{n} \\Exp_\\sigma\\sbr{ \\sup_{U : \\Norm{u_j}_2 \\leq B_u} \\sup_{\\Norm{w}_2 \\leq B_w} \\inprod{w,\\sum_{i=1}^n \\sigma_i \\phi(Ux\\sp{i})} }\n        \\\\&= \\frac{B_w}{n}\\Exp_\\sigma\\sbr{ \\sup_{U : \\Norm{u_j}_2 \\leq B_u} \\Norm{ \\sum_{i=1}^n \\sigma_i \\phi(Ux\\sp{i})}_2 }\n            &&\\text{($\\textstyle\\sup_{\\Norm{w}_2\\leq B}\\inprod{w,v} = B\\Norm{v}_2$)}\n        \\\\&\\leq \\frac{B_w\\sqrt{m}}{n}\\Exp_\\sigma\\sbr{ \\sup_{U : \\Norm{u_j}_2 \\leq B_u} \\Norm{ \\sum_{i=1}^n \\sigma_i \\phi(Ux\\sp{i})}_\\infty }\n            &&\\text{($\\Norm{v}_2 \\leq \\sqrt{m}\\Norm{v}_\\infty$)}\n        \\\\&= \\frac{B_w\\sqrt{m}}{n}\\Exp_\\sigma\\sbr{ \\sup_{U : \\Norm{u_j}_2 \\leq B_u} \\max_{1\\leq j\\leq m} \\abs{ \\sum_{i=1}^n \\sigma_i \\phi(u_j^\\top x\\sp{i})} } \n        \\\\&= \\frac{B_w\\sqrt{m}}{n}\\Exp_\\sigma\\sbr{ \\sup_{\\Norm{u}_2 \\leq B_u} \\abs{ \\sum_{i=1}^n \\sigma_i \\phi(u^\\top x\\sp{i})} }\n        \\\\&\\leq \\frac{2B_w\\sqrt{m}}{n}\\Exp_\\sigma\\sbr{ \\sup_{\\Norm{u}_2 \\leq B_u} \\sum_{i=1}^n \\sigma_i \\phi(u^\\top x\\sp{i}) }\n            &&\\text{(by Lemma \\ref{lec8:lemma:absfortwo})} \\label{lec7:eqn:nn-proof1}\n        \\\\&\\leq \\frac{2B_w\\sqrt{m}}{n}\\Exp_\\sigma\\sbr{ \\sup_{\\Norm{u}_2 \\leq B_u} \\sum_{i=1}^n \\sigma_i u^\\top x\\sp{i} }, \\label{lec7:eqn:nn-proof2}\n    \\end{align}\n    where the last inequality follows by applying the contraction lemma (Talagrand's lemma) and observing that the ReLU function is $1$-Lipschitz. 
(Observe that the expectation in \\eqref{lec7:eqn:nn-proof1} is the Rademacher complexity for $\\{ x \\mapsto \\phi(u^\\top x) \\mid \\Norm{u}_2 \\leq B_u \\}$: this is the family that we are applying the contraction lemma to.)\n    \n    We now observe that the expectation in \\eqref{lec7:eqn:nn-proof2} is the Rademacher complexity of the family of linear models $\\{x \\mapsto \\inprod{u,x} \\mid \\Norm{u}_2\\leq B_u\\}$. Thus, applying Theorem~\\ref{lec7:thm:l1-thm} yields\n    \\begin{equation}\n        R_S(\\mathcal{H}) \\leq \\frac{2B_w\\sqrt{m}}{n}B_u\\sqrt{\\sum_{i=1}^n \\Norm{x\\sp{i}}_2^2}.\n    \\end{equation}\n    \n    Taking the expectation of both sides and using similar steps to those in the proof of Theorem~\\ref{lec7:thm:l1-thm} gives us\n    \\begin{align}\n        R_n(\\mathcal{H})  &= \\Exp\\left[ R_S(\\mathcal{H})\\right] \\\\\n        &\\leq \\frac{2B_wB_u\\sqrt{m}}{n} \\Exp\\sbr{\\sqrt{\\sum_{i=1}^n \\Norm{x\\sp{i}}_2^2}} \\\\\n        &\\leq \\frac{2B_wB_u\\sqrt{m}}{n} C\\sqrt{n} \\\\\n        &= 2 B_w B_u C\\sqrt{\\frac{m}{n}},\n    \\end{align}\n    which completes the proof.\n    \n\\end{proof}\n\nThis upper bound is undesirable since it grows with the number of neurons $m$, contradicting empirical observations of the generalization error decreasing with $m$.\n\n%*****************************************************************************\n\n\\subsec{Refined bounds}\n\\newcommand{\\boundsforcomp}{B}\nNext, we look at a finer bound that results from defining a new complexity measure. A recurring theme in subsequent proofs will be the functional invariance of two-layer neural networks under a class of rescaling transformations. 
The key ingredient will be the \\textit{positive homogeneity} of the ReLU function, i.e.\n\\begin{equation}\n\\alpha \\phi(x) = \\phi(\\alpha x) \\qquad \\forall \\alpha > 0.\n\\end{equation}\nThis implies that for any $\\lambda_i > 0$ ($i = 1, \\dots, m$), the transformation $\\theta = \\{(w_i, u_i)\\}_{1 \\leq i \\leq m} \\mapsto \\theta' = \\{(\\lambda_i w_i,  u_i / \\lambda_i )\\}_{1 \\leq i \\leq m}$ has no net effect on the neural network's functionality (i.e. $f_{\\theta} = f_{\\theta'}$) since \n\\begin{equation}\nw_i\\cdot \\phi \\left(u_i^\\top x\\sp i \\right) = (\\lambda_i w_i) \\cdot \\phi\\l(\\l( \\frac{u_i}{\\lambda_i}\\r)^\\top x\\sp i\\r).   \n\\end{equation}\nIn light of this, we devise a new complexity measure $C(\\theta)$ that is also invariant under such transformations and use it to prove a better bound for the Rademacher complexity. This positive homogeneity property is absent in the complexity measure used in the hypothesis class \\eqref{lec7:eqn:thm_3} of Theorem \\ref{lec7:thm:thm_3}.\n\n\\begin{theorem}\\label{lec8:thm:thm-improved-nn-rc}\n$\\operatorname{Let} C(\\theta)=\\sum_{j=1}^{m}\\left|w_{j}\\right|\\left\\|u_{j}\\right\\|_{2},$ and for some constant $\\boundsforcomp>0$ consider the hypothesis class\n\\begin{equation}\n\\mathcal{H}=\\left\\{f_{\\theta} \\mid C(\\theta) \\leq \\boundsforcomp\\right\\}. \\label{eqn:H}\n\\end{equation}\nIf $\\left\\|x\\sp{i}\\right\\|_{2} \\leq C$ for all $i \\in\\{1, \\ldots, n\\},$ then\n\\begin{equation}\nR_{S}(\\mathcal{H}) \\leq \\frac{2 \\boundsforcomp C}{\\sqrt{n}}.\n\\end{equation}\n\\end{theorem}\n\n\\begin{remark}\n\tCompared to Theorem~\\ref{lec7:thm:thm_3}, this bound does not explicitly depend on the number of neurons $m$. Thus, it is possible to use more neurons and still maintain a tight bound if the value of the new complexity measure $C(\\theta)$ is reasonable. In contrast, the bound of Theorem \\ref{lec7:thm:thm_3} explicitly grows with the total number of neurons. 
In fact, Theorem~\\ref{lec8:thm:thm-improved-nn-rc} is strictly stronger than Theorem~\\ref{lec7:thm:thm_3} as elaborated below. Note that \n\t\\begin{align}\n\t\t\\sum |w_j|\\|u_j\\|_2 &\\le \\left(\\sum |w_j|^2\\right)^{1/2} \\left(\\sum\\|u_j\\|_2^2\\right)^{1/2} \\tag{by Cauchy-Schwarz inequality} \\\\\n\t\t& \\le \\|w\\|_2 \\cdot \\sqrt{m} \\cdot \\max_{j}\\|u_j\\|_2\n\t\\end{align}\n\tTherefore, if we consider $\\cH^1 = \\{f_\\theta \\mid \\sum |w_j|\\|u_j\\|_2\\le B'\\}$ and $\\cH^2 = \\{f_\\theta \\mid \\|w\\|_2 \\cdot \\sqrt{m} \\cdot \\max_{j}\\|u_j\\|_2 \\le B'\\}$, then either Theorem~\\ref{lec8:thm:thm-improved-nn-rc} on $\\cH^1$ or Theorem~\\ref{lec7:thm:thm_3} on $\\cH^2$ gives the same generalization bound $O(B'/\\sqrt{n})$, but $\\cH^1 \\supset \\cH^2$. \n\t\n\tMoreover, Theorem~\\ref{lec8:thm:thm-improved-nn-rc} is stronger as we have more neurons---this is because the hypothesis class $\\cH$ as defined in~\\eqref{eqn:H} is bigger as $m$ increases. Because of this, it's possible to obtain a generalization guarantee that decreases as $m$ increases, as shown in Section~\\ref{sec:gen-bounds:decreasing-in-m}. \n\t\n%\tFor example, consider solving the constrained problem\n%\t\\begin{equation}\n%\t\\rho_m = \\min_\\theta C(\\theta) \n%\t\\quad \\text{such that}\\quad \n%\t\\text{$f_\\theta$ fits the data  $\\{(x\\sp{i}, y\\sp{i})\\}_{i=1}^n$.}\n%\t\\end{equation}\n%\tIn this case, $\\rho_m$ monotonically decreases as the number of neurons $m$ increases. Indeed, models with more parameters necessarily include models with a lower number of parameters and thus those of lower complexity.  
As a result, it is possible to obtain lower complexity models by increasing the number of parameters $m$.\n\\end{remark}\n\n\\begin{proof}[Proof of Theorem~\\ref{lec8:thm:thm-improved-nn-rc}]\nDue to the positive homogeneity of the ReLU function $\\phi$, it will be useful to define the $\\ell_2$-normalized weight vector $\\bar{u}_j \\defeq u_j / \\norm{u_j}_2$ so that $\\phi\\left(u_j^\\top x\\right) = \\norm{u_j}_2 \\cdot \\phi(\\bar{u}_j^\\top x)$. The empirical Rademacher complexity satisfies\n\\allowdisplaybreaks\n\\al{\nR_S(\\cH) &= \\frac{1}{n}\\Exp_{\\sigma}\\left[ \\sup_{\\theta} \\sum_{i=1}^n \\sigma_i f_{\\theta}\\left(x\\sp{i}\\right) \\right] \\\\\n&= \\frac{1}{n}\\Exp_{\\sigma}\\left[ \\sup_{\\theta} \\sum_{i=1}^n \\sigma_i \\left[\\sum_{j=1}^m w_j \\phi\\left(u_j ^ T x\\sp{i}\\right) \\right] \\right] &&\\text{(by dfn of $f_\\theta$)} \\\\\n&=  \\frac{1}{n}\\Exp_{\\sigma}\\left[ \\sup_{\\theta} \\sum_{i=1}^n \\sigma_i \\left[\\sum_{j=1}^m w_j \\norm{u_j}_2  \\phi\\left(\\bar{u}_j ^ T x\\sp{i}\\right) \\right] \\right]  \n    && \\text{(by positive homogeneity of $\\phi$)}\\\\\n&= \\frac{1}{n}\\Exp_{\\sigma}\\left[ \\sup_{\\theta}  \\sum_{j=1}^m w_j \\norm{u_j}_2 \\left[ \\sum_{i=1}^n \\sigma_i  \\phi\\left(\\bar{u}_j ^ T x\\sp{i}\\right) \\right] \\right] \\\\ \n&\\leq \\frac{1}{n}\\Exp_{\\sigma}\\left[ \\sup_{\\theta}  \\sum_{j=1}^m |w_j| \\norm{u_j}_2 \\max_{k \\in [n]}\\left| \\sum_{i=1}^n \\sigma_i  \\phi\\left(\\bar{u}_k ^ T x\\sp{i}\\right) \\right| \\right] && \\l(\\because \\sum_j \\alpha_j \\beta_j \\leq \\sum_j |\\alpha_j| \\max_{k} |\\beta_k|\\r) \\\\ \n&\\leq \\frac{\\boundsforcomp}{n} \\Exp_{\\sigma}\\sbr{ \\sup_{\\theta = (w, U)} \\max_{k \\in [n]} \\left| \\sum_{i=1}^n \\sigma_i  \\phi\\left(\\bar{u}_k ^ T x\\sp{i}\\right) \\right| } && \\text{($\\because C(\\theta) \\leq \\boundsforcomp$)} \\\\\n&=  \\frac{\\boundsforcomp}{n} \\Exp_{\\sigma}\\sbr{ \\sup_{\\bar{u}: \\norm{\\bar{u}}_2 = 1} \\left| \\sum_{i=1}^n \\sigma_i  
\\phi\\left(\\bar{u} ^ T x\\sp{i}\\right) \\right| } \\\\\n&\\le \\frac{\\boundsforcomp}{n} \\Exp_{\\sigma}\\sbr{ \\sup_{\\bar{u}: \\norm{\\bar{u}}_2 \\le 1} \\left| \\sum_{i=1}^n \\sigma_i  \\phi\\left(\\bar{u} ^ T x\\sp{i}\\right) \\right| } \\\\\n&\\le \\frac{2\\boundsforcomp}{n}  \\Exp_{\\sigma}\\sbr{ \\sup_{\\bar{u}: \\norm{\\bar{u}}_2 \\le 1} \\sum_{i=1}^n \\sigma_i  \\phi\\left(\\bar{u} ^ T x\\sp{i}\\right) } && \\text{(see Lemma \\ref{lec8:lemma:absfortwo})} \\\\\n&= 2\\boundsforcomp R_S(\\cH '),\n}\nwhere $\\cH' = \\l\\{x \\mapsto \\phi(\\bar{u}^\\top x) :  \\bar{u} \\in \\mathbb{R}^d, \\norm{\\bar{u}}_2 \\leq 1 \\r\\}$. By Talagrand's lemma, since $\\phi$ is $1$-Lipschitz, $R_S(\\cH') \\leq R_S(\\cH'')$ where  $\\cH'' = \\l\\{x \\mapsto \\bar{u}^\\top x :  \\bar{u} \\in \\mathbb{R}^d, \\norm{\\bar{u}}_2 \\leq 1 \\r\\}$ is a linear hypothesis space. Using $R_S(\\cH'') \\leq \\frac{C}{\\sqrt{n}}$ by Theorem \\ref{lec7:thm:l2-thm} then concludes the proof.\n\n\\end{proof}\n\nWe complete the proof by deriving the Lemma \\ref{lec8:lemma:absfortwo} used in the second-to-last inequality. Notably, the lemma's assumption holds in the current context, since\n\\al{\n\\sup_{\\theta} \\langle \\sigma, f_{\\theta}(x) \\rangle = \\sup_{\\bar{u}: \\norm{\\bar{u}}_2 \\leq 1} \n\\sum_{i=1}^n \\sigma_i \\phi \\l(\\bar{u}^\\top x\\sp i \\r)  \\geq 0.\n}\nsince one can take $\\bar{u} = 0$ for any $\\sigma = (\\sigma_1, \\dots, \\sigma_n)$.\n\n\\begin{lemma}\\label{lec8:lemma:absfortwo}\nLet $\\sigma = (\\sigma_1, ..., \\sigma_n)$ and $f_{\\theta}(x) = \\l(f_{\\theta}\\l(x\\sp{1}\\r), ...,  f_{\\theta}\\l(x\\sp{n} \\r)\\r)$. Suppose that for any $\\sigma \\in \\{\\pm 1\\}^n$, $\\sup_{\\theta} \\langle \\sigma, f_{\\theta}(x) \\rangle \\geq 0$. 
Then, \n\\begin{equation}\n\\mathbb{E}_{\\sigma}\\l[ \\sup_{\\theta}  \\l | \\langle \\sigma, f_{\\theta}(x) \\rangle \\r|  \\r] \\leq 2 \\mathbb{E}_{\\sigma}\\l[ \\sup_{\\theta}  \\langle \\sigma, f_{\\theta}(x) \\rangle   \\r].\n\\end{equation}\n\\end{lemma}\n\n\\begin{proof}\nLetting $\\phi$ be the ReLU function, the lemma's assumption implies that $\\sup_{\\theta} \\phi\\left(\\langle \\sigma, f_{\\theta}(x) \\rangle\\right) = \\sup_{\\theta}\\langle \\sigma, f_{\\theta}(x) \\rangle$ for any $\\sigma \\in \\{\\pm 1\\}^n$. Observing that $|z| = \\phi(z) + \\phi(-z)$, \n\\begin{align}\n\\sup_{\\theta} \\abs{\\inprod{ \\sigma, f_{\\theta}(x) }}%\n&= \\sup_{\\theta} \\left[ \\phi \\l(\\inprod{ \\sigma, f_{\\theta}(x) } \\r) + \\phi \\l(\\inprod{-\\sigma, f_{\\theta}(x) } \\r)\\right] \\\\\n&\\le \\sup_{\\theta}  \\phi \\l(\\inprod{ \\sigma, f_{\\theta}(x) } \\r) +  \\sup_{\\theta}  \\phi \\l(\\inprod{-\\sigma, f_{\\theta}(x) } \\r)  \\\\\n&= \\sup_{\\theta} \\inprod{ \\sigma, f_{\\theta}(x) } +  \\sup_{\\theta}  \\inprod{-\\sigma, f_{\\theta}(x) }. \n\\end{align}\nTaking the expectation over $\\sigma$ (and noting that $\\sigma \\overset d = -\\sigma$), we get the desired conclusion.\n\\end{proof}\n\n\n\n\\sec{More implications and discussions on two-layer neural nets}\nIn this section, we discuss practical implications of the refined neural network bound. \n\n\\subsec{Connection to \\texorpdfstring{$\\ell_2$}{L2} regularization}\\label{sec:gen-bounds:impliciation}\n\nRecall that margin theory yields\n\\begin{equation}\n\\text{for all } \\theta, \\quad \\Err(\\theta) \\leq \\frac{2R_S(\\cH)}{\\gammamin} + \\tilO\\l(\\sqrt{\\frac{\\log \\l( 2 / \\delta \\r)}{n}}\\r), \\label{lec8:eqn:margin-bound}\n\\end{equation}\nwith probability at least $1 -\\delta$. Thus, Theorem \\ref{lec8:thm:thm-improved-nn-rc} motivates us to minimize $\\frac{R_S(\\cH)}{\\gammamin}$ by regularizing $C(\\theta)$. 
Concretely, this can be formulated as the optimization problem \n\\al{\n\\text{minimize} & \\qquad C(\\theta) = \\sum_{j=1}^m |w_j|\\cdot \\norm{u_j}_2 \\nonumber \\tag{I} \\label{lec8:eqn:opt1} \\\\ \n\\text{subject to} & \\qquad \\gammamin(\\theta)\\ge 1, \\nonumber\n}\nor equivalently,\n\\al{\n\\text{maximize} & \\qquad \\gammamin(\\theta) \\nonumber \\tag{II} \\label{lec8:eqn:opt2} \\\\ \n\\text{subject to} & \\qquad C(\\theta)\\le 1. \\nonumber\n}\n\nAt first glance, the above seems orthogonal to techniques used in practice. However, it turns out that the optimal neural network from \\eqref{lec8:eqn:opt1} is functionally equivalent to that of the new problem:\n\\al{\n\\text{minimize} & \\qquad C_{\\ell_2}(\\theta) = \\frac{1}{2}\\sum_{j=1}^m |w_j|^2 + \\frac{1}{2}\\sum_{j=1}^m \\norm{u_j}_2^2 \\nonumber \\tag{I*} \\label{lec8:eqn:opt1star} \\\\ \n\\text{subject to} & \\qquad \\gammamin(\\theta)\\ge 1. \\nonumber\n}\nThis is a simple consequence of the positive homogeneity of $\\phi$. For any scaling factor $\\lambda=(\\lambda_1, \\dots, \\lambda_m)\\in \\R_+^m$, the rescaled neural network $\\theta_\\lambda \\defeq \\{(\\lambda_i w_i, u_i/\\lambda_i)\\}$ has the same functionality as the original neural network $\\theta = \\{w_i, u_i \\}$ (i.e. it achieves the same $\\gammamin$). Thus, \n\\al{\n\\min_{\\theta} C_{\\ell_2}(\\theta) &= \\min_{\\theta} \\min_{\\lambda} \\rbr{ \\frac{1}{2}\\sum_{j=1}^m \\lambda_j^2 |w_j|^2 + \\frac{1}{2}\\sum_{j=1}^m \\lambda_j^{-2}\\norm{u_j}_2^2 }\\\\\n&= \\min_{\\theta}  \\sum_{j=1}^m |w_j|\\cdot \\norm{u_j}_2 \\\\\n&= \\min_{\\theta}  C(\\theta)\n}\nwhere we have used the equality case of the AM-GM inequality, attainable by $\\lambda_j^* = \\sqrt{\\frac{\\norm{u_j}_2}{|w_j|}}$, in the second step. 
This equality case also shows that $\\theta^* = \\{(w_i, u_i ) \\}$ is the optimal solution of \\eqref{lec8:eqn:opt1} if and only if $\\hat{\\theta}^* = \\theta_{\\lambda^*}$ is the optimal solution of \\eqref{lec8:eqn:opt1star}---proving that $\\hat{\\theta}^*$ and $\\theta^*$ are functionally equivalent since they only differ by a positive scale factor. \n\nThis connects our $C(\\theta)$ regularization to $\\ell_2$-norm penalties that are more prevalent in practice. In retrospect, we see this equivalence is essentially due to the positive homogeneity of the neural network which ``homogenizes'' any inhomogeneous objective such as $C_{\\ell_2}$. Hence, we can just deal with $C(\\theta)$ which is transparently homogeneous.\n\n\\subsec{Generalization bounds that are decreasing in \\texorpdfstring{$m$}{m}} \\label{sec:gen-bounds:decreasing-in-m}\n\nNext, we show that the generalization bound given by Theorem \\ref{lec8:thm:thm-improved-nn-rc} does not deteriorate with the network width (number of neurons) $m$, which is consistent with experimental results. To this end, the perspective of \\eqref{lec8:eqn:opt2} enables us to isolate all dependencies of $m$ in $\\gammamin$. Letting $\\widehat \\theta_m$ denote the minimizer of program \\eqref{lec8:eqn:opt2} with width $m$ and defining optimal value $\\gamma_m^* = \\gammamin\\l(\\widehat \\theta_m\\r)$, we can rewrite the margin bound \\eqref{lec8:eqn:margin-bound} as \n\\begin{equation}\nL(\\widehat \\theta_m) \\le \\frac{4C}{\\sqrt{n}} \\cdot \\frac{1}{\\gamma_m^*} + \\text{(lower-order terms)},\n\\end{equation}\nwhere all dependencies on $m$ are now contained in $\\gamma_m^*$. Hence, to show that this bound does not worsen as $m$ grows, we just have to show that $\\gamma_m^*$ is non-decreasing in $m$. This is intuitively the case since a neural network of width $m+1$ contains one of width $m$ under the same complexity constraints. 
The following theorem formalizes this hunch:\n\n\\begin{theorem}\nLet $\\gamma_m^*$ be the minimum margin obtained by solving \\eqref{lec8:eqn:opt2} with a two-layer neural network of width $m$. Then $\\gamma_m^* \\leq \\gamma_{m+j}^*$ for all positive integers $j$.\n\\end{theorem}\n\n\\begin{proof}\nSuppose $\\theta = \\{(w_i, u_i)\\}_{1 \\leq i \\leq m}$ is a two-layer neural network of width $m$ satisfying $C(\\theta)\\le 1$. Then we may construct a neural network $\\widetilde \\theta = \\{(\\tilde w_i, \\tilde u_i)\\}_{1 \\leq i \\leq m+1}$ of width $m+1$ by simply taking\n\\al{\n(\\widetilde w_i, \\widetilde u_i) = \\begin{cases}\n(w_i, u_i) & i\\le m, \\\\\n(0,0) & \\text{otherwise.}\n\\end{cases}\n}\n$\\widetilde \\theta$ is functionally equivalent to $\\theta$ and $C(\\widetilde \\theta) = C(\\theta) \\le 1$. This means maximizing $\\gammamin$ over $\\{C(\\widetilde \\theta): \\widetilde \\theta\\text{ of width }m+1\\}$ should give no lower of a value than the maximum of $\\gammamin$ over $\\{C(\\theta): \\theta\\text{ of width }m\\}$.\n\\end{proof}\n\n\\subsec{Equivalence to an \\texorpdfstring{$\\ell_1$}{L1}-SVM in \\texorpdfstring{$m \\to \\infty$}{m -> inf} limit}\n\nSince $\\gamma_m^*$ is non-decreasing in $m$, the quantity \n\\begin{equation}\n\\gamma_\\infty ^* = \\lim_{m\\to \\infty } \\gamma_m^*\n\\end{equation}\nis well-defined. The next interesting fact is that in this $m \\to \\infty$ limit, $\\gamma_{\\infty}^*$ of the two-layer neural network is equivalent to the minimum margin of an $\\ell_1$-SVM. As a brief digression, we recap the formulation of $\\ell_p$-SVMs and discuss the importance of $\\ell_1$-SVMs in particular.\n\nSince a collection of data points with binary class labels may not be a priori separable, a \\textit{kernel model} first transforms an input $x$ to $\\varphi(x)$ where $\\varphi: \\mathbb{R}^d \\to \\mathcal{G}$ is known as the \\textit{feature map}. 
The model then seeks a separating hyperplane in this new (extremely high-dimensional) feature space $\\mathcal{G}$, parameterized by a vector $\\mu$ pointing from the origin to the hyperplane. The prediction of the model on an input $x$ is then a decision score that quantifies $\\varphi(x)$'s displacement with respect to the hyperplane:\n\\begin{equation}\ng_{\\mu, \\varphi}(x) \\defeq \\l\\langle \\mu, \\varphi(x) \\r\\rangle.\n\\end{equation}\nMotivated by margin theory, it is desirable to seek the maximum-margin hyperplane under a constraint on $\\mu$ to guarantee the generalizability of the model. In particular, a kernel model with an $\\ell_p$-constraint seeks to solve the following program:\n\\al{\n\\text{maximize} & \\qquad \\gamma_{min} \\coloneqq \\min_{i \\in [n]} y\\sp{i}\\langle \\mu, \\varphi(x\\sp{i}) \\rangle \\\\ \n\\text{subject to} & \\qquad \\norm{\\mu}_p \\le 1. \\nonumber\n}\nObserve that both the prediction and optimization of the feature model only rely on inner products in $\\mathcal{G}$. The ingenuity of the SVM is to choose maps $\\varphi$ such that $K(x, x') = \\l\\langle \\varphi(x), \\varphi(x') \\r\\rangle$ can be directly computed in terms of $x$ and $x'$ in the original space $\\mathbb{R}^d$, thereby circumventing the need to perform expensive inner products in the large space $\\mathcal{G}$. Remarkably, this ``kernel trick'' enables us to even operate in an implicit, infinite-dimensional $\\mathcal{G}$. \n\nThe case of $p=1$ is particularly useful in practice as $\\ell_1$-regularization generally produces sparse feature weights (the constrained parameter space is a polyhedron and the optimum tends to lie at one of its vertices). Hence, $\\ell_1$-regularization is an important feature selection method when one expects only a few dimensions of $\\cG$ to be significant. Unfortunately, the $\\ell_1$-SVM is not kernelizable due to the kernel trick relying on $\\ell_2$-geometry, and is hence infeasible to implement. 
However, our next theorem shows that a two-layer neural network can approximate a particular $\\ell_1$-SVM in the $m \\to \\infty$ limit (and in fact, for finite $m$). For the sake of simplicity, we sacrifice rigor in defining the space $\\mathcal{G}$ and convey the main ideas.\n\n\\begin{theorem}\\label{lec8:thm:thm8.5}\nDefine the feature map $\\phirelu: \\mathbb{R}^d \\to \\mathcal{G}$ such that $x$ is mapped to $\\phi(u^\\top x)$ for all vectors $u$ on the $d-1$-dimensional sphere $\\mathcal{S}^{d-1}$. Informally, \n$$\\phirelu(x) \\defeq \\begin{bmatrix} \\vdots \\\\ \\phi(u^\\top x) \\\\ \\vdots \\end{bmatrix}_{u\\in S^{d-1}}$$\nis an ``infinite-dimensional vector'' that contains an entry $\\phi(u^\\top x)$ for each vector $u \\in \\mathcal{S}^{d-1}$, and we let $\\phirelu(x)[u]$ denote the ``$u$''-th entry of this vector. Noting that $\\mathcal{G}$ is the space of functions which can be indexed by $u \\in S^{d-1}$, the inner product structure on $\\mathcal{G}$ is defined by $\\langle f, g \\rangle = \\int_{S^{d-1}} f[u]g[u] du$.\n\nUnder this set-up, we have\n\\begin{equation}\n\\gamma_{\\infty}^* = \\gamma_{\\ell_1}^*,\n\\end{equation}\nwhere $\\gamma_{\\ell_1}^*$ is the minimum margin of the optimized $\\ell_1$-SVM with $\\varphi = \\phirelu$.\n\\end{theorem}\n\n\\begin{proof}\n\nWe will only prove the $\\gamma_{\\infty}^* \\leq \\gamma_{\\ell_1}^*$ direction. (The $\\gamma_{\\infty}^* \\geq \\gamma_{\\ell_1}^*$ direction requires substantial functional analysis.)\n\nSuppose $\\gamma_\\infty^*$ is obtained by network weights $(w_1,w_2, \\cdots), (u_1, u_2, \\cdots)$ where $w_i\\in \\R, u_i\\in \\R^d$ (there is a slight subtlety here to be rectified later). Define renormalized versions of $\\{w_i\\}$ and $\\{u_i\\}$:\n\\begin{equation}\n\\widetilde w_i \\defeq w_i\\cdot \\norm{u_i}_2, \\qquad \\overline u_i \\defeq \\frac {u_i} {\\norm{u_i}_2}.   
\n\\end{equation}\nNote that $\\{(\\widetilde w_i, \\overline u_i)\\}$ has the same functionality (and also the same complexity measure $C(\\theta)$, margin, etc.) as that of $\\{(w_i,u_i)\\}$, but now $\\overline u_i$ has unit $\\ell_2$-norm (i.e. $\\bar{u}_i \\in \\mathcal{S}^{d-1}$). Thus, $\\phi(\\overline u_i ^\\top x)$ can be treated as a feature in $\\cG$ and we can construct an equivalent $\\ell_1$-SVM (denoted by $\\mu$) such that $\\widetilde w_i$ is the coefficient of $\\mu$ associated with that feature. Since $\\widetilde w_i$ must only be ``turned on'' at $\\overline u_i $, we have \n\\al{\n\\mu[u] = \\sum_{i \\in \\mathcal{S}^{d-1}} \\tilde{w}_i \\delta(u - \\overline u_i),\n}\nwhere $ \\delta(u)$ is the Dirac-delta function. Given this $\\mu$, we can check that the SVM's prediction is\n\\al{\ng_{\\mu, \\phirelu}(x) &= \\int_{S^{d-1}} \\mu[u] \\phirelu(x)[u] du \\\\\n&= \\int_{S^{d-1}}   \\sum_{i \\in \\mathcal{S}^{d-1}} \\tilde{w}_i \\delta(u - \\overline u_i) \\phi\\left(\\overline u ^\\top x\\right) du \\\\\n&= \\sum_{i \\in \\mathcal{S}^{d-1}}  \\tilde{w}_i \\phi\\left(\\overline u_i ^\\top x\\right) ,\n}\nwhich is identical to the output $f_{\\{(\\widetilde w_i, \\overline u_i)\\}}(x)$ of the neural network. Furthermore, \n\\al{\n\\norm{\\mu}_1 =  \\sum_{i=1}^{\\infty} |\\widetilde w_i| = \\sum_{i=1}^{\\infty} |w_i|\\cdot \\norm{u_i}_2 \\leq 1,\n}\nwhere the last equality holds because $\\{(\\widetilde w_i, \\overline u_i)\\}$ satisfies the constraints of \\eqref{lec8:eqn:opt2}. This shows that our constructed $\\mu$ satisfies the $\\ell_1$-SVM constraint. Thus, $\\gamma_{\\infty}^* \\leq \\gamma_{\\ell_1}^*$ since the functional behavior of the optimal neural network is contained in the search range of the SVM.\n\n\\end{proof}\n\n\\begin{remark}\nHow well does a finite-dimensional neural network approximate the infinite-dimensional $\\ell_1$ network? Proposition B.11 of \\cite{wei2020regularization} shows that you only need $n+1$ neurons. 
Another way to say this is that $\\{\\gamma_m\\}$ stabilizes once $m=n+1$:\n\\begin{equation}\n\\gamma_1^* \\le \\gamma_2^* \\le \\dots \\le \\gamma_{n+1}^* = \\gamma_\\infty^*.\n\\end{equation}\nThe main idea of the proof is that if we have a neural net $\\theta$ with $(n+2)$ neurons, then we can always pick a simplification $\\theta'$ with $(n+1)$ neurons such that $\\theta,\\theta'$ agree on all $n$ datapoints.\n\nAs an aside, this result also resolves the issue in our partial proof. A priori, $\\gamma_{\\infty}^*$ may not necessarily be attained by a set of weights $\\{(\\widetilde w_i, \\overline u_i)\\}$, but the above shows that it is achievable with just $n+1$ non-zero indices.\n\n\\end{remark}\n"
  },
  {
    "path": "tex/collection/05-03-deep-nets.tex",
    "content": "\\sec{Deep neural nets (via covering number)}\\label{sec:deep_nets}\nIn Section~\\ref{lec9:sec:cover_to_radem}, we discuss how strong our bounds on covering number need to be in order to get a useful result. \nHere we describe some situations in which we know how to obtain these covering number bounds for concrete models such as linear models and neural networks. \n\n\\subsec{Preparation: covering number for linear models}\nFirst, consider the following covering number bound for linear models:\n\n\\begin{theorem}[\\cite{zhang2002}] \\label{lec9:thm:univariate_rad}\nSuppose $x^{(1)}, \\cdots, x^{(n)} \\in \\mathbb{R}^d$ are $n$ data points, and $p, q$ satisfies $1/p + 1/q = 1$ and $2 \\le p \\le \\infty$. Assume that $||x^{(i)}||_p \\le C$ for all $i$. Let:\n\\begin{align}\n    \\cF_q = \\{x \\mapsto w^\\top x : ||w||_q \\le B\\}\n\\end{align}\nand let $\\rho = L_2(P_n)$. Then, $\\log N(\\epsilon, \\cF_q, \\rho) \\le \\l [\\frac{B^2C^2}{\\epsilon^2}\\r ] \\log_2 (2d + 1)$. When $p = 2, q = 2$, we further obtain that:\n\\begin{align}\n    \\log N(\\epsilon, \\cF_2, \\rho) \\le \\l [\\frac{B^2C^2}{\\epsilon^2} \\r ] \\log_2 (2 \\min (n, d ) + 1)\n\\end{align}\n\\end{theorem}\n\\begin{remark}\nApplying \\eqref{lec9:eqn:rademacherbound_three} to the covering number bound derived above with $R = B^2C^2$, we conclude that the Rademacher complexity of this class of linear models satisfies\n\\begin{align}\n    R_S(\\cF_q) &\\le \\tilO{\\left( \\frac{BC}{\\sqrt{n}} \\right)}.\n\\end{align} \nWe also prove this result without relying on Dudley's theorem in Theorem~\\ref{lec7:thm:l2-thm}.\n\\end{remark}\nNext, we consider multivariate linear functions as they are building blocks for multi-layer neural networks. Let $M = (M_1, \\cdots, M_n) \\in \\mathbb{R}^{m \\times n}$ and $\\norm{M}_{2,1} = \\sum_{i = 1}^n \\norm{M_i}_2$. Then, $\\norm{M^\\top}_{2,1}$ denotes the sum of the $\\ell_2$ norms of the rows of $M$. 
\n\\begin{theorem}\\label{lec9:thm:multivariate_rad}\nLet $\\cF = \\{x \\to Wx : W \\in \\mathbb{R}^{m \\times d}, ||W^\\top||_{2, 1} \\le B\\}$ and let $C = \\sqrt{\\frac{1}{n} \\sum_{i = 1}^n ||x^{(i)}||_2^2}$. Then, \n\\begin{equation}\n\\log N(\\epsilon, \\cF, L_2(P_n)) \\le \\l [\\frac{c^2B^2}{\\epsilon^2} \\r ] \\ln (2dm).\n\\end{equation}\n\\end{theorem}\n\\begin{remark}\n    In some sense, Theorem~\\ref{lec9:thm:multivariate_rad} arises from treating each dimension of the multivariate problem independently. We can view the linear layer as applying $m$ different linear functions. Explicitly, if $W = \\begin{pmatrix} w_1^\\top \\\\ \\vdots \\\\ w_m^\\top \\end{pmatrix}$ and $Wx = \\begin{pmatrix} w_1^\\top x \\\\ \\vdots \\\\ w_m^\\top x \\end{pmatrix}$, then as we expect, $\\norm{W^\\top}_{2,1} = \\sum \\norm{w_i}_2$.\n\\end{remark}\n\n\n\\subsec{Deep neural networks}\nIn this lecture, we discuss a bound on the Rademacher complexity of a dense neural network. We set up notation as follows: $W_i$ denotes the linear weight matrix at the $i$-th layer of the neural network, we have a total of $r$ layers, and $\\sigma$ is the activation function which is 1-Lipschitz (for example, ReLU, softmax, or sigmoid). 
If the input is a vector $x$, the neural network's output can be represented as follows:\n\n\\begin{align}\n    f_\\theta(x) = W_r\\sigma(W_{r-1}\\sigma(\\cdots \\sigma(W_1x)\\ldots)),\n\\end{align}\nUsing this notation, we establish an upper bound on the Rademacher complexity of a dense neural network.\n\n\\begin{theorem}[\\cite{bartlett2017}]\n\\label{lec10:thm:dnn_rademacher}\nSuppose that $\\forall i, \\norm{x^{(i)}}_2 \\leq c$ and let\n\\begin{align}\n    \\cF = \\{f_\\theta : \\norm{W_i}_{\\textup{op}} \\leq \\kappa_i, \\norm{W_i^\\top}_{2,1} \\leq b_i\\}.\n\\end{align}\nThen,\n\\begin{equation}\n    R_S (\\cF) \\leq \\frac{c}{\\sqrt{n}} \\cdot \\underbrace{\\left(\\prod_{i=1}^r \\kappa_i \\right)}_{\\textup{(I)}} \\cdot \\underbrace{\\left( \\sum_{i=1}^r\\frac{b_i^{2/3}}{\\kappa_i^{2/3}}\\right)^{3/2}}_{\\textup{(II)}}. \\label{lec10:eqn:bartlett_rad_bound}\n\\end{equation}\n\\end{theorem}\nWe use $\\norm{W}_{\\textup{op}}$ to denote the operator norm (or spectral norm) of $W$, and recall that $\\norm{W_i^\\top}_{2,1}$ denotes the sum of the $\\ell_2$ norms of the rows of $W_i$. Examining \\eqref{lec10:eqn:bartlett_rad_bound}, we see that (II) is relatively small as it is a sum of matrix norms, and so the bound is dominated by (I), which is a product of matrix norms.\n\n\\begin{remark}\n    We note that $f(x) = Wx$ is Lipschitz with a Lipschitz constant of $\\norm{W}_{\\textup{op}}$. This is because \n    \\begin{align}\n        \\norm{f(x)-f(y)}_2 &= \\norm{Wx-Wy}_2 \\\\\n        &\\leq \\norm{W}_{\\textup{op}}\\norm{x-y}_2 &\\text{$(\\norm{W}_{\\textup{op}} = \\max_{x:\\norm{x}_2=1}\\norm{Wx}_2)$}\n    \\end{align}. 
\n\\end{remark}\n\n\\begin{remark}\n    As a corollary of the above theorem, we also get a bound on the generalization error for the margin loss of the following form:\n    \\begin{equation}\n        \\textup{generalization error} \\leq \\tilde{O}\\left(\\frac{1}{\\gamma_{\\min}} \\cdot \\frac{1}{\\sqrt{n}} \\cdot \\left(\\prod_{i=1}^r\\norm{W_i}_{\\textup{op}} \\right) \\cdot {\\left( \\sum_{i=1}^r\\frac{\\norm{W_i^\\top}^{2/3}_{2,1}}{\\norm{W_i}_{\\textup{op}}^{2/3}}\\right)^{3/2}}  \\right),\n    \\end{equation}\n    where $\\gamma_{\\min}$ denotes the margin.\n\\end{remark}\n\t\nFirst, we motivate the proof by presenting the main idea, and then work through each part of the proof. The main ideas of the proof can be summarized as follows:\n    \n\\begin{itemize}\n    \\item At a high level, we want to show that the covering number $N(\\epsilon, \\cF, \\rho)$ for a dense neural network is $\\leq \\frac{R}{\\epsilon^2}$. Proving this would enable us to apply Theorem~\\ref{lec9:thm:better-dudley} to get a bound on the Rademacher Complexity.\n    \\item To bound the covering number for a dense neural network, we use $\\epsilon$-covers to cover each layer of $f_\\theta$ separately, and then combine them to prove that there exists an $\\epsilon$-cover of the original function $f_\\theta$. \n    \\item To combine the $\\epsilon$-covers of each layer, we use the Lipschitzness of each layer.\n    \\item We control and approximate the error propagation that is introduced through discretizing each layer using $\\epsilon_i$-coverings in order to get a reasonable final $\\epsilon$.\n\\end{itemize}\n\nAs a prelude to the proof of Theorem~\\ref{lec10:thm:dnn_rademacher}, let us abstractify each layer of $\\cF$ as $\\cF_i$ where $\\cF_i$ corresponds to matrix multiplication by $W_i$ composed with a nonlinear activation function $\\sigma$. 
We then denote $\\cF$ as the composition of each of these (single layer) function spaces as follows:\n\\begin{align}\n    \\cF = \\cF_r \\circ \\cF_{r - 1} \\circ \\cdots \\circ \\cF_1 = \\{f_r \\circ f_{r - 1} \\circ \\cdots f_{1} : f_i \\in \\cF_i\\}\n\\end{align}\nWe will assume throughout that $f_i$ is $\\kappa_i$-Lipschitz, i.e.\n\\begin{align}\n    \\norm{f_i(x) - f_i(y)}_2 \\leq \\kappa_i \\norm{x - y}_2 \\label{lec10:eqn:lipschitz-def}\n\\end{align} \nLet us also assume, for simplicity, that $f_i(0) = 0$ and $\\norm{x\\sp{j}}_2 \\leq c$ for all $j = 1,\\dots,n$. Then, by applying the definition of Lipschitz continuity, we obtain that:\n\\begin{align}\n    \\norm{f_i(f_{i - 1}(\\cdots(f_1(x\\sp{j}))))}_2 \\leq \\underbrace{\\kappa_{i} \\cdot \\kappa_{i - 1} \\cdots \\kappa_1 \\cdot c}_{\\defeq c_i}\n\\end{align}\n\nWe now derive an $\\epsilon$-covering of $\\cF$ in two steps:\n\\begin{enumerate}\n    \\item Given inputs to the $i^{th}$ layer, we construct an $\\epsilon_i$-covering of the output space of the function $f_i$.\n    \\item Using the $\\epsilon_i$-covering as inputs to the $(i + 1)$-th layer, we show that we can use several single layer coverings to construct an $\\epsilon$-covering for a multilayer network.\n\\end{enumerate}\n\nFormally, the following lemma answers the second step in the above outline. 
Namely, given a covering number for a single layer, we show how to compute a covering number bound for multiple layers.\n\\begin{lemma}\n    Under the setup given above, if every input to $f_i$ satisfies $\\norm{z\\sp{j}}_2 \\leq c_{i - 1}$, we assume that  \n    \\begin{align}\n        \\log N(\\epsilon_i, \\cF_i, L_2(P_n)) \\leq g(\\epsilon_i, c_{i - 1}).\\footnotemark \\label{lec10:eqn:single_cover_bound}\n    \\end{align}\n    \\footnotetext{If $\\cF_i$ defines a collection of linear models, then $\\log N(\\epsilon_i, \\cF_i, L_2(P_n)) \\leq \\l \\lceil \\frac{c_{i - 1}^2}{\\epsilon_i^2} \\r \\rceil$.}\n    Then, there exists an $\\epsilon$-cover $\\cC$ of $\\cF_r \\circ \\cdots \\circ \\cF_1$ for $\\epsilon = \\epsilon_r + \\kappa_r\\epsilon_{r-1} + \\cdots + \\kappa_r\\kappa_{r-1}\\dots\\kappa_2\\epsilon_1$ such that\n    \\begin{align}\n        \\log \\abs{\\cC} \\leq \\sum_{i=1}^{r} g\\left(\\epsilon_i, c_{i-1}\\right)\n    \\end{align}\n    \\label{lec10:lma:additive_cover}\n\\end{lemma}\n\\begin{figure}[ht!]\n\t\\begin{center}\n\t\t\\includegraphics[width=\\textwidth]{figures/multilayer_covering.png}\n\t\\end{center}\n\t\\caption{We visualize the covering strategy adopted in the proof of Lemma~\\ref{lec10:lma:additive_cover}. The two grey sets depict the output spaces of the first and second layers, namely, $\\cQ_1$ and $\\cQ_2$, respectively. The blue dots in $\\cQ_1$ are the outputs of three functions in the $\\epsilon_1$-cover $\\cC_1$, while the blue subsets of $\\cQ_2$ depict $\\cF_2 \\circ f_1'$ and $\\cF_2 \\circ f_1''$. The red circles show how we construct a covering, $\\cC_2$, of $\\cQ_2$. In particular, the two collections of red circles depict the $\\cC_{2, f_1'}$ and $\\cC_{2, f_1''}$ covers. Taking the union of such covers over all functions in $\\cC_1$ yields $\\cC_2$.}\n\t\\label{lec10:fig:multilayer-covering}\n\\end{figure}\n\\begin{proof}\nLet $\\epsilon_1,\\dots,\\epsilon_r$ be the radius for each layer. 
Let $\\cC_1$ be an $\\epsilon_1$-cover of $\\cF_1$. Then, for all $f_1' \\in \\cC_1$, we define $\\cC_{2, f_1'}$ as an $\\epsilon_2$-covering of  the set \n\\begin{equation}\n    \\cF_2 \\circ f_1' = \\left\\{f_2\\left(f_1'\\left(X\\right)\\right) : f_2 \\in \\cF_2 \\right\\}.\n\\end{equation}\nTaking a union of this covering over all $f_1' \\in \\cC_1$ clearly yields an $\\epsilon$-covering for $\\cF_2 \\circ \\cF_2$. In paricular, if \n\\begin{align}\n    \\cC_2 = \\bigcup_{f_1'\\in \\cC_1}\\cC_{2,f_1'},\n\\end{align} \nthen $\\cC_2$ is an $\\epsilon$-cover of $\\cF_2 \\circ \\cF_1$ with $\\epsilon = \\epsilon_1 \\cdot \\kappa_2 + \\epsilon_2$. We depict this covering procedure in Figure~\\ref{lec10:fig:multilayer-covering}, and we prove this claim rigorously in the sequel.\n\nNext, we bound the sizes of these covers. Directly applying the assumption given by \\eqref{lec10:eqn:single_cover_bound}, we conclude that\n\\begin{align}\n    \\log \\abs{\\cC_{2, f_1'}} \\leq g\\left(\\epsilon_2, c_1\\right).\n\\end{align}\nThen, because $\\cC_2 = \\bigcup_{f_1'\\in \\cC_1}\\cC_{2,f_1'}$, it immediately follows that\n\\begin{align}\n    \\abs{\\cC_{2}} &\\leq \\abs{\\cC_{1}} \\exp\\left(g\\left(\\epsilon_2, c_1\\right)\\right) \\label{lec10:eqn:iterative_cover_bound-1}\\\\\n    \\log\\abs{\\cC_{2}} &\\leq \\log\\abs{\\cC_{1}} + g\\left(\\epsilon_2, c_1\\right) \\\\\n    &\\leq g\\left(\\epsilon_1, c_0\\right) + g\\left(\\epsilon_2, c_1\\right).  \\label{lec10:eqn:iterative_cover_bound-3}\n\\end{align}\nSimilarly, given $\\cC_k$, for any $f_k' \\circ f_{k-1}' \\circ \\cdots \\circ f_1' \\in \\cC_k$, we construct a $\\cC_{k+1, f_k', \\dots, f_1'}$ that is an $\\epsilon_{k+1}$-covering of $\\cF_{k+1} \\circ f_k' \\circ \\cdots \\circ f_1'$. 
We similarly define \n\begin{equation}\n    \cC_{k+1} = \bigcup_{f_k' \circ \cdots \circ f_1' \in \cC_k} \cC_{k+1, f_k', \dots, f_1'}.\n\end{equation}\nThen, inducting on the argument given in \eqref{lec10:eqn:iterative_cover_bound-1}-\eqref{lec10:eqn:iterative_cover_bound-3}, we conclude that\n\begin{align}\n    \log \abs{\cC_{k+1}} \leq g\left(\epsilon_{k+1}, c_k\right) + \cdots + g\left(\epsilon_1, c_0\right)\n\end{align}\nNext, we show that for the above construction, the radius of the cover for $\cF$ is\n\begin{align}\n    \epsilon = \sum_{i=1}^{r} \left(\epsilon_i \prod_{j=i+1}^{r}\kappa_{j}\right).\n\end{align}\nFor any choice of $f_r \circ \cdots \circ f_1 \in \cF_r \circ \cF_{r-1} \circ \cdots \circ \cF_1$, by definition of $\cC_1$, there exists $f_1' \in \cC_1$ such that \n\begin{equation}\n    \rho(f_1, f_1') \leq \epsilon_1.\n\end{equation} \nSimilarly, we know there exists $f_2' \circ f_1' \in \cC_{2, f_1'}$ such that \n\begin{equation} \n    \rho\left(f_2' \circ f_1', f_2\circ f_1' \right) \leq \epsilon_2.\n\end{equation}\nWe can leverage these two facts and the triangle inequality to now prove that $f_2' \circ f_1'$ is close to $f_2 \circ f_1$. Namely,\n\begin{align}\n   \rho\left(f_2' \circ f_1', f_2 \circ f_1\right) &\leq \rho\left(f_2' \circ f_1', f_2 \circ f_1'\right) + \rho\left(f_2 \circ f_1', f_2 \circ f_1\right) &\text{(triangle ineq.)} \\ \n   &\leq \epsilon_2 + \rho\left(f_2 \circ f_1', f_2 \circ f_1\right) &\text{(def. of $\cC_{2, f'_1}$)}\\ \n   &\leq \epsilon_2 + \kappa_2 \rho\left(f_1', f_1\right) &\text{\eqref{lec10:eqn:lipschitz-def}}\\ \n   &\leq \epsilon_2 + \kappa_2\epsilon_1 &\text{(def. 
of $\\cC_{1}$)}\n\\end{align}\nInducting to prove this argument for arbitrary $k$, we similarly apply the definition of $\\cC_{k, f'_{k - 1},\\dots,f'_1}$ to conclude that there exists $f'_{k} \\circ f'_{k - 1} \\circ \\cdots \\circ f'_1 \\in \\cC_k$ such that\n\\begin{equation}\n    \\rho(f'_k \\circ f'_{k - 1} \\circ \\cdots f'_1, f_k \\circ f'_{k - 1} \\circ \\cdots f'_1) \\leq \\epsilon_k\n\\end{equation}\nThen, expanding using the triangle inequality and peeling off terms by applying the definition of our $\\epsilon_i$-coverings, we again show that\n\\begin{align}\n    \\rho\\left(f_k' \\circ f_{k-1}' \\circ \\cdots \\circ f_1', f_k \\circ \\cdots \\circ f_1\\right) &\\leq \\rho\\left(f_k' \\circ f_{k-1}'\\circ \\cdots \\circ f_1', f_k \\circ f_{k-1}'\\circ \\cdots \\circ f_1' \\right) \\\\ \n    &\\quad + \\rho\\left(f_k \\circ f_{k-1}'\\circ f_{k-2}' \\circ \\cdots \\circ f_1', f_k \\circ f_{k-1}\\circ f_{k-2}' \\circ \\cdots \\circ f_1'\\right) \\nonumber \\\\ \n    &\\quad + \\cdots + \\rho\\left(f_k \\circ f_{k-1}\\circ \\cdots \\circ f_2 \\circ f_1', f_k \\circ f_{k-1}\\circ \\cdots \\circ f_1\\right) \\nonumber \\\\ \n    &\\leq \\rho\\left(f_k' \\circ f_{k-1}'\\circ \\cdots \\circ f_1', f_k \\circ f_{k-1}'\\circ \\cdots \\circ f_1' \\right) \\\\\n    &\\quad + \\kappa_{k} \\cdot \\rho(f'_{k - 1} \\circ \\cdots \\circ f'_1, f_{k - 1} \\circ f'_{k - 2} \\circ \\cdots \\circ f'_1) \\\\\n    &\\quad + \\cdots + \\left(\\prod_{j = 2}^k \\kappa_j\\right) \\rho(f'_1, f_1) \\nonumber  \\\\\n    & \\leq \\sum_{i=1}^{k} \\left(\\epsilon_i\\prod_{j=i+1}^{k}\\kappa_{j}\\right).\n\\end{align}\nNote that the first inequality follows by the triangle inequality, the second by the $\\kappa_i$-Lipschitz continuity of $f_i$, and the third by applying the definition of each of our $\\epsilon_i$-covers.\n\\end{proof}\n\n\\begin{proof}[Proof of Theorem~\\ref{lec10:thm:dnn_rademacher}]\nWe now apply Lemma~\\ref{lec10:lma:additive_cover} to dense neural networks. 
Dense neural networks consist of a composition of layers, where each layer is a linear model composed with a 1-Lipschitz activation. Using Theorem~\ref{lec9:thm:multivariate_rad} along with the property that 1-Lipschitz functions will only contribute a factor of at most $1$ (Lemma~\ref{lec9:lma:talagrand}), the covering number of each layer can be bounded by:\n\begin{align}\n    g\left(\epsilon_i, c_{i-1}\right) = \tilde{O}\left(\frac{c_{i-1}^2b_i^2}{\epsilon_i^2}\right),\n\end{align}\nwhere $c_{i-1}$ is a bound on the norm of the inputs to the $i$-th layer, $b_i$ is the bound on $\norm{W_i^\top}_{2,1}$, and $\epsilon_i$ is the radius of the cover. From Lemma~\ref{lec10:lma:additive_cover}, we know that \n\begin{align}\n    \log N(\epsilon, \cF, \rho) &= \tilde{O}\left(\sum_{i=1}^{r}\frac{c_{i-1}^2b_i^2}{\epsilon_i^2}\right) \n\end{align}\nfor\n\begin{align}\n    \epsilon &= \sum_{i=1}^{r} \left(\epsilon_i \prod_{j=i+1}^{r}\kappa_j\right)\n\end{align}\n\nWe now have a bound on $N(\epsilon, \cF, \rho)$ that relies on $\epsilon_i$'s, but $N(\epsilon, \cF, \rho)$ should only be a function of $\epsilon$. Since we already know that $\epsilon = \sum_{i=1}^{r} \left(\epsilon_i \prod_{j=i+1}^{r}\kappa_j\right)$, we keep $\epsilon$ constant and optimize the upper bound of $N(\epsilon, \cF, \rho)$ over different choices of $\epsilon_i$. To find the optimal $\epsilon_i$, we will first find a lower bound on $N(\epsilon, \cF, \rho)$. We then choose $\epsilon_i$ so that this lower bound is achieved. Ultimately, our optimized $\epsilon_i$ yields a bound on the covering number of the following form: $\log\left(N\left(\epsilon, \cF, \rho\right)\right) \leq \frac{R}{\epsilon^2}$, where $R$ is some constant independent of $\epsilon$. \n\nWe derive this lower bound using Holder's inequality, which states that\n\begin{align}\n    \langle a,  b \rangle \leq \|a\|_p \|b\|_q\n\end{align}\nwhen $\frac{1}{p} + \frac{1}{q} = 1$. 
Writing out the vectors $a, b$, we get that \n\begin{align}\n    \sum_{i}a_ib_i \leq \left(\sum a_i^p\right)^{\frac{1}{p}}\left(\sum b_i^q\right)^{\frac{1}{q}}\n\end{align}\n\nLet $\alpha_i^2 = c_{i-1}^2b_i^2, \beta_i = \prod_{j=i+1}^{r}\kappa_j$. By Holder's inequality, using $p = 3, q = \frac{3}{2}$, we get\n\begin{align}\n    \left(\sum_{i=1}^{r}\frac{\alpha_i^2}{\epsilon_i^2}\right)\left(\sum_{i=1}^{r}\beta_i\epsilon_i\right)^2 &\geq \left(\sum_{i=1}^{r}\left(\alpha_i\beta_i\right)^{\frac{2}{3}}\right)^{3}\n\end{align}\n\begin{align}\n    \sum_{i=1}^{r}\frac{\alpha_i^2}{\epsilon_i^2} &\geq \frac{R}{\epsilon^2},\n\end{align}\nwhere $R = \left(\sum_{i=1}^{r}\left(c_{i-1}b_i\prod_{j=i+1}^{r}\kappa_j\right)^{\frac{2}{3}}\right)^{3}$. We note that equality holds when \n\begin{align}\n    \epsilon_i = \left(\frac{c_{i-1}^2b_i^2}{\prod_{j=i+1}^{r}\kappa_j}\right)^{\frac{1}{3}} \cdot \underbrace{\frac{\epsilon}{c^{\frac{2}{3}}\left(\sum_{i=1}^{r}\frac{b_i^{\frac{2}{3}}}{\kappa_i^{\frac{2}{3}}}\right)\prod_{i=1}^{r}\kappa_i^{\frac{2}{3}} }}_{\epsilon'} \label{eqn:lec10:holder_eps_defn}\n\end{align}\nUsing this choice of $\epsilon_i$ and letting $\epsilon'$ equal the second factor in \eqref{eqn:lec10:holder_eps_defn} for notational convenience, we know that the log covering number is (up to a constant factor):\n\al{\n    \sum_{i=1}^r \frac{c_{i-1}^2b_i^2}{\epsilon_i^2} &= \sum_{i=1}^r \frac{c_{i-1}^2b_i^2(\kappa_{i+1}\cdots\kappa_r)^\frac{2}{3}}{c_{i-1}^\frac{4}{3}b_i^\frac{4}{3}(\epsilon')^2} \\\n    &= \sum_{i=1}^r (c_{i-1}b_i\kappa_{i+1}\cdots\kappa_r)^\frac{2}{3}\frac{1}{(\epsilon')^2} \\\n    &= c^\frac{2}{3}\sum_{i=1}^r \left(\frac{b_i}{\kappa_i}\right)^\frac{2}{3} \prod_{i=1}^r \kappa_i^\frac{2}{3} \frac{\left(c^\frac{2}{3}\left(\sum_{i=1}^r (\frac{b_i}{\kappa_i})^\frac{2}{3} \prod_{i=1}^r 
\\kappa_i^\\frac{2}{3}\\right)\\right)^2}{\\epsilon^2} \\\\\n    &= \\left(c^\\frac{2}{3}\\sum_{i=1}^r \\left(\\frac{b_i}{\\kappa_i}\\right)^\\frac{2}{3} \\prod_{i=1}^r \\kappa_i^\\frac{2}{3}\\right)^3\\frac{1}{\\epsilon^2} \\\\\n    &= c^2\\prod_{i=1}^r \\kappa_i^2\\left(\\sum_{i=1}^r \\left(\\frac{b_i}{\\kappa_i}\\right)^\\frac{2}{3}\\right)^3\\frac{1}{\\epsilon^2}.\n}\nSince this log covering number is of the form $R / \\epsilon^2$, we can apply \\eqref{lec9:eqn:rademacherbound_three} and conclude that\n\\al{\n    \\mathcal{R}_S(\\cF) \\lesssim \\sqrt\\frac{R}{n}\n}\nLast, plugging in\n\\al{\n    R = c^2\\prod_{i=1}^r \\kappa_i^2\\left(\\sum_{i=1}^r \\left(\\frac{b_i}{\\kappa_i}\\right)^\\frac{2}{3}\\right)^3\n}\nwe obtain the desired result\n\\al{\n    \\mathcal{R}_S(\\cF) \\lesssim \\frac{c}{\\sqrt n}\\prod_{i=1}^r \\kappa_i\\left(\\sum_{i=1}^r \\left(\\frac{b_i}{\\kappa_i}\\right)^\\frac{2}{3}\\right)^\\frac{3}{2}.\n}\n\n\\end{proof}\n\n\\chapter{Data-dependent Generalization Bounds for Deep Nets}\\label{sec:deep_nets_data_dependent}\n\nIn Theorem~\\ref{lec10:thm:dnn_rademacher}, we proved the following bound on the Rademacher complexity of deep neural networks:\n\\begin{align}\n    R_S(\\cF) \\leq \\prod_{i = 1}^r \\norm{W_i}_{\\text{op}} \\cdot \\mathsf{poly}(\\norm{W_1}, \\dots, \\norm{W_r}).\n\\end{align}\nThis bound, however, suffers from multiple deficiencies. In particular, it grows exponentially in the depth, $r$, of the network and $\\norm{W_i}_{\\text{op}}$ measures the worst-case Lipschitz-ness of the network layers over the input space. %As a consequence, the bound fails to accurately predict the good generalization properties of deep nets.\n\nIn this section, we obtain a tighter generalization bound that depends upon the realized Lipschitz-ness of the model on the training data. To further motivate this approach, we also note that stochastic gradient descent, i.e. 
the optimization method typically used to fit deep neural networks, prefers models that are more Lipschitz (see Chapter (TBD) for further discussion) \tnotelong{add references later}. This preference must be realized by the model \emph{on empirical data}, however, as no learning algorithm has access to the model's properties over the entire data space.\n\nUltimately, we aim to prove a tighter bound on the population loss that grows polynomially in the Lipschitz-ness of $f$ on the empirical data. Namely, given that $f$ is parameterized by some $\theta$, we hope to derive a bound on the population loss at $\theta$ that is a \emph{polynomial} function of the Lipschitz-ness of $f$ on $x\sp{1},\dots,x\sp{n}$ as well as the norm of $\theta$.\n\n\paragraph{Uniform convergence with a data-dependent hypothesis class.}\n%Classical uniform convergence does not have a single consistent definition. \nSo far in this course, given some complexity measure we denote as $\text{comp}(\cdot)$, our uniform convergence results always appear in one of the two following forms (which are essentially equivalent). Namely, with high probability,\n\begin{align}\n\forall f\in \cF, ~~L(f) &\leq \frac{\text{comp}(\cF)}{\sqrt{n}} &&\text{(I)} \\\n\forall f, ~~ L(f) &\leq \frac{\text{comp}(f)}{\sqrt{n}}  &&\text{(II)}\n\end{align}\n\n\begin{remark}\n    Most of the results we have obtained so far are of type I, e.g. with $\text{comp}(\cF)/\sqrt{n} = R_n(\cF)$. We obtain results of type II by considering a restricted set of functions $\cF_C = \{f : \text{comp}(f) \leq C\}$. We then apply a type I bound to $\cF_C$ and take a union bound over all $C$. Therefore, these two types of bounds are essentially equivalent (up to a small additive factor difference due to the additional union bound over the choices of $C$.)\n\end{remark}\n\nNote, however, that neither of these approaches produce bounds that depend upon the data. 
By contrast, in the sequel, we will derive a new \textit{data-dependent} generalization bound. These bounds state that with high probability over the choice of the empirical data, for all functions $f\in \cF$,\n\begin{align}\n    L(f) \leq \text{comp}\left (f, \{(x\sp{i}, y\sp{i})\}_{i = 1}^n\right)\n\end{align}\nEven though the complexity measure depends on the training data, and is thus a random variable by itself, it can be used as a regularizer which can be added to the original training loss.\n\n\begin{remark}\nAlthough there is no universal consensus on the type of generalization bound we should derive, we can argue that there is no way to leverage more information in a generalization bound beyond the empirical data. For example, one might try to use the input distribution $P$ to define the complexity measure, but if we allowed ourselves access to $P$, we could just define $\text{comp}(f, P) = \Exp_P[f(X)]$. In some sense, defining a generalization bound using the true distribution amounts to cheating, and the dependence on the empirical data seems to be proper because the bound can still be used as a regularizer. %, so it becomes difficult to define a distribution-dependent generalization bound in a principled way.\n\end{remark}\n\nIn this new paradigm, we can no longer take the previous approach of obtaining type I bounds and then derive a type II bound via a reduction. To see why, suppose that we have the hypothesis class\n\begin{align}\n    \cF_C = \{f: \text{comp}(f, \{(x\sp{i}, y\sp{i})\}_{i=1}^n) \leq C\}\n\end{align}\nIf our complexity measure depends on the empirical data, then so does our hypothesis class $\cF_C$, which makes $\cF_C$ itself a random variable. However, our theorems regarding Rademacher complexity require that the hypothesis class be fixed before we ever see the empirical data.\n\nWe may hope to get around this by changing the way we think about uniform convergence. 
Consider the simplified case where our new complexity measure is separable, i.e.\n\begin{align}\n    \text{comp}(f, \{(x\sp{i}, y\sp{i})\}_{i=1}^n) = \sum_{i=1}^n h(f, x\sp{i}),\n\end{align}\nfor some function $h$. Then we can consider an \textit{augmented loss}:\n\begin{align}\n    \tilde{\ell}(f) = \ell(f) \ind{h(f, x\sp{i}) \leq C} \label{eqn:5}\n\end{align}\n\begin{figure}[ht]\n    \centering\n    \begin{tikzpicture}\n        \draw[->] (0, 0) -- (5, 0) node[right] {$\theta$};\n        \draw[->] (0, 0) -- (0, 5) node[above] {loss};\n        \draw[scale=0.5, domain=0:6, smooth, ultra thick, variable=\x, blue] plot (\x, {7 - \x + sin(\x r)});\n        \draw[scale=0.5, domain=0:5.5, smooth, ultra thick, variable=\x, green] plot (\x, {10 - 1.6*\x + sin(0.90*\x r)});\n        \draw[scale=0.5, domain=6:10, smooth, ultra thick, variable=\x, blue] plot (\x, {\x - 5 + sin(\x r)}) node[right] {test};\n        \draw[scale=0.5, domain=5.5:10, smooth, ultra thick, variable=\x, green] plot (\x, {1.4*\x - 6.67 + sin(\x r)}) node[right] {train};\n        \draw[dashed] (2, 0) -- (2, 5);\n        \draw[dashed] (3.5, 0) -- (3.5, 5);\n        \draw [decorate,decoration={brace,amplitude=5pt,mirror,raise=2ex}]\n        (2,0) -- (3.5,0) node[midway,yshift=-2em]{low-complexity params};\n    \end{tikzpicture}\n    \caption{These curves depict a ``low-complexity'' region in parameter space. The \textcolor{blue}{blue} curve is the unobserved test loss we aim to bound, while the \textcolor{green}{green} curve denotes the empirical training loss we observe. Observe that in the region of $\theta$ that we identify as being ``low-complexity,'' the gap between the train and test losses is smaller than in the high-complexity regions.}\n    \label{lec11:fig:low_vs_high_complexity}\n\end{figure}\nSuppose we have a region of low complexity in our existing loss function as depicted in Figure~\ref{lec11:fig:low_vs_high_complexity}. 
Because this region is random, we cannot selectively apply uniform convergence. However, we can use our new surrogate loss function $\tilde{\ell}$ in that region. By modifying the loss function in this way, we can still fix the hypothesis class ahead of time, allowing us to apply existing tools to $\tilde{\ell}(f)$. The surrogate loss was used in~\cite{wei2019data} to obtain a data-dependent generalization bound, though there are possibly various other ways to define surrogate losses and apply existing uniform convergence guarantees. In the sequel, we introduce a particular surrogate ``margin'' that allows us to cleanly apply our previous results to an (implicitly) data-dependent hypothesis class \cite{wei2019data}.\n\n\sec{All-layer margin} \label{sec:all_layer_margin}\nWe next introduce a new surrogate loss called the \textit{all-layer margin} that can also be thought of as a surrogate margin. This loss will essentially zero out high-complexity regions so that we may focus on low-complexity regions for which we can expect a small generalization gap. Note that the all-layer margin we analyze will not explicitly zero-out high-complexity regions using an indicator function, but instead implicitly takes into account some data-dependent characteristics of the model. Once we adopt this new loss function, we will be able to apply some of our earlier methods.\n\nLet $f: \R^d \to \R$ be a classification model. Recall that the standard margin is defined as $y f(x)$, with $y$ in $\{-1, 1\}$. 
We will say that $g_f(x, y)$ is a \textit{generalized margin} if it satisfies\n\begin{align}\n    g_f(x, y) = \begin{cases}\n0,& \text{ if } f(x)y \leq 0 \text{ (an incorrect classification)}\\\n> 0,& \text{ if } f(x)y > 0 \text{ (a correct classification)}\n\end{cases}.\n\end{align}\n%That is, the generalized margin ``zeroes out\" incorrect classifications.\nTo simplify the exposition of the machinery below, we also introduce the \textit{$\infty$-covering number} $N_\infty(\epsilon, \cF)$ as the minimum cover size with respect to the metric $\rho$ defined as the infinity-norm distance on an input domain $\cX$: \n\begin{equation}\n\rho(f, f') \triangleq \sup_{x \in \mathcal{X}} |f(x) - f'(x)| \triangleq \|f - f'\|_\infty.\footnote{If $f$ maps $\cX$ to multi-dimensional outputs, we will define $\rho(f, f') \triangleq \sup_{x \in \mathcal{X}} \|f(x) - f'(x)\| \triangleq \|f - f'\|_\infty$ where the norm in $\|f(x) - f'(x)\|$ is a norm in the output space of $f$ (which will be the Euclidean norm in the rest of this section).}\n\end{equation}\n\begin{remark}\n    Notice that $N_\infty(\epsilon, \cF) \geq N(\epsilon, \cF, L_2(P_n))$. This is because $\rho = L_\infty(\cX)$ is a more demanding measure of error: $f$ and $f'$ must be close on \textit{every} input, not just the empirical data. That is,\n    \begin{equation}\n    \sqrt{\frac{1}{n} \sum_{i=1}^n (f(x_i) - f'(x_i))^2} \leq \sup_{x \in \mathcal{X}} |f(x) - f'(x)|. \label{lec11:eqn:l_inf_vs_l2pn}\n    \end{equation}\n\end{remark}\n\n\begin{lemma}\nSuppose $g_f$ is a generalized margin. Let $\cG = \{g_f: f \in \mathcal{F}\}$. 
Suppose that for some $R$, $\log N_\infty(\epsilon, \cG) \leq \lfloor \frac{R^2}{\epsilon^2} \rfloor$ for all $\epsilon > 0$.\footnote{Recall that this is the worst dependency on $\epsilon$ that we can tolerate when converting covering number bounds to Rademacher complexity.} Then, with high probability over the randomness in the training data, for every $f$ in $\mathcal{F}$ that correctly predicts all the training examples,\n\begin{equation}\nL_{01}(f) \leq \tilO \l (\frac{1}{\sqrt{n}} \cdot \frac{R}{\min_{i \in [n]} g_f(x\sp{i}, y\sp{i})} \r ) + \tilO\l (\frac{1}{\sqrt{n}}\r ).\n\end{equation}\n\label{lec11:genmargin-lemma}\n\end{lemma}\n\n\begin{proof}\nThe high-level idea of our proof is to replace $\cF$ with $\cG$ before repeating the standard margin theory argument from Section~\ref{sec:formal_margin}.\n\nLet $\ell_\gamma$ be the ramp loss given in \eqref{lec6:eqn:ramp_loss}, which is 1 for negative values, 0 for values greater than $\gamma$, and a linear interpolation between 1 and 0 for values between 0 and $\gamma$. \nWe define the surrogate loss as $\hat{L}_\gamma(\theta) = \frac{1}{n} \sum_{i = 1}^n \ell_\gamma(g_{f_\theta}(x\sp{i}, y\sp{i}))$, and the surrogate population loss as $L_\gamma(\theta) = \Exp[\ell_\gamma(g_{f_\theta}(x, y))]$. 
Applying Corollary~\ref{lec6:cor:ggap-rsbound}, where we used the Rademacher complexity to control the generalization error, we conclude that\n\begin{equation}\nL_\gamma(\theta) - \hat{L}_\gamma(\theta) \leq R_S(\ell_\gamma \circ \cG) + \tilO\l (\frac{1}{\sqrt{n}}\r ).\n\end{equation}\nNext we observe that \n\begin{align}\n    \log N(\epsilon, \ell_\gamma \circ \cG, L_2(P_n)) &\leq \log N(\epsilon\gamma, \cG, L_2(P_n)) &\text{(Lemma~\ref{lec9:lma:talagrand})} \\\n    &\leq \log N_\infty(\epsilon\gamma, \cG) &\text{\eqref{lec11:eqn:l_inf_vs_l2pn}} \\\n    &\leq \l \lfloor \frac{R^2}{\epsilon^2 \gamma^2} \r \rfloor &\text{(by assumption)}.\n\end{align}\nThen, using our results relating the log of the covering number to a bound on the Rademacher complexity (recall \eqref{lec9:eqn:rademacherbound_three} and Theorem~\ref{lec9:thm:better-dudley}), we conclude that $R_S(\ell_\gamma \circ \cG) \leq \tilO\l (\frac{R}{\gamma \sqrt{n}}\r )$.\nTake $\gamma = \gamma_{\min} = \min_{i} g_{f_\theta}(x\sp{i}, y\sp{i})$.\footnote{A caveat: because $\gamma$ is a random variable, proving this result rigorously requires taking a union bound over a discretized $\gamma$. We sketched out this argument more thoroughly in Remark~\ref{lec7:rmk:union_bound_margin}.} Using Corollary~\ref{lec6:cor:ggap-rsbound}, we conclude that $L_{\gamma_\text{min}} (\theta) \leq 0 + \tilO\l (\frac{R}{\sqrt{n} \cdot \gamma_\text{min}} \r ) + \tilO\l (\frac{1}{\sqrt{n}}\r )$, as desired.\n\end{proof}\nFor which $g_f$ can we bound the covering number? If we take $g_f(x, y) = yf(x)$, then the covering number depends on the product $\prod_i \norm{W_i}_{\text{op}}$, but we originally set out to do better than this. If we have a linear model $w^\top x$, the normalized margin, $\frac{y \cdot w^\top x}{\norm{w}}$, governs the generalization performance. But how do we normalize for more general models? 
\n\nFor a deep neural net, a potential normalizer is the product of the Lipschitz constants of the layers. However, we do not want to normalize by a constant that depends only on the function class, so we take a different approach. We interpret the normalized margin as the solution to the following optimization problem:\n\\begin{equation}\n    \\begin{aligned}\n        \\min_\\delta \\quad & \\norm{\\delta}_2 \\\\\n        \\textrm{s.t.} \\quad & w^\\top(x + \\delta) y \\leq 0\n    \\end{aligned}\n\\end{equation}\nIn plain English, this problem searches for the minimum perturbation that gets our data point across the boundary.\n\nThis perturbation view of the standard margin can be extended naturally to multiple layers. For the math to work, it turns out that we need to perturb all the layers. We define the \\textit{all-layer margin} as below. We will consider perturbed models $\\delta = (\\delta_1, \\dots, \\delta_r)$, where each $\\delta_i$ is a perturbation \\textit{vector} associated with the $i$-th layer (and it has the same dimensionality as the $i$-th layer activation). We incorporate these perturbations into our model in the following way (so that we can handle the scaling in a clean way):\n\\begin{align}\n    h_1(x, \\delta) &= W_1 x + \\delta_1 \\cdot \\norm{x}_2 \\\\\n    h_2(x, \\delta) &= \\sigma(W_2 h_1(x, \\delta)) + \\delta_2 \\cdot \\norm{h_1(x, \\delta)}_2 \\\\\n    &\\vdots \\nonumber \\\\\n    f(x, \\delta) = h_r(x, \\delta) &= \\sigma(W_r h_{r - 1}(x, \\delta)) + \\delta_r \\cdot \\norm{h_{r - 1}(x, \\delta)}_2.\n\\end{align}\nWe can then ask: what was the smallest perturbation that changed our decision? That is, let\n\\begin{align}\n    m_f(x, y) \\defeq \\min_\\delta \\sqrt{\\sum_{i=1}^r ||\\delta_i||_2^2} \\quad \\text{s.t.} \\quad f(x, \\delta) y \\leq 0,\n\\end{align}\ni.e. the smallest perturbation that yields incorrect predictions.\n\nInformally, $m_f(x, y)$ is a measure of how hard it is to perturb the model $f$. 
$f$ can be hard to perturb for two reasons: $f$ is Lipschitz (in its intermediate layers) and/or $yf(x)$ is large. In other words, the all-layer margin is a normalized version of the standard margin, normalized by the Lipschitzness of the model at the particular data point $(x,y)$.  %Even more informally, large margins imply confidence in our predictions, and so it becomes harder to change the model's mind.\n\nWe now introduce our main result regarding the all-layer margin.\n\begin{theorem} \label{lec11:thm:poly_gen_bound_deep_nets}\nWith high probability, for all $f$ with training error $0$,\n\begin{equation}\nL_{01}(f) \leq \tilO\l (\frac{1}{\sqrt{n}} \cdot \frac{\sum_{i=1}^r \norm{W_i}_{1, 1}}{\min_{i \in [n]} m_f(x\sp{i}, y\sp{i})}\r ) + \tilO\l (\frac{r}{\sqrt{n}}\r ),\n\end{equation}\nwhere\n$\norm{W}_{1, 1}$ is the sum of the absolute values of the entries of W.\n\end{theorem}\nIn summary, robustness to perturbations in intermediate layers implies good generalization. We will interpret the bound, compare the bounds with previous works, and discuss further extensions in the remarks following the proofs of the theorem.  (E.g., in Remark~\ref{remark:1}, we will argue that this bound is strictly better than the one we obtained in Theorem~\ref{lec10:thm:dnn_rademacher}; in the worst case, we still have that $\frac{1}{m_f(x, y)} \leq \frac{\prod \norm{W_i}_{\text{op}}}{f(x)}$.)\n\nTo prove this theorem, it suffices to bound $\log N_\infty(\epsilon, \cG)$ by $O\l (\frac{(\sum_i \norm{W_i}_{1, 1})^2}{\epsilon^2}\r )$ and apply Lemma~\ref{lec11:genmargin-lemma}. Towards this goal, let $\cF_i = \{ z \mapsto \sigma (W_i z) : \norm{W_i}_{1, 1} \leq \beta_i \}$. Then, $\cF = \cF_r \circ \cF_{r-1} \circ \cdots \circ \cF_1$. \n\n\begin{lemma}[Decomposition Lemma]\label{lec11:lma:decomp}\nLet $m \circ \cF$ denote $\{m_f : f \in \cF \}$. 
Then, \n\\begin{equation}\n\\log N_\\infty\\l (\\sqrt{\\sum_{i=1}^r \\epsilon_i^2}, m \\circ \\cF \\r ) \\leq \\sum_{i=1}^r \\log N_\\infty(\\epsilon_i, \\cF_i),\n\\end{equation}\nwhere $N_\\infty(\\epsilon_i, \\cF_i)$ is defined with respect to the input domain $\\mathcal{X} = \\{x : \\norm{x}_2 \\leq 1 \\}$.\n\\end{lemma}\n\nThat is, we only have to find the covering number for each layer, and then we have the covering number for the (all-layer margin of the) composed function class. Notice that we bounded the covering number of $m \\circ \\cF$ in the above lemma, not $\\cF$.\n\nThen, the desired result follows directly from the preceding decomposition lemma.\n\\begin{corollary} Assume that $\\log N_\\infty(\\epsilon_i, \\cF_i) \\leq \\l \\lfloor \\frac{c_i^2}{\\epsilon_i^2} \\r \\rfloor$ for every $\\cF_i$, i.e. the function class corresponding to the $i$-th layer of $f$ in Theorem~\\ref{lec11:thm:poly_gen_bound_deep_nets}. Then, by taking $\\epsilon_i = \\epsilon \\cdot \\frac{c_i}{\\sqrt{\\sum_i c_i^2}}$, we have that\n\\begin{equation}\n    \\log N_\\infty(\\epsilon, m \\circ \\cF) \\leq \\frac{\\sum_i c_i^2}{\\epsilon^2}.\n\\end{equation}\n\\end{corollary}\nThis result gives the complexity of the composed model in terms of the complexity of the layers, with each $c_i$ given by $\\norm{W_i}_{1, 1}$. For linear models, we can show $N_\\infty(\\epsilon_i, \\cF_i) \\leq \\tilO\\l (\\frac{\\beta_i^2}{\\epsilon^2} \\r )$ (where $\\beta_i$ is a bound on $\\norm{W_i}_{1, 1}$), and this implies Theorem~\\ref{lec11:thm:poly_gen_bound_deep_nets}\\footnote{Technically, we also need to union bound over the choices of $\\beta_i$, which can also be achieved following Remark~\\ref{lec7:rmk:union_bound_margin}.} Finally, we are only left with the proof of Lemma~\\ref{lec11:lma:decomp}. 
\n\n\\begin{proof}[Proof of Lemma~\\ref{lec11:lma:decomp}]\nNow we will prove a limited form of the decomposition lemma for affine models: $\\cF_i = \\{ z \\mapsto \\sigma(W_i z): \\norm{W_i}_{1, 1} \\leq \\beta_i \\}$. There are two crucial steps to this problem. First, we will prove that $m_f(x, y)$ is 1-Lipschitz in $f$. That is, for all $\\cF = \\cF_r \\circ \\cF_{r-1} \\circ \\cdots \\circ \\cF_1$ and $\\cF' = \\cF_r' \\circ \n\\cF_{r-1}' \\circ \\cdots \\circ \\cF_1'$,\n\\begin{align}\n    \\abs{m_f(x, y) - m_{f'}(x, y)} \\leq \\sqrt{\\sum_{i=1}^r \\max_{\\norm{x}_2 \\leq 1} \\norm{f_i(x) - f_i'(x)}_2^2}. \\label{lec11:eqn:one_lipschitz_claim}\n\\end{align}\nNotice that now we are working with a clean sum of differences, with no multipliers! \n\nSecond, we construct a cover: Let $U_1, \\dots, U_r$ be $\\epsilon_1, \\dots, \\epsilon_r$-covers of $\\cF_1, \\dots, \\cF_r$, respectively, such that $\\abs{U_i} = N_\\infty(\\epsilon_i, \\cF_i)$. By definition, for all $f_i$ in $\\cF_i$, there exists a $u_i \\in U_i$ such that $\\max_{\\norm{x} \\leq 1} \\norm{f_i(x) - u_i(x)}_2 \\leq \\epsilon_i$. Take $U = U_r \\circ U_{r-1} \\circ \\cdots \\circ U_1 = \\{u_r \\circ u_{r-1} \\circ \\cdots \\circ u_1 \\}$ as the cover for $m \\circ \\cF$. Suppose we were given $f = f_r \\circ \\cdots \\circ f_1 \\in \\cF$. Let $u_r, \\dots, u_1$ be the nearest neighbors of $f_r, \\dots, f_1$. Then\n\\begin{align}\n|m_f(x, y) - m_u(x, y)| &\\leq \\sqrt{\\sum_{i=1}^r \\max_{||x|| \\leq 1} ||f_i(x) - u_i(x)||_2^2} \\\\\n&\\leq \\sqrt{\\sum_{i=1}^r \\epsilon_i^2} &&\\text{(by construction).}\n\\end{align}\n\nHaving established the validity of our cover, we now return to our claim of 1-Lipschitz-ness stated in \\eqref{lec11:eqn:one_lipschitz_claim}. By symmetry, it is sufficient to prove an upper bound for $m_{f'}(x, y) - m_f(x, y)$.\n\nLet $\\delta_1^*, \\dots, \\delta_r^*$ be the optimal choices of $\\delta$ in defining $m_f(x, y)$. 
Our goal is to turn these into a feasible solution of $m_{f'}(x, y)$, which we denote by $\\hat{\\delta}_1, \\dots, \\hat{\\delta}_r$. If this solution is feasible, we obtain the bound $m_{f'}(x, y) \\leq \\sqrt{\\sum \\norm{\\hat{\\delta}_i}^2_2}$.\n\nIntuitively, we want to define a perturbation for $f'$ that does the same thing as $\\delta_1^*,\\dots,\\delta_r^*$ for $f$. In plain English, $(f', \\hat{\\delta}_1, \\dots, \\hat{\\delta}_r)$ should do the same thing as $(f, \\delta_1^*, \\dots, \\delta_r^*)$. Recall that $f$ has parameters $W_1, \\dots, W_r$ and $f'$ has parameters $W_1', \\dots, W_r'$. Then, under the optimal perturbation,\n\\begin{align}\n    h_1 &= W_1 x + \\delta_1^* \\norm{x}_2 \\\\\n    h_2 &= \\sigma(W_2 h_1) + \\delta_2^* \\norm{h_1}_2 \\\\\n    &\\vdots \\nonumber \\\\\n    h_r &= \\sigma(W_r h_{r - 1}) + \\delta_r^* \\norm{h_{r - 1}}_2\n\\end{align}\nWe want to imitate this by perturbing $f'$ in some way. In particular, let\n\\begin{equation}\n    h_1 = W_1'x + \\underbrace{\\delta_1^* \\norm{x}_2 + (W_1 - W_1')x}_{\\defeq \\text{ }\\hat{\\delta}_1 \\norm{x}_2},\n\\end{equation}\nwhere the last term serves to compensate for the difference between $W_1$ and $W_1'$. Thus, $\\hat{\\delta}_1 \\defeq \\delta_1^* + \\frac{(W_1 - W_1')x}{\\norm{x}_2}$.\nWe repeat this argument for every layer. Using the second layer as an example, \n\\begin{align}\n    h_2 &= \\sigma(W_2' h_1) + \\underbrace{\\delta_2^*\\norm{h_1}_2 + \\sigma(W_2 h_1) - \\sigma(W_2' h_1)}_{\\defeq \\text{ }\\hat{\\delta}_2 \\norm{h_1}_2}.\n\\end{align}\nSo, $\\hat{\\delta}_2 = \\delta_2^* + \\frac{\\sigma(W_2 h_1) - \\sigma(W_2' h_1)}{\\norm{h_1}_2}$. In general, \n\\begin{align}\n    \\hat{\\delta}_i \\defeq \\delta_i^* + \\frac{\\sigma(W_ih_{i-1}) - \\sigma(W_i' h_{i-1})}{\\norm{h_{i-1}}_2}\n\\end{align} \n\nThen $\\hat{\\delta}_1,\\dots, \\hat{\\delta}_r$ on $f'$ are making the same predictions as $\\delta_1^*, \\dots, \\delta_r^*$ on $f$. 
Last, observe that\n\\begin{align}\n    m_{f'}(x, y) &\\leq \\sqrt{\\sum ||\\hat{\\delta}_i||_2^2} \\\\\n    &\\leq \\sqrt{\\sum \\norm{\\delta_i^*}_2^2} + \\sqrt{\\sum_{i = 1}^r \\left (\\frac{\\sigma(W_i h_{i-1}) - \\sigma(W_i' h_{i-1})}{\\norm{h_{i-1}}_2} \\right)^2 } &\\text{(Minkowski's Ineq.)\\footnotemark}\\\\\n    &\\leq m_f(x, y) + \\sqrt{\\sum_{i=1}^r \\max_{\\norm{x}_2 \\leq 1} (\\sigma(W_i x)-\\sigma(W'_i x))^2} \\label{lec11:eqn:l2_constraint} \\\\\n    &= m_f(x, y) + \\sqrt{\\sum_{i=1}^r \\max_{\\norm{x}_2 \\leq 1} (f_i(x)-f_i'(x))^2}\n\\end{align} \n\\footnotetext{Minkowski's inequality, which states that $\\sqrt{\\sum \\norm{a_i + b_i}_2^2} \\leq \\sqrt{\\sum \\norm{a_i}_2^2} + \\sqrt{\\sum \\norm{b_i}_2^2}$. In this setting, this inequality can also be proved using Cauchy-Schwarz.}\nNote that in \\eqref{lec11:eqn:l2_constraint}, constraining $\\norm{x}_2 \\leq 1$ is equivalent to dividing by the $\\ell_2$-norm of $x$.\n\\end{proof}\n\n\\begin{remark}\\label{remark:1}\nWe can compare the above with Theorem~\\ref{lec10:thm:dnn_rademacher} proven in \\cite{bartlett2017}.\n\\begin{equation}\n\\begin{split}\nf(x, \\delta) - f(x) &\\leq \\norm{\\delta_r}_2 \\cdot \\norm{W_{r-1}}_{\\text{op}} \\cdots \\norm{W_1}_{\\text{op}} \\\\\n&\\quad + \\norm{W_r}_{\\text{op}} \\cdot \\norm{\\delta_{r-1}}_2 \\cdot \\norm{W_{r - 2}}_{\\text{op}} \\cdots \\norm{W_1}_{\\text{op}} \\\\\n&\\quad + \\cdots  \\\\\n&\\quad + \\norm{W_r}_{\\text{op}} \\cdots \\norm{W_2}_{\\text{op}} \\cdot \\norm{\\delta_1}_2.\n\\end{split}\n\\end{equation}\nIgnoring minor details (e.g. dependency on $r$), we suppose that $y = 1$. Then, if $f(x) > 0$ and $f(x + \\delta) \\leq 0$, it must be the case that $\\norm{\\delta}_2 \\lesssim \\frac{|f(x)|}{\\prod_{i = 1}^r \\norm{W_i}_{\\text{op}}}$. 
This further implies that \n\\begin{align}\n    \\frac{m_f(x, y)}{y f(x)} \\gtrsim \\frac{1}{\\prod_{i = 1}^r \\norm{W_i}_{\\text{op}}}.\n\\end{align}\nRearranging, we conclude that we have obtained a tighter bound since the inverse margin $\\frac{1}{m_f(x, y)} \\lesssim \\frac{1}{yf(x)} \\cdot \\prod_{i = 1}^r \\norm{W_i}_{\\text{op}}$.\n\\end{remark}\n\n\\begin{remark}\nLater, we will show that SGD prefers Lipschitz solutions and Lipschitzness on data points.\\tnotelong{add a reference later}\n Implicitly, SGD seems to be maximizing the all-layer margin. Since the algorithm is (in a sense) minimizing Lipschitzness on a data point, this likely accounts for the empirically observed gap between the two bounds. \n\\end{remark}\n\n\\begin{remark}\nThe approach we have described here is also similar to other methods in the deep learning literature. Other authors have introduced a method known as SAM (a form of sharpness-aware regularization); this method applies a perturbation to the parameter $\\theta$ itself rather than to the intermediate hidden activations $h_i$. However, these two methods are related! If we consider the gradient of the (single-example) loss, $\\frac{\\partial \\ell}{\\partial W_i}$, it equals $\\frac{\\partial \\ell}{\\partial h_{i+1}} \\cdot h_i^\\top$. Note that the norm of the term on the left is bounded by the product of the norms of the two terms on the right; this observation relates the model's Lipschitzness with respect to the parameters to its Lipschitzness with respect to the hidden layer outputs.\n\\tnotelong{a reminder for Tengyu to have a stronger argument here}\n\\end{remark}\n\n\\begin{remark}\nFinally, we can prove a more general version of this result in which we do not need to study the minimum margin of the entire dataset, and instead consider the average margin. 
Using this approach, we can show that the test error is bounded above by \n$\\frac{1}{n} \\sqrt{\\frac{1}{n} \\sum_{i=1}^n \\frac{1}{m_f(x\\sp{i}, y\\sp{i})^2}}$ times the sum of complexities of each layer, plus a low-order term.\n\\end{remark}\n"
  },
  {
    "path": "tex/collection/06-dltheory.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{9}{Rafael Rafailov and Aidan Perreault}{Feb 10th, 2021}\n\nWe now turn to a high-level overview of deep learning theory. To begin, we outline a framework for classical machine learning theory, then discuss how the situation is different from deep learning theory.\n\n\\sec{Framework for classical machine learning theory}\nAt the risk of oversimplification, we can divide classical machine learning theory into three parts:\n\n\\begin{enumerate}\n\\item {\\bf Approximation theory} attempts to answer whether there is any choice of parameters $\\theta$ that achieves low population error. In other words, is the choice of hypothesis class good enough to approximate the ground truth function? Using notation from earlier in this course, the goal is to upper bound $L(\\theta^*) = \\min_{\\theta \\in \\Theta} L(\\theta).$\n    \n\\item {\\bf Statistical generalization} focuses on bounding the excess risk $L(\\hat{\\theta}) - L(\\theta^*)$. In Chapter \\ref{chap:uc} we obtained the following bound:\n    \n\\begin{equation}\nL(\\hat{\\theta})-L(\\theta^*)\\leq \\underbrace{L(\\hat{\\theta})-\\hat{L}(\\hat{\\theta})}_{\\text{generalization error}} + |L(\\theta^*)-\\hat{L}(\\theta^*)|.\n\\end{equation}\n    \nThe first term here is the generalization error, which usually has an upper bound of the form $R(\\theta)/\\sqrt{n}$, where $R(\\theta)$ is some complexity measure.\\footnote{In earlier chapters, we defined the complexity of a hypothesis class, not of a specific parameter value. 
To reconcile these two approaches, think of $R$ as a measure of complexity (such as a norm) that we can then use to define a hypothesis class $\\Theta$, i.e.~$\\Theta = \\{\\theta' : R(\\theta') \\le R(\\theta)\\}$.} This is a demonstration of \\href{https://en.wikipedia.org/wiki/Occam%27s_razor}{\\textit{Occam's Razor}}: the principle that simple (parsimonious, or low-complexity) explanations tend to generalize better. \n    \nThis statistical approach allows us to define a regularized loss  $\\hat{L}_{\\textup{reg}}(\\theta)=\\hat{L}(\\theta)+\\lambda R(\\theta)$. Minimizing this loss gives us a solution $\\hat{\\theta}_\\lambda$ which simultaneously has low training error and low complexity, which lets us bound both the training error and the generalization error. To summarize, in the classical setting, we can prove statements of the form\n    \n\\begin{equation}\\label{lec9:eqn:classical-guarantee}\n\\text{Any global minimizer }\\hat{\\theta}_\\lambda \\text{ of } \\hat{L}_{\\textup{reg}} \\textup{ has small excess risk }  L(\\hat{\\theta}_\\lambda) - L(\\theta^*)\\,.\n\\end{equation}\n\n\\item {\\bf Optimization} considers how to obtain the minimizer $\\hat\\theta$ or $\\hat{\\theta}_\\lambda$ computationally. This usually involves convex optimization: if $\\hat{L}$ or $\\hat{L}_{\\textup{reg}}$ is convex, then we have a polynomial-time algorithm to find the global minimum.\n\\end{enumerate}\n\nWhile there are many tradeoffs to consider between these three components (for example, we may be able to find a loss function for which optimization is easy, but generalization becomes worse), they are conceptually independent, and it is typically possible to study each area individually, then combine all three to get a result.\n\n\\sec{Deep learning theory and its differences}\nThe situation is more complex for deep learning theory. 
Two prominent differences are (a) the models are non-linear and the objective functions are non-convex, and (b) in deep learning, researchers have observed in many cases that more parameters typically help improve the performance, and many state-of-the-art models have many more parameters than the number of training data points. (b) is often referred to as ``over-parameterization\".\n\n\\begin{figure}[ht]\n    \\centerline{\\includegraphics[width=4in]{figures/overparameterization.png}}\n    \\caption[lec9:fig:overparam]{The black and red lines denote the training and test error, respectively, of a three layer neural network fit to and evaluated on MNIST \\cite{neyshabur2015norm}. While classical generalization theory predicts that beyond some threshold, the test error will increase with complexity (shown by the purple line), the true test error continues to decline with overparameterization. Though not depicted here, Neyshabur et al. observe similar test set error curves for a neural network fit to CIFAR-10.}\n    \\label{lec9:fig:overparam}\n\\end{figure}\n\nLet us consider the difference in each of the three components described for classical machine learning theory. \n\n\\begin{enumerate} \n\\item {\\bf Approximation theory:} Large neural net models are considered to be very expressive. That is, both the population loss $L(\\theta)$ and the finite sample loss $\\hat{L}(\\theta)$ can be made small. In fact, neural networks are \\textit{universal approximators}; see for example \\cite{hornik1991}. This can be a somewhat misleading statement as the definition of universal approximator allows for the size of the network to be impracticably large, but morally it seems to hold true in practice anyway.\n        \nThis expressivity is possible because neural networks are usually highly \\textit{over-parametrized}: they have many more parameters than samples. 
It is possible to prove that in this regime, the network can ``memorize'' the entire dataset and achieve approximately zero training error \\cite{arpit2017memorization}.\n    \n\\item {\\bf Statistical generalization:} Relatively weak regularization is used in practice. In many cases only weak $\\ell_2$ regularization is used, i.e.\n\\begin{equation}\n\\widehat{L}_{\\textup{reg}}(\\theta)=\\hat{L}(\\theta)+\\lambda\\|\\theta\\|_2^2.\n\\end{equation}\n    \nThe first interesting fact is that this regularized loss does not have a unique (approximate) global minimizer. This is due to overparametrization: there are so many degrees of freedom that there are many approximate global minimizers with approximately the same $\\ell_2$ norm.\n    \nHowever, it turns out that these global minimizers are not equally good: many models which achieve zero training error may have very bad test error (Figure~\\ref{lec9:fig:bad-global-min}). Take, for example, using stochastic gradient descent (SGD) to learn a model to classify the dataset CIFAR-10. In Figure~\\ref{lec9:fig:dl-implicitreg}, we show two instantiations of this: one starting with a large learning rate and slowly decreasing it, and one with a small learning rate throughout. Even though both instantiations result in approximately zero training error, the former leads to much better test performance. \n\nTherefore, the job of optimizers in deep learning is not just to find an arbitrary global minimum: we need to find the right global minimum. This contrasts sharply with \\eqref{lec9:eqn:classical-guarantee} from the classical setting, where achieving a global minimum leads to good guarantees on generalization error. 
This means that \\eqref{lec9:eqn:classical-guarantee} is simply not powerful enough to deal with deep learning, because it cannot distinguish between global minima with good test error and bad test error.\n\n\\begin{figure}[t]\n    \\centering\n    \\begin{subfigure}[t]{0.49\\textwidth}\n        \\centering\n        \\includegraphics[width=3in]{figures/bad global min .png}\n        \\caption{}\n        \\label{lec9:fig:bad-global-min}\n    \\end{subfigure}\n    \\hfill\n    \\begin{subfigure}[t]{0.49\\textwidth}\n        \\centering\n        \\hspace*{-1.8em}\n        \\includegraphics[width=3in]{figures/deep-learning-implicit-reg.png}\n        \\caption{}\n        \\label{lec9:fig:dl-implicitreg}\n    \\end{subfigure}\n    \\caption{We use dotted and solid lines to depict training and test error, respectively. Figure~\\ref{lec9:fig:bad-global-min} demonstrates how global minimizers for the training loss can have differing performance on test data. In Figure~\\ref{lec9:fig:dl-implicitreg}, blue and red colors differentiate between the model fit with a decaying learning rate and a small constant learning rate. Though both neural networks shown in this plot achieve 0 training error, the global minimizer obtained by a more sophisticated learning rate schedule appears to generalize better to unseen data.}\n    \\label{lec9:fig:global_min}\n\\end{figure}\n\n\\item {\\bf Optimization:} The discussion above means that optimization plays a significant role in generalization for deep learning. Different training algorithms/optimizers have different ``implicit biases'' or ``implicit regularization effect'', causing them to converge to different global minimizers. Understanding the implicit regularization effect of optimizers is thus a central goal of deep learning theory. The lack of understanding implicit regularization hinders the development of fast optimizers---it is impossible to design a good optimization algorithm without also considering its impact on generalization. 
In fact, many algorithms for non-convex optimization have been proposed that work well for minimizing training loss, but because their implicit bias is different, they lead to worse test performance and are therefore not too useful.\n    \nOften these implicit biases or implicit regularization effect can be characterized in the form of showing the optimizers prefer $\\hat\\theta$ of certain low complexity among all the global minimizers. The deep learning analog of \\eqref{lec9:eqn:classical-guarantee} often consists of two statements: (a) the optimizer implicitly prefers low complexity solution according to complexity measure $R(\\cdot)$ by converging to a global minimizer $\\hat{\\theta}$ with low complexity $R(\\hat{\\theta})$, and (b) low complexity solutions generalize. This means that we end up doing more work on the optimization front---the optimizer needs to ensure both a small training loss and a low complexity solution. On the other hand, proving generalization bounds (statement (b)) works similarly to the classical setting once we understand how our optimizer finds a low-complexity solution.\n    \n\\end{enumerate}\n\n%To explain the success of deep learning, we will cover three tasks in the next two chapters\\todo{specify chapter number}:\nWe summarize some of the results that we will present in the future chapters. \\ttodo{add chapter number later}\n\n\\begin{enumerate}\n    \\item \\textbf{Optimization.} First, we will prove that under certain data distribution assumption, optimizers such as stochastic gradient decent can converge to an approximate global minimum, even though the objective function is non-convex. Results of this form can be shown on matrix factorization problems and linearized neural networks, even without over-parameterization, but so far are limited to these simple models.  
Second, we will discuss a recent approach, called neural tangent kernels (NTK), which proves that for almost any neural network, with overparameterization, gradient descent can converge to a global minimum, \\textit{under specific hyperparameter settings} (e.g., specific learning rate and initialization). However, it turns out that these specific hyperparameter settings \\textit{do not} provide sufficient implicit regularization effect for the learned models to generalize. (In other words, the optimizer only returns a global minimizer, but not a global minimizer that generalizes well.)\n    \n    \\item \\textbf{Implicit regularization effect.} This involves showing that the solution $\\hat{\\theta}$ obtained by a particular optimizer has low complexity $R(\\hat{\\theta})\\leq C$ according to some complexity measure $R(\\cdot)$ (which depends on the choice of optimizers). It's believed and empirically observed that any changes or tricks in the optimizers (e.g., learning rate schedule, batch size, initialization, batchnorm) could introduce additional implicit regularization effects. We will only demonstrate these on some special cases of models (e.g. logistic regression, matrix factorization) and optimizers (e.g. gradient descent, label noise in SGD, dropout, learning rate). Recently, there are also more general results with label noise SGD~\\citep{blanc2019implicit,damian2021label}. \n    \n    \\item \\textbf{Generalization bounds.} This part involves showing that for all $\\theta$ such that $R(\\theta)\\leq C$ with $\\hat{L}(\\theta)\\approx 0$, we have $L(\\theta)$ is small. That is, we show that low-complexity solutions to the empirical risk problem generalize well. We will be working with more fine-grained complexity measures (e.g., those complexity measures that are similar to the complexity measure in part 2 above that are preferred by the optimizer). Here, many tools we developed in classical machine learning can still apply.\n\\end{enumerate}"
  },
  {
    "path": "tex/collection/07-01-nonconvex.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{10}{Kevin Han and Han Wu}{Feb 17th, 2021}\n\nIn the previous chapter, we outlined conceptual topics in deep learning theory and how the situation was different from classical machine learning theory. In particular, we described \\textit{approximation theory}, \\textit{statistical generalization} and \\textit{optimization}. In this chapter, we will focus on optimization theory in deep learning. We will introduce some basics about optimization (Section~\\ref{sec:optim_convergence}), discuss how we can make the notion ``all local minima are global minima'' rigorous, and walk through two examples where this is the case (Section~\\ref{sec:two_optim_examples}). Finally, we introduce the neural tangent kernel approach which allows us to characterize of the loss of general neural networks near a specific initialization (or under specific parameterization).\n\n\\sec{Optimization landscape} \\label{sec:optim_intro}\n\nThe big question that we have in mind is the following: many existing optimizers are designed for optimizing convex functions. \\textbf{Why do they still work well empirically for non-convex functions?} We note that it is not true that these optimizers always work well with non-convex functions: there are still some very hard cases that give trouble (e.g. very deep feed-forward networks are still hard to fit because of issues like vanishing and exploding gradients). One possible reason is that the non-convex functions that we are minimizing in deep learning usually have some nice properties: see Figure \\ref{lec10:fig:optimization} for an illustration.\n\n\\begin{figure}[ht!]\n    \\centering\n    \\includegraphics[scale = 0.5]{figures/landscape.png}\n    \\caption{Classification of different functions for optimization. 
The functions we optimize in deep learning seem to fall mostly within the middle cloud.}\n    \\label{lec10:fig:optimization}\n\\end{figure}\n\n\\begin{figure}[ht!]\n    \\centering\n    \\includegraphics[scale = 0.3]{figures/gradient_descent.png}\n    \\caption{Illustration of how gradient descent does not always find the global minimum. In the picture, gradient descent initialized at the blue point only makes it to the local minimum at the red point: it does not find the global minimum at the black point.}\n    \\label{lec10:fig:gradient_descent}\n\\end{figure}\nBefore diving into details, we first highlight some observations that will be important to keep in mind when discussing optimization in deep learning. Suppose $g(\\theta)$ is the loss function. Recall that the \\textit{gradient descent (GD)} algorithm would do the following:\n\\begin{enumerate}\n    \\item $\\theta_0 \\defeq$ initialization\n    \\item $\\theta_{t + 1} = \\theta_t - \\eta\\nabla g(\\theta_t)$, where $\\eta$ is the step size.\n\\end{enumerate}\nHere are some observations:\n\\begin{enumerate}\n    \\item[] \\textit{Observation 1}: Gradient descent can find a global minimum for convex functions\\footnote{A more precise version of this claim is that gradient descent can find a point that has function value arbitrarily close to the global minimal value. 
} but cannot always find the global minimum for any general continuous functions (see Figure \\ref{lec10:fig:gradient_descent} for an illustration).\n    \\item[] \\textit{Observation 2}: Finding the global minimum of general non-convex functions is NP-hard.\n%    \\item[] \\textit{Observation 3}: Gradient descent .\n    \\item[] \\textit{Observation 3}: The objective function in deep learning is non-convex, but empirically gradient descent/stochastic gradient descent typically finds an approximate global minimum of the loss function in deep learning.\n\\end{enumerate}\n\nThese observations motivate the following two-step plan:\n\n\\begin{enumerate}\n    \\item Identify a large set of functions that stochastic gradient descent/gradient descent can solve.\n    \\item Prove that some of the loss functions in machine learning problems belong to this set. (Most of the effort will be spent here.)\n\\end{enumerate}\n\\textbf{Basic idea:} Gradient descent can find local minimum $+$ all local minima of $f$ are also global $\\Rightarrow$ Gradient descent can find global minima.\n\n\\sec{Efficient convergence to (approximate) local minima} \\label{sec:optim_convergence}\nLet $f$ be a twice-differentiable function. We start with the following definition:\n\\begin{definition} [Local minimum of a function]\nWe say that $x$ is a \\textit{local minimum} of a function $f$ if there exists an open neighborhood $N$ around $x$ such that in $N$, the function values are at least $f(x)$.\n\\end{definition}\n\nNote that if $x$ is a local minimum of $f$, then $\\nabla f(x) = 0$ and $\\nabla^2 f(x) \\succeq 0$. However, as the next example shows, the reverse is not true. When $\\nabla f(x) = 0$ and $\\nabla^2 f(x)$ vanishes in some direction (i.e. merely positive semi-definite instead of being strictly positive definite), higher-order derivatives start to matter.\n\n\\begin{example}\n\\label{lec10:ex:counterexample}\nConsider the function $f(x_1, x_2) = x_1^2 + x_2^3$. 
$(x_1, x_2) = (0, 0)$ satisfies $\\nabla f(x) = 0$ and $\\nabla^2 f(x)|_{(x_1, x_2) = (0, 0)} = \\begin{bmatrix} 2 & 0 \\\\\n0 & 0\\end{bmatrix} \\succeq 0$. However, if we move in the negative direction of $x_2$, we can decrease the function value. Hence, this example shows why $\\nabla f(x) = 0$ and $\\nabla^2 f(x) \\succeq 0$ does not imply that $x$ is a local minimum.\n\\end{example}\n\nIt is generally not easy to verify if a point is a local minimum. In fact, we have the following theorem regarding the computational tractability:\n\\begin{theorem}\n\\label{lec10:thm:np_hard}\nIt is NP-hard to check whether a point is a local minimum or not \\cite{murty1987}. In addition, Hillar and Lim \\cite{hillar2013} show that a degree four polynomial is NP-hard to optimize.\n\\end{theorem}\n\n\\subsec{Strict-saddle condition}\nTheorem~\\ref{lec10:thm:np_hard} forces us to consider more specific types of functions to be able to obtain computational tractability. To this end, we define the following \\textit{strict-saddle condition}:\n\n\\begin{definition} [Strict-saddle condition \\cite{lee2016}]\nFor positive $\\alpha, \\beta, \\gamma$, we say that $f: \\R^d \\mapsto \\R$ is \\textit{$(\\alpha, \\beta, \\gamma)$-strict-saddle} if every $x \\in \\bbR^d$ satisfies one of the following:\n\\begin{enumerate}\n    \\item $\\|\\nabla f(x)\\|_2 \\geq \\alpha$.\n    \\item $\\lambda_{\\min}(\\nabla^2 f(x)) \\leq -\\beta$.\n    \\item $x$ is $\\gamma$-close to a local minimum $x^*$ in Euclidean distance, i.e. $\\|x - x^*\\|_2 \\leq \\gamma$.\n\\end{enumerate}\n\\end{definition}\n\nIntuitively speaking, this definition is saying if a point has zero gradient and positive semi-definite Hessian, it must be close to a local minimum, i.e. 
there is no pathological case like Example \\ref{lec10:ex:counterexample}.\n\nWe have the following theorem for functions that satisfy strict-saddle condition:\n\n\\begin{theorem} [Informally stated]\nIf $f$ is $(\\alpha, \\beta, \\gamma)$-strict-saddle for some positive $\\alpha, \\beta, \\gamma$, then many optimizers (e.g. gradient descent, stochastic gradient descent, cubic regularization) can converge to a local minimum with $\\epsilon$-error in Euclidean distance in time $poly \\left(d, \\frac{1}{\\alpha}, \\frac{1}{\\beta}, \\frac{1}{\\gamma}, \\frac{1}{\\epsilon}\\right)$.\n\\end{theorem}\n\nTherefore, if all local minima are global minima and the function satisfies the strict-saddle condition, then optimizers can converge to a global minimum with $\\epsilon$-error in polynomial time. (See Figure \\ref{lec10:fig:strict-saddle} for an example of a function whose local minima are all global minima.) The next theorem expresses this concretely by being explicit about the strict-saddle condition:\n\n\\begin{theorem}\nSuppose $f$ is a function that satisfies the following condition: $\\exists  \\ \\epsilon_0, \\tau_0, c > 0$ such that if $x \\in \\bbR^d$ satisfies $\\|\\nabla f(x)\\|_2 \\leq \\epsilon < \\epsilon_0$ and $\\nabla^2 f(x) \\succeq -\\tau_0I$, then $x$ is $\\epsilon^c$-close to a global minimum of $f$. Then many optimizers can converge to a global minimum of $f$ up to $\\delta$-error in Euclidean distance in time $poly\\left(\\frac{1}{\\delta}, \\frac{1}{\\tau_0}, d \\right)$.\n\\end{theorem}\n\n\\begin{figure}[ht!]\n    \\centering\n    \\includegraphics[scale = 0.5]{figures/localmin.png}\n    \\caption{A two-dimensional function with the property that all local minima are global minima. 
It also satisfies the strict-saddle condition because all the saddle points have a strictly negative curvature in some direction.}\n    \\label{lec10:fig:strict-saddle}\n\\end{figure}\n\n\\sec{All local minima are global minima: two examples} \\label{sec:two_optim_examples}\nSo far, we have focused on general results. Next, we give two concrete examples that have the property that all local minima are global minima: (i) principal components analysis (PCA)/matrix factorization/linearized neural nets, and (ii) matrix completion. \\tnotelong{need some quick literature survey; Tengyu will add}%There is a rich literature on this topic and \n\n\\subsec{Principal components analysis (PCA)}\nLet matrix $M \\in \\bbR^{d \\times d}$ be symmetric and positive semi-definite. Consider the problem of finding the best rank-1 approximation of the matrix $M$. The objective function here is non-convex:\n\\begin{equation}\n    \\min_{x \\in \\bbR^d}g(x) \\triangleq \\frac{1}{2}\\|M - xx^\\top \\|_F^2.\n\\end{equation}\n\n\\begin{theorem}\nAll local minima of $g$ are global minima (even though $g$ is non-convex).\n\\end{theorem}\n\n\\begin{remark}\nFor $d = 1$, $g(x) = \\frac{1}{2}(m - x^2)^2$ for some constant $m$. Figure~\\ref{lec10:fig:pca_objective} below shows such an example. We can see that all local minima are indeed global minima.\n\\end{remark}\n\n\\begin{figure}[ht!]\n    \\centering\n    \\includegraphics[scale = 0.4]{figures/pca.png}\n    \\caption{Objective function for principal components analysis (PCA) when $d = 1$.}\n    \\label{lec10:fig:pca_objective}\n\\end{figure}\n\n\\begin{proof}\n\n\\textit{Step 1: Show that all stationary points must be eigenvectors.} From HW0, we know that $\\nabla g(x) = -(M - xx^\\top )x$, hence\n\\begin{equation}\\label{lec10:eqn:pca-firstorder}\n\\nabla g(x) = 0 \\implies Mx = \\|x\\|_2^2\\cdot x,\n\\end{equation}\nwhich implies that $x$ is an eigenvector of $M$ with eigenvalue $\\|x\\|_2^2$. 
From the Eckart–Young–Mirsky theorem we know the global minimum (i.e. the best rank-1 approximation) is the eigenvector with the largest eigenvalue.\n\n\\textit{Step 2: Show that all local minima must be eigenvectors of the largest eigenvalue.} We use the second order condition for this. For $x$ to be a local minimum we need $\\nabla^2g(x) \\succeq 0$, which means for any $v \\in  \\bbR^d$, \n\\begin{equation}\n\\langle v, \\nabla^2g(x) v \\rangle \\geq 0.\n\\end{equation}\nTo compute $\\langle v, \\nabla^2g(x) v \\rangle$, we use the following trick: expand $g(x + v)$ into $g(x) + \\text{linear term in } v + \\text{quadratic term in } v$, then the quadratic term will be $\\frac{1}{2}\\langle v, \\nabla^2g(x) v \\rangle$ (see HW0 Problem 2d for an example). Using this trick, we get \n\n\\begin{align}\n    g(x+v) &= \\frac{1}{2}\\|M - (x+v)(x+v)^\\top \\|_F^2 \\\\\n           &= \\frac{1}{2}\\|M-xx^\\top\\|_F^2 - \\langle M-xx^\\top , xv^\\top + vx^\\top\\rangle + \\frac{1}{2}\\langle xv^\\top + vx^\\top , xv^\\top + vx^\\top \\rangle \\nonumber \\\\\n          & \\quad -\\langle M-xx^\\top, vv^\\top\\rangle + \\text{higher order terms in }v.\n\\end{align}\nHence, we have \n\\begin{align}\n    \\frac{1}{2}\\langle v, \\nabla^2g(x) v \\rangle & = \\frac{1}{2}\\langle xv^\\top + vx^\\top, xv^\\top + vx^\\top \\rangle\n          -\\langle M-xx^\\top, vv^\\top\\rangle  \\\\\n          &= \\langle x, v\\rangle^2 + \\|x\\|_2^2\\|v\\|_2^2 - v^\\top Mv + \\langle x, v\\rangle^2 \\\\\n          & = 2\\langle x, v\\rangle^2 + \\|x\\|_2^2\\|v\\|_2^2 - v^\\top Mv.\n\\end{align}\n\nPicking $v = v_1$, the unit eigenvector with the largest eigenvalue (denoted $\\lambda_1$), for $x$ to be a local minimum it must satisfy \n\\begin{equation}\n\\langle v_1, \\nabla^2g(x) v_1 \\rangle = 2\\langle x, v_1 \\rangle^2 - v_1^\\top Mv_1 + \\|x\\|_2^2 \\geq 0.\n\\end{equation}\n\nNote that by \\eqref{lec10:eqn:pca-firstorder}, all our candidates for local minima are eigenvectors of $M$ so 
naturally we have two cases:\n\\begin{itemize}\n\\item \\textit{Case 1: $x$ has eigenvalue $\\lambda_1$}. Then x is the global minimum (by the Eckart–Young–Mirsky theorem).\n\\item \\textit{Case 2: $x$ has eigenvalue $\\lambda < \\lambda_1$}. Then we know $x$ and $v_1$ are orthogonal (eigenvectors with different eigenvalues are always orthogonal), hence \n\\begin{equation}\n2\\langle x, v_1 \\rangle^2 - v_1^\\top Mv_1 + \\|x\\|_2^2 = 0  -\\lambda_1 + \\lambda \\geq 0,\n\\end{equation}\nwhich implies $\\lambda \\geq \\lambda_1$, a contradiction. \n\\end{itemize}\n\nIn summary, if $x$ is a stationary point and $x$ is not a global minimum, then moving in the direction of $v_1$ would lead to second-order improvement and $x$ cannot be a local minimum. \n\\end{proof}\n"
  },
  {
    "path": "tex/collection/07-02-nonconvex.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{11}{Andrew Wang}{Feb 22nd, 2021}\n\n\\subsec{Matrix Completion \\texorpdfstring{\\cite{ge2016}}{[Ge et al., 2016]}}\nWe consider rank-1 matrix completion for simplicity. Let $M = zz^\\top$ be a rank-1 symmetric and positive semi-definite matrix for some $z\\in \\bbR^d$. Given random entries of $M$, our goal is to recover the rest of entries. Formally, we have the following definitions:\n\n\\begin{definition}\nSuppose $M\\in \\bbR^{d\\times d}$ and $\\Omega \\subseteq [d] \\times [d]$, we define $P_{\\Omega}(M)$ to be the matrix obtained by zeroing out every entry outside $\\Omega$. \n\\end{definition}\n\n\\begin{definition}[Matrix Completion]\nSuppose $M\\in \\bbR^{d\\times d}$ and every entry of $M$ is included in $\\Omega$ with probability $p$. The \\textit{matrix completion task} is to recover $M$ (with respect to some loss functions) given the observation $P_{\\Omega}(M)$.\n\\end{definition}\n\nA nice real world example of matrix completion is when we have a matrix describing the user ratings for each item. We only observe a small portion of the entries as each customer only buys a small subset of the items. A good matrix completion algorithm is indispensable for a recommendation engine. \n\n\\begin{remark}\nWe need $d$ parameters to describe a rank-1 matrix $M$ and the number of observations is roughly $pd^2$. Thus, for identifiability we need to work in the regime where $pd^2 > d$, i.e. $p \\gg \\frac{1}{d}$. \n\\end{remark}\n\nWe define our non-convex loss functions to be \n\\begin{align}\n    \\min_{x \\in \\bbR^d} f(x) & \\triangleq \\frac{1}{2}\\sum_{(i,j)\\in \\Omega}(M_{ij}-x_ix_j)^2 \\\\\n     & = \\frac{1}{2}\\|P_{\\Omega}(M-xx^\\top)\\|_F^2.\n\\end{align}\n\nTo really solve our problem we need some regularity condition on the ground truth vector $z$ (recall $M = zz^\\top$). 
Assume $p = \\dfrac{\\textrm{poly}(\\mu, \\log d)}{d\\epsilon^2}$ for some sufficiently small constant $\\epsilon$ and assume $z$ is incoherent.
However, the proofs in this section show that the landscapes of $f$ (the matrix completion objective) and $g$ (the PCA objective) have similar properties, which is established by proving more advanced concentration inequalities.
where we have used the fact that $\\norm{P_\\Omega(A)}_F^2 = \\langle P_\\Omega(A), P_\\Omega(A)\\rangle = \\langle P_\\Omega(A), A\\rangle$.
   & \\implies \\langle x, (zz^\\top-xx^\\top)x \\rangle = 0 & (\\because \\nabla g(x) = -(M - xx^\\top)x) \\\\
If $x$ satisfies $\\nabla f(x) = 0$, then $\\langle x,z \\rangle^2 \\geq \\norm{x}_2^4 - \\epsilon$ with high probability.\n\\label{inner_prod_norm_f}\n\\end{lemma}\n\n\\begin{proof}\n\\begin{align}\n    \\nabla f(x) = 0 &\\implies \\langle x, \\nabla f(x) \\rangle = 0 \\\\\n    & \\implies \\langle x, \\nabla g(x) \\rangle \\approx \\langle x, \\nabla f(x) \\rangle/p \\pm \\epsilon & \\text{(by Lemma \\ref{lec11:lem:concentration_lemma})} \\\\\n   & \\implies |\\langle x, (zz^\\top-xx^\\top)x \\rangle| \\leq \\epsilon & \\text{w.h.p.} \\\\\n   & \\implies \\langle x,z \\rangle^2 \\geq \\norm{x}_2^4 - \\epsilon & \\text{w.h.p.}\n\\end{align}\n\\end{proof}\n\n\\begin{lemma}[Bound norm for $g$]\\label{lec11:lem:bound-g}\n    If $\\nabla^2 g(x) \\succeq 0$, then $\\norm{x}_2^2 \\geq 1/3$.\n\\end{lemma}\n\n\\begin{proof}\n\\begin{align}\n    \\nabla^2 g(x) \\succeq 0\n    &\\implies \\langle z, \\nabla^2 g(x)z\\rangle \\geq 0 \\\\\n    &\\implies \\norm{zx^\\top + xz^\\top}_F^2 - 2z^\\top(zz^\\top-xx^\\top)z \\geq 0 \\\\\n    &\\implies 2 \\norm{x}^2_2 + 2 \\inprod{x, z}^2 - 2 + 2 \\inprod{x, z}^2 \\geq 0 &\\text{(cyclic trace prop.)} \\\\\n    &\\implies 3\\norm{x}_2^2 = \\norm{x}_2^2 + 2\\norm{x}_2^2 \\geq \\norm{x}_2^2 + 2\\langle x,z \\rangle^2 \\geq 1 &\\text{(by Cauchy-Schwarz)} \\\\\n    &\\implies \\norm{x}_2^2 \\geq 1/3.\n\\end{align}\n\\end{proof}\n\n\\begin{lemma}[Bound norm for $f$]\\label{lec11:lem:bound-f}\n    Suppose $\\norm{x}_\\infty \\leq \\mu / \\sqrt{d}$. If $\\nabla^2 f(x) \\succeq 0$, then $\\norm{x}_2^2 \\geq 1/3 - \\epsilon/3$ with high probability.\n\\end{lemma}\n\\begin{proof}\n\\begin{align}\n    \\nabla^2 f(x) \\succeq 0\n    &\\implies \\langle z, \\nabla^2 f(x)z \\rangle \\geq 0 \\\\\n    &\\implies \\langle z, \\nabla^2g(x)z \\rangle \\geq -\\epsilon & \\text{w.h.p. 
Since $|\\langle x,z \\rangle| = \\norm{x}_2^2 \\geq 1/3 \\neq 0$ (by Lemmas~\\ref{lec11:lem:inner-g} and~\\ref{lec11:lem:bound-g})
\\nabla f(x) = 0 &\\implies \\langle z, \\nabla f(x) \\rangle = 0 \\\\
It can often be fruitful to seek a proof in the simplified problem that makes use of a restricted set of tools that could generalize to the harder problem. Here we limited ourselves to only using $\\langle v, \\nabla g(x)\\rangle$ and $\\langle v, \\nabla^2 g(x) v\\rangle$ in the easy case; these quantities could then be easily converted to analogous quantities in $f$ via the concentration lemma (Lemma~\\ref{lec11:lem:concentration_lemma}).\n\n\\subsec{Other problems where all local minima are global minima}\nWe have now demonstrated that two classes of machine learning problems, rank-1 PCA and rank-1 matrix completion, have no spurious local minima and are thus amenable to being solvable by gradient descent methods. We now outline some major classes of problems for which it is known that there are no spurious local minima.\n\n\\begin{itemize}\n    \\item Principal component analysis (covered in previous lecture).\n    \\item Matrix completion (and other matrix factorization problems). On a related note, it has also been shown that linearized neural networks of the form $y = W_1W_2x$, where $W_1$ and $W_2$ are optimized separately, have no spurious local minima \\cite{baldi1989neural}. It should be noted that linearized neural networks are not very useful in practice since the advantage of optimizing $W_1$ and $W_2$ separately versus optimizing a single $W=W_1W_2$ is not clear.\n    \\item Tensor decomposition. The problem is as follows:\n    \\begin{align}\n        \\text{maximize }\\quad \\sum_{i=1}^d \\sum_{j=1}^d \\sum_{k=1}^d \\sum_{l=1}^d T_{ijkl} x_ix_jx_kx_l \\quad \\text{such that } \\quad \\norm{x}_2 = 1.\n    \\end{align}\n    Additionally, constraints are imposed on the tensor $T$ to make the problem tractable. For example, one condition is that $T$ must be a low-rank tensor with orthonormal components \\cite{ge2015}.\n\\end{itemize}"
  },
  {
    "path": "tex/collection/07-03-nonconvex.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{12}{Rohan Taori and Jonathan Lee}{Feb 24nd, 2021}\n\n\\sec{Neural tangent kernel (NTK) approach}\nIn general, the loss landscapes of neural networks (with nonlinearities) is currently not as well understood. We now introduce the \\textit{neural tangent kernel} which allows us to make some characterizations of the loss near a given neural network initialization.\n\n\\begin{figure}\n    \\centering\n    \\includegraphics[width=2.5in]{figures/ntk-loss-landscape.png}\n    \\caption{The training loss landscape around a given parameter initialization $\\theta^0$. We hope that the neighborhood around $\\theta^0$ contains a local minimum that is close to the global minimum.}\n    \\label{lec11:fig:ntk_loss_landscape}\n\\end{figure}\n\nThe key insight of the NTK approach is that if we take an appropriate random parameter initialization $\\theta^0$ (which we will choose later), we can identify a special neighborhood of $\\theta^0$, denoted $B(\\theta^0)$, where ``everything is nice''. That is, the function is convex in $B(\\theta^0)$, there is a global minimum in the $B(\\theta^0)$, and the algorithm starting at $\\theta^0$ will converge to that global minimum. (See Figure~\\ref{lec11:fig:ntk_loss_landscape} for an illustration.)\n\nTake a random initialization $\\theta = \\theta^0$ and Taylor expand the loss around $\\theta^0$ w.r.t. $\\theta$:\n\\begin{align}\nf_\\theta(x) &= \\underbrace{f_{\\theta^0}(x) + \\langle \\nabla_\\theta f_{\\theta^0}(x), \\theta - \\theta^0 \\rangle}_{g_\\theta(x)} + O((\\theta-\\theta^0)^2).\n\\end{align}\n\nIn other words, we take the tangent plane to $f_\\theta$ at $x$ (a linear approximation). We observe that $g_\\theta$ is an affine function of $\\theta$. 
(For convenience, we will sometimes choose to design $\\theta^0$ such that $f_{\\theta^0}(x) = 0$ so that $g_\\theta$ is linear in $\\Delta \\theta$.
In particular, two questions have to be answered:\n\n\\begin{enumerate}\n    \\item \\textit{Why does there exist a small neighborhood $B(\\theta^0)$ such that there exists a global minimum in $B(\\theta^0)$?} This is more surprising, and it involves proper design of $\\theta^0$. We will spend the rest of this chapter answering this question mathematically.\n    \\item \\textit{Does gradient descent on the original loss with respect to $f_\\theta(x)$ stay in the neighborhood $B(\\theta^0)$?} The answer to this question is ``yes''. However, more technical machinery is required to prove it. We skip discussion of this as it is the less surprising claim.\n\\end{enumerate}\n\n\\subsec{The two-layer network case}\n\nWe demonstrate the NTK approach for the two-layer network setup. For $i \\in [m]$, let $a_i \\in \\R$ be scalars and let $w_i \\in \\R^d$ be vectors. Let $\\sigma: \\R \\to \\R$ be the ReLU activation function defined as $\\sigma(t) = \\max\\{ t, 0\\}$. Suppose we have the following two-layer network:\n\\begin{equation}\\label{lec12:eqn:network}\n\\hat{y} = f_\\theta(x) = \\frac{1}{\\sqrt{m}} \\sum_{i=1}^m a_i \\sigma (w_i^\\top x),\n\\end{equation}\nfor some input $x$.\n\nOur weight matrix is $W = \\begin{bmatrix} w_1^\\top \\\\ \\vdots \\\\ w_m^\\top \\end{bmatrix} \\in \\R^{m \\times d}$. We initialize $W$ randomly using $W_{ij} \\stackrel{i.i.d.}{\\sim} \\mathcal{N}(0, 1)$ for all $i$ and $j$. We initialize $a_i \\in \\{\\pm 1\\}$ and assume that the $a_i$'s are fixed after initialization, i.e. not updated during training. (We fix $a_i$ for simplicity: the results still hold when we are allowed to optimize $a_i$ in training.) We also assume that $x$ has norm on the order of $1$, and that the true label $y$ is on the order of $1$.\n\nIn our analysis, we will assume we have sufficiently large $m = \\poly(n, d)$. In other words, the width of the network $m$ is sufficiently large such that $\\poly(n, d)$ factors are not important in the analysis. 
For simplicity, we write $O_{d,n}(1)$ to hide polynomial dependencies on $d$ and $n$. Thus $O_{d,n}(m^c) = m^c \\cdot \\poly(n, d)$.\n\n\\textbf{Why do we need the scaling factor $1 / \\sqrt{m}$ in Equation~\\eqref{lec12:eqn:network}?} It is included to prevent the model outputs from blowing up when we increase the number of neurons $m$. Note that $\\sigma (w_i^\\top x) \\approx O_{d,n}(1)$ since $w_i^\\top \\approx O_{d,n}(1)$ and $x \\approx O_{d,n}(1)$. Since $a_i \\in \\{ \\pm 1\\}$ for all $i$, this implies that $ \\sum_{i=1}^m a_i \\sigma (w_i^\\top x) \\approx O_{d,n}(\\sqrt{m})$. Thus, the scaling factor is needed to obtain $\\hat{y} = f_\\theta(x) = O_{d,n}(1)$.\n\nNext, we introduce some notation that will be helpful for our analysis. Let $\\Delta \\theta = \\theta - \\thetazero$. Suppose we have $n$ examples $x^{(1)},...,x^{(n)}$ and labels $y^{(1)},...,y^{(n)}$. Let $\\vec{y} = \\begin{bmatrix} y^{(1)} & \\dots & y^{(n)} \\end{bmatrix}^\\top$. Let\n\\begin{equation}\n\\vec{y'} = \\begin{bmatrix} y^{(1)}-f_{\\thetazero}(x^{(1)}) \\\\ \\vdots \\\\ y^{(n)}-f_{\\thetazero}(x^{(n)}) \\end{bmatrix}\n\\end{equation}\nbe the transformed labels where we subsume the affine term in the label, allowing us to treat this as a purely linear model without loss of generality. Note that $\\theta = \\text{vec}(W) \\in \\R^{dm}$ is the vectorized version of the weights. Let $\\Phi^{(i)} = \\nabla_\\theta f_{\\thetazero}(x^{(i)}) \\in R^{dm}$ be the feature associated with the $i$th example, and let $\\Phi$ denote the collection of the features across the examples:\n\\begin{equation}\n\\Phi = \\begin{bmatrix} \\Phi^{(1)^\\top} \\\\ \\vdots \\\\ \\Phi^{(n)^\\top} \\end{bmatrix} \\in R^{n \\times dm}.\n\\end{equation}\n\nRecall that we defined\n\\begin{align}\n    g_\\theta(x) = f_{\\thetazero}(x) + \\langle\\nabla_\\theta f_{\\thetazero}, \\theta - \\thetazero\\rangle,\n\\end{align}\nwhich is the linear approximation of $f_\\theta(x)$ at $\\thetazero$. 
Since $n \\ll dm$, we have an underdetermined linear system.
The first equality is because $\\norm{ab^\\top}_F = \\norm{a}_2 \\cdot \\norm{b}_2$ for any vectors $a$ and $b$
    \\| \\thetazero \\|_2 &= \\| W^0 \\|_F \\\\
The intuition is that one only needs to move a little to reach the solution. Relative to the norm of the initialization, the neighborhood size is shrinking.\n\n\\begin{remark}\nWhile we do not cover this in detail, the main takeaway on the optimization front is that the problem of fitting $g_\\theta(x)$ is a standard strongly convex optimization problem, which enjoys the geometric rate of convergence.\n\\end{remark}\n\n\n\n"
  },
  {
    "path": "tex/collection/07-03-ntk.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{13}{Justin Young and Josh Cho}{November 1, 2021}\n\n\\sec{The Neural Tangent Kernel (NTK) Approach} \\label{sec:ntk_approach}\n\nIn the previous sections, we studied non-convex optimization problems in which all local minima are global. Selecting the parameters of a deep neural network is another commonly encountered non-convex optimization problem, but it is unrealistic to expect that all local minima will also be global minima in this setting. Here we consider a particular objective for which we can identify particular regions of the input space in which all local minima are also global minima. We can show that this objective corresponds to certain types of deep neural networks, but this analysis remains limited. For further reading about this approach to studying neural network optimization, see \\cite{liang2018adding} and \\cite{du2019width}.\n\n\\tnotelong{Tengyu should  double check this later}\n\nTo be more formal, we take an appropriate parameter initialization $\\theta^0$ such that in a neighborhood around it, which we denote by $B(\\theta^0)$, the loss function is convex and its global minimum is attained. Figure \\ref{lec13:fig:NTKapproach} depicts a function and region for which this condition holds. \n\n\\begin{figure}[ht]\n    \\centering\n    \\includegraphics[scale=0.3]{figures/ntk_initialization.png}\n    \\caption{Training loss around an initialized $\\theta^0$. 
Here, $w_i$ can be randomly chosen so long as $w_i$ is the same in both halves, and $a_i$ can be randomly chosen as long as the other half is initialized with $-a_i$.
We thus let $\\phi(x) \\triangleq \\nabla_\\theta f_{\\theta^0}(x)$, which motivates the following definition: \n\n\\begin{definition}[Neural Tangent Kernel]\nFor simplicity, we assume $f_{\\theta^0}(x)=0$ so that $y=y'$. The \\textit{neural tangent kernel} $K$ is given by  \n\\begin{align} \n    K(x,x') &= \\inprod{\\phi(x), \\phi(x')} \\\\\n    &= \\inprod{\\nabla_\\theta f_{\\theta^0}(x), \\nabla_\\theta f_{\\theta^0}(x')}.\n\\end{align} \n\\end{definition}\nHere, the feature $\\nabla_\\theta f_{\\theta^0}(x)$ is precisely the gradient of the neural network. This is where the ``tangent'' in Neural Tangent Kernel comes from. \n\nInstead of $f_\\theta(x)$, suppose we use the approximation $g_\\theta(x)$, which we recall is linear in $\\theta$. The kernel method gives a linear model on top of features. When $\\theta \\approx {\\theta^0}$, given a convex loss function $\\ell$, we have \n\\begin{align} \n    \\underbrace{\\ell (f_\\theta(x),y)}_{\\substack{\\text{not} \\\\ \\text{necessarily} \\\\ \\text{convex}}} \\approx \\underbrace{\\ell(g_\\theta(x),y)}_{\\text{convex}}.\n\\end{align} \nConvexity of the RHS follows from the fact that a convex function, $\\ell$, composed with a linear function, $g_\\theta$, is still convex. \n\nA natural question to ask is: how valid is this approximation? We devote the rest of this chapter to answering this question. First, we define the empirical loss: \n\\begin{align}\n    \\hat{L}(f_\\theta) & = \\frac{1}{n}\\sum_{i=1}^n \\ell \\left( f_\\theta\\big( x^{(i)} \\big) , y^{(i)} \\right) \\\\ \n    \\hat{L}(g_\\theta) & = \\frac{1}{n}\\sum_{i=1}^n \\ell \\left( g_\\theta\\big( x^{(i)} \\big) , y^{(i)} \\right).\n\\end{align} \nThe key idea is that the Taylor approximation works for certain cases. We defer a more complete enumeration of these cases to a later section of this monograph. Here we outline the high-level approach we take to validate and use this Taylor expansion. 
Namely, we will show that there exists a neighborhood around $\\theta^0$ called $B(\\theta^0)$, such that we have the following:\n\\begin{enumerate}\n    \\item Accurate approximation: $f_\\theta(x) \\approx g_\\theta(x)$, and $\\hat{L}(f_\\theta) \\approx \\hat{L}(g_\\theta)$ for all $\\theta \\in B(\\theta^0)$.\n    \\item It suffices to optimize in $B(\\theta^0)$: There exists an approximate global minimum $\\hat{\\theta} \\in B(\\theta^0)$, so $\\hat{L}(g_{\\hat{\\theta}}) \\approx 0$. This is the lowest possible loss (because the loss is nonnegative), which implies we are close to the global minimum. Because of 1, this implies that $\\hat{L}(f_{\\hat{\\theta}}) \\approx 0$ as well. See Figure~\\ref{lec13:fig:ntkglobalmin} for an illustration.\n    \\item Optimizing $\\hat{L} (f_\\theta)$ is similar to optimizing $\\hat{L}(g_\\theta)$ and does not leave $B(\\theta^0)$, i.e. everything is confined to this region. Intuitively, this last point to some extent is ``implied\" by (1) and (2), but this claim still requires a formal proof. \n\\end{enumerate}\n\n\\begin{figure}[ht!]\n    \\centering\n    \\includegraphics[scale=0.5]{figures/ntk_global_min.png}\n    \\caption{Here, $\\hat{L}(g_{\\theta})$ and $\\hat{L}(f_{\\theta})$ are both plotted. At $\\hat{\\theta}$, we have reached the approximate global minimum where $\\hat{L}(g_{\\hat{\\theta}}) \\approx 0$, in turn implying also that $\\hat{L}(f_{\\hat{\\theta}}) \\approx 0$.}\n    \\label{lec13:fig:ntkglobalmin}\n\\end{figure}\n\nNote (1), (2), and (3) can all be true in various settings. In particular, to attain all three, we will require: \n\\begin{enumerate}[label=\\alph*]\n    \\item[(a)] Overparametrization and/or a particular scaling of the initialized $\\theta^0$. \n    \\item[(b)] Small (or even zero) stochasticity, so $\\theta$ never leaves $B(\\theta^0)$. This condition is guaranteed by a small learning rate or full-batch gradient descent. 
\n\\end{enumerate} \nDespite the limitations of the requirements of (a) and (b), the existence of such a region is still surprising. Given the loss landscape which could potentially be highly non-convex, it is striking to find a neighborhood where the loss function is convex (e.g. quadratic) with a global minimum. This suggests there is some flexibility in the loss landscape.  \n\nTo begin our formal discussion, we  start by providing tools for proving (1) and (2). Let \n\\begin{align}\n    \\phi^{(i)} = \\phi(x\\sp{i}) = \\nabla_\\theta f_{\\theta^0}( x\\sp{i}) \\in \\R^p\n\\end{align}\nand \n\\begin{align}\n    \\Phi = \\begin{bmatrix} {\\phi\\sp{1}}^\\top \\\\ \\vdots \\\\ {\\phi\\sp{n}}^\\top \\end{bmatrix} \\in \\R^{n \\times p}\n\\end{align}\nwhere $p$ is the number of parameters. Taking the quadratic loss, we have\n\\begin{align}\n    \\hat{L}(g_\\theta) = \\frac{1}{n}\\sum_{i=1}^n \\left( y\\sp{i} - \\phi\\l(x\\sp{i} \\r)^\\top \\Delta \\theta \\right)^2 = \\frac{1}{n} \\norm{\\vec{y} - \\Phi \\cdot \\Delta \\theta}_2^2\n\\end{align} \nwhere $\\vec{y} = \\l[ y\\sp{1}, \\cdots, y\\sp{n} \\r]^\\top \\in \\R^n$. Note that this looks a lot like linear regression, where $\\Phi$ and $\\Delta \\theta$ are the analogues of the design matrix and parameter, respectively. We further assume that $y^{(i)} = O(1)$ and $\\norm{y}_2 = O(\\sqrt{n})$. Now, we can prove a lemma that addresses the second of the three conditions we described above, i.e. that it is sufficient to optimize in some small ball around $\\theta^0$.\n\n\\begin{lemma}[for (2)] \\label{lec13:lma:nearest_minimum}\n    Suppose we are in the setting where $p \\geq n$, $\\textup{rank}(\\Phi) = n$, and $\\sigma_{\\min}(\\Phi) = \\sigma > 0$. Then, letting $\\Delta \\hat{\\theta}$ denote the minimum norm solution, i.e. 
the nearest global minimum, of $\\vec{y} = \\Phi \\Delta \\theta$, we have \n    \\begin{align} \n        \\norm{\\Delta \\hat{\\theta}}_2 \\leq O(\\sqrt{n} / \\sigma)\n    \\end{align} \n\\end{lemma}\n\\begin{remark} \\label{lec13:rmk:intuitiononlemma} \n    The meaning of the bound on $\\Delta \\hat{\\theta}$ becomes clear if we consider the ball given by \n    \\begin{align}\n        B_{\\theta^0} = \\{ \\theta = \\theta^0 + \\Delta \\theta: \\norm{\\Delta \\theta}_2 \\leq O(\\sqrt{n}/\\sigma )\\}.\n    \\end{align} \n    In particular, notice that $B_{\\theta^0}$ contains a global minimum, so this lemma characterizes how large the ball must be to contain a global minimum. \n    \\end{remark} \n\\begin{remark}\n\tWe also note that the condition $\\textup{rank}(\\Phi) = n$ and $\\sigma > 0$ can be thought of as a ``finite-sample expressivity'' condition, saying that the features $\\Phi$ are expressive enough so that there exists a linear model on top of these features that perfectly fits the data. The condition $\\textup{rank}(\\Phi) = n$ requires $p \\ge n$---so we need some amount of over-parameterization to apply this analysis. \n\\end{remark}\n\\begin{proof}\n    Letting $\\Phi^+$ denote the Moore-Penrose pseudoinverse of $\\Phi$, note that $\\Delta \\hat{\\theta} = \\Phi^+ \\boldsymbol{y}$, and $\\norm{\\Phi^+} _{\\text{op}} = \\frac{1}{\\sigma_{\\min} (\\Phi)} = \\frac{1}{\\sigma}$.  A simple argument shows \n    \\begin{align}\n        \\norm{\\Delta \\hat{\\theta}}_2 &\\leq \\norm{\\Phi^+}_{\\text{op}} \\cdot \\norm{\\vec{y}}_2 \\\\\n        &\\leq O\\left( \\frac{1}{\\sigma}\\cdot \\sqrt{n} \\right),\n    \\end{align} \n    where the last inequality follows from the assumption that $\\norm{\\vec{y}}_2 \\leq O(\\sqrt{n})$. 
\n\\end{proof}\nNext, we prove a lemma that addresses the first of the three steps we described above.\n\\begin{lemma}[for (1)] \n    \\label{lec13:lma:accurate_approximation}\n    Suppose $\\nabla_\\theta f_\\theta(x)$ is $\\beta$-Lipschitz in $\\theta$, i.e. for every $x$, and $\\theta, \\theta'$, we have \n    \\begin{align}\n        \\norm{\\nabla_\\theta f_{\\theta} (x) - \\nabla_{\\theta} f_{\\theta'}(x)}_2 \\leq \\beta \\cdot \\norm{ \\theta - \\theta'}_2.\n    \\end{align} \n    Then, \n    \\begin{align} \n        \\left| f_\\theta(x) - g_\\theta(x) \\right| \\leq O \\left( \\beta \\norm{\\Delta \\theta}_2^2 \\right).\n    \\end{align}  \n    If we further restrict our choice of $\\theta$ using $B_{\\theta^0}$ as defined in Remark~\\ref{lec13:rmk:intuitiononlemma}, we obtain that\n    \\begin{align} \n        | f_\\theta(x) - g_\\theta(x) | \\leq O \\left( \\frac{\\beta n }{\\sigma^2 }\\right), \\quad \\forall \\theta \\in B_{\\theta^0}. \\label{lec13:eqn:lemma1bound} \n    \\end{align} \n\\end{lemma}\n\\begin{proof}\n    The proof comes from the following fact:  if $h(\\theta)$ is such that $\\nabla h(\\theta)$ is $\\beta$-Lipschitz (which if differentiable is equivalent to $\\norm{\\nabla^2 h(\\theta)}_{\\text{op}} \\leq \\beta$), then\n    \\begin{align}\n        \\bigg| \\underbrace{h(\\theta)}_{f_\\theta(x)}  \\underbrace{-h(\\theta^0) - \\inprod{\\nabla h(\\theta^0), \\theta-\\theta^0}}_{-g_\\theta(x)}\\bigg| \\leq O\\left( \\beta \\norm{\\theta-\\theta^0}_2^2 \\right).\n    \\end{align} \n    \\tnotelong{add a lemma in the toolbox section about this}\n    As shown above, the proof is as simple as plugging in $f_\\theta(x) = h(\\theta)$ and $g_\\theta(x)=h(\\theta^0) + \\inprod{\\nabla h(\\theta^0), \\Delta \\theta}$. \n\\end{proof}\n\n\\begin{remark}\nThe lemma above bounds the approximation error. 
Intuitively, as you move farther away from $\\theta^0$, the Taylor approximation gets worse; the approximation error is bounded above by a second order $\\Delta \\theta$ term.\n\\end{remark}\n\n\\begin{remark}\nNote that if $f_\\theta$ involves a $\\text{relu}$ function, then $\\nabla f_\\theta$ is not continuous everywhere. This requires a technical fix outside the scope of our discussion.\\footnote{A $\\text{relu}$ function is differentiable almost everywhere, so we can make some minor fixes and still use some modified notion of Lipschitzness to derive an upper bound.} \\tnotelong{Tengyu will add a reference here}\n\\end{remark}\n\n\\subsec{Two examples of the NTK regime} \\label{sec:ntk:two_examples}\nBy \\eqref{lec13:eqn:lemma1bound}, we have now established a bound on our approximation error, but we have yet to analyze how good it is, as $\\beta n /\\sigma^2$ is not obviously either big or small. An important fact to notice is that $\\beta/\\sigma^2$ is not scaling invariant, so we can play with the scaling in order to drive this term to $0$. In particular, there are two notable cases (with specific parameterization, initialization, etc) where $\\beta/\\sigma^2 \\to 0$. In the literature, such a situation is often referred to as the NTK regime or the lazy training regime~\\cite{chizat2018note}. \n\\begin{enumerate}\n    \\item  \\textbf{Reparameterize with a scalar} \\cite{chizat2018note}. Let $f_\\theta(x) = \\alpha \\cdot \\bar{f}_\\theta(x)$ where $\\bar{f}_\\theta(x)$ is an arbitrary neural net with fixed width and depth. We only vary $\\alpha$, i.e. the scaling, and we see how the crucial quantity $\\beta/\\sigma^2$ changes accordingly. 
Fix an initial $\\theta^0$, and let \n    \\begin{align}\n        \\bar{\\sigma} = \\sigma_{\\min}\\left( \\begin{bmatrix}  \\nabla_\\theta \\bar{f}_{\\theta^0} \\big( x^{(1)} \\big)^\\top \\\\ \\vdots \\\\ \\nabla_\\theta \\bar{f}_{\\theta^0} \\big(x^{(n)} \\big)^\\top \\end{bmatrix}\\right).\n    \\end{align} \n    Furthermore, let $\\bar{\\beta}$ be the Lipschitz parameter of $\\nabla_\\theta \\bar{f}_\\theta(x)$ in $\\theta$. A simple chain-rule gradient argument shows that scaling $\\bar{f}_{\\theta}$ by $\\alpha$ also scales $\\sigma$ and $\\beta$ accordingly, i.e. $\\sigma = \\alpha \\bar{\\sigma}$, and $\\beta = \\alpha \\bar{\\beta}$. Some straightforward algebra yields \n    \\begin{align} \n        \\frac{\\beta}{\\sigma^2}= \\frac{\\bar{\\beta}}{\\bar{\\sigma}^2} \\cdot \\frac{1}{\\alpha} \\to 0 \\quad \\text{as} \\quad \\alpha \\to \\infty.\n    \\end{align}\n    Once $\\alpha$ becomes big enough, then by Lemma~\\ref{lec13:lma:accurate_approximation}, the approximation $|f_\\theta(x) - g_\\theta(x)| \\leq O\\left( \\beta n / \\sigma^2 \\right)$ becomes very good. \n    \n\\begin{remark} A priori, such a phenomenon may appear to be too good to be true. To understand it better, we first note that this re-parameterization does not change the scale of the loss, but rather changes the shape of the loss function. Intuitively, as $\\alpha$ becomes larger, the function $f_\\theta$ becomes sharper and more non-smooth (leading to higher approximation error). However, on the other hand, we note that we only need to travel a little bit away from $\\theta^0$ to find a global minimum given that there is a global minimum within radius $O(\\sqrt{n}/\\sigma)$. It turns out that the radius needed shrinks faster than the smoothness grows. 
\n    \n    To visualize this effect, we can consider the following example with only 1 data point with 1-dimensional input $(x,y) = (1,1)$ and the quadratic model $\\bar{f}_\\theta(x) = x(\\theta + \\beta \\theta^2) = \\theta + \\beta \\theta^2$. Using the squared loss, we have \n    \\begin{align}\n    \\hatL(\\bar{f}_\\theta) = (1- (\\theta + \\beta \\theta^2))^2 \n    \\end{align}\n    Let $\\theta^0 = 0$. Taylor expanding at $\\theta^0$ gives the linear approximation $\\bar{g}_\\theta(x) = \\theta x$ = $\\theta$, and the resulting loss function that is quadratic \n    \\begin{align}\n    \\hatL(\\bar{g}_\\theta) = (1- \\theta)^2 \n    \\end{align}\n\tIn this case,  $\\nabla_\\theta \\bar{f}_{\\theta}(x) = (1 + 2\\beta \\theta) x = 1 + 2\\beta \\theta$ is $2\\beta$-Lipschitz in $\\theta$, and $ \\sigma = 1$. \n\t\n\tNow we vary $\\alpha$ and get \n    \\begin{align}\n\\hatL(\\alpha \\bar{f}_\\theta) = (1- \\alpha(\\theta + \\beta \\theta^2))^2 \n\\end{align}\t\nand \n\\begin{align}\n\\hatL(\\alpha\\bar{g}_\\theta) = (1- \\alpha\\theta)^2 \n\\end{align}\nNote that the minimizer of $\t\\hatL(\\alpha\\bar{g}_\\theta) $ is $1/\\alpha$, which is closer to $\\theta^0$ as $\\alpha\\rightarrow \\infty$. We zoom into the region $[0, 1/\\alpha]$ and find that the difference between $\\alpha \\bar{f}_\\theta$ and $\\alpha \\bar{g}_\\theta$ is $\\alpha \\beta\\theta^2 \\le  \\beta/\\alpha$, which is much smaller than the value of $\\alpha \\bar{g}_\\theta \\approx O(1)$. \n\nWe visualize these functions in Figure~\\ref{fig:ntk-1d}. 
We observe that $\\hatL(\\alpha\\bar{g}_\\theta) $ becomes a better approximation of $\\hatL(\\alpha\\bar{f}_\\theta)$ in the region $[0,1/\\alpha]$ as $\\alpha \\rightarrow \\infty$ (though $\\hatL(\\alpha\\bar{g}_\\theta)$ is a worse approximation of $\\hatL(\\alpha\\bar{f}_\\theta)$ globally.)\n\n\n\\begin{figure}[t]\n\t\\centering\n\t\\includegraphics[width = 0.6\\textwidth]{figures/ntk-1d.png}\n\t\\caption{\\label{fig:ntk-1d} The approximation $\\hatL(\\alpha\\bar{g}_\\theta) $ becomes a better approximation of $\\hatL(\\alpha\\bar{f}_\\theta)$ in the region $[0,1/\\alpha]$ as $\\alpha \\rightarrow \\infty$ (though $\\hatL(\\alpha\\bar{g}_\\theta)$ is a worse approximation of $\\hatL(\\alpha\\bar{f}_\\theta)$ globally).}\n\\end{figure}\n\t%The minimum norm solution with $\\alpha g_\\theta$ is\n%    \\[\n%    \\argmin{\\theta} (1-\\alpha  \\theta)^2 = 1/\\alpha.\n%    \\]\n%    Now, for $\\alpha\\geq 1$ we compute\n%    \\[\n%    D(\\alpha) = \\sup_{\\theta\\in [0,1/\\alpha ]} |\\alpha f_\\theta(1) - \\alpha g_\\theta(1)|. \n%    \\]\n%    We will plot $D(\\alpha)$ as well as $\\hat L(\\alpha f_\\theta(1))$ and $\\hat L(\\alpha g_\\theta(1))$ for $\\alpha = 1,2,4$. In the following plots, we use $\\beta = 2$. \n%   \n\\end{remark}\n    \\item \\textbf{Overparametrization (with specific initialization)}. Early papers on the NTK take this approach (e.g., ~\\cite{li2018learning,du2019width}). Consider a  two-layer network with $m$ neurons. \n    \\begin{align}\n        \\hat{y} = \\frac{1}{\\sqrt{m}} \\sum_{i=1}^m a_i \\sigma(w_i^\\top x )\n    \\end{align} \n    The scaling $1/\\sqrt{m}$ is to ensure that a random initialization with constant scale will have output on the right order, as we see momentarily. 
We make the following assumptions regarding the network and its inputs.\n    \\begin{align}\n        W &= \\begin{bmatrix} w_1^\\top \\\\ \\vdots \\\\ w_m^\\top \\end{bmatrix} \\in \\R^{m \\times d} \\\\\n        \\sigma &\\text{ is $1$-Lipschitz and twice-differentiable} \\\\\n        a_i &\\sim \\{\\pm 1\\} \\quad &\\text{(not optimized)} \\\\\n        w_i^0 &\\sim \\cN(0, I_d) \\\\\n        \\norm{x}_2 &= \\Theta(1) \\\\\n        \\theta &= \\text{vec}(W) \\in \\R^{dm} \\quad &\\text{(vectorized $W$)}\n    \\end{align} \n    We will assume $m \\to \\infty$ polynomially in $n$ and $d$. In particular, for fixed $n,d$, we have $m = \\textsf{poly}(n,d)$.\n    \n    Why do we use the $1/\\sqrt{m}$ scaling? Note that $\\sigma\\big({w_i^0}^\\top x\\big) \\approx 1$ because $\\norm{x}_2 = \\Theta(1)$ and $w_i^0$ is drawn from a spherical Gaussian. Thus, as some $a_i$ are positive and others are negative, $\\left|\\sum_{i=1}^m a_i \\sigma \\big({w_i^0}^\\top x\\big) \\right| = \\Theta \\left( \\sqrt{m} \\right)$, and finally $f_{\\theta^0} (x) = \\Theta(1)$. \n    \n    Now we analyze $\\sigma$ and $\\beta$. 
We let\n    \\begin{align}\n        \\sigma = \\sigma_{\\min} (\\Phi) = \\sqrt{\\sigma_{\\min} \\left( \\Phi \\Phi^\\top \\right)}\n    \\end{align}\n    where \n    \\begin{align}\n        \\left( \\Phi \\Phi^\\top \\right)_{ij} = \\inprod{\\nabla_\\theta f_{\\theta^0} \\big(x^{(i)} \\big) , \\nabla_\\theta f_{\\theta^0} \\big(x^{(j)} \\big)} \\label{lec13:eqn:phifeature} \n    \\end{align} \n    Note that the gradient with respect to $w_i$ is given by \n    \\begin{align}  \n        \\frac{\\partial f_\\theta(x) }{\\partial w_i} = \\frac{1}{\\sqrt{m}} \\sigma'(w_i^\\top x ) \\cdot x \n    \\end{align} \n    Now observe that\n    \\begin{align}\n        \\norm{\\nabla f_\\theta(x)}_2^2 & = \\frac{1}{m}\\sum_{i=1}^m \\norm{\\sigma'\\big({w_i}^\\top x \\big) \\cdot x }_2^2 \\\\ \n        & = \\frac{1}{m}\\norm{x}_2^2 \\cdot \\sum_{i=1}^m \\left( \\sigma' \\big({w_i}^\\top x \\big) \\right)^2 \\\\ \n        &\\to \\Exp_{w \\sim \\cN(0,I_d)} \\left[ \\sigma' \\big( w^\\top x \\big)^2 \\right] \\cdot \\norm{x}_2^2 \\quad \\text{as} \\quad m\\to\\infty \\\\ \n        &= O(1) &\\text{(not depending on $m$)}\n    \\end{align} \n    where the penultimate line follows from the law of large numbers, as $\\frac{1}{m} \\sum_{i=1}^m \\left( \\sigma'(w_i^\\top x ) \\right)^2$ can be interpreted as a mean. \n    \n    Note that the scale of $\\norm{\\nabla_\\theta f_{\\theta^0} (x)}_2$ does not depend on $m$, so the inner product in \\eqref{lec13:eqn:phifeature} also does not depend on $m$ either. 
As above, we can show \n    \\begin{align} \n        \\inprod{\\nabla_\\theta f_{\\theta^0} (x), \\nabla_\\theta f_{\\theta^0} (x')} & = \\frac{1}{m}\\inprod{ x,x'} \\sum_{i=1}^m \\sigma'(w_i^\\top x) \\sigma'(w_i^\\top x')  \\\\ \n        & \\to \\Exp_{w \\sim \\cN(0,I_d)} \\left[ \\sigma'(w^\\top x) \\sigma'(w^\\top x') \\right] \\inprod{ x, x'} \\label{lec13:eqn:kernelcalc} \n    \\end{align}\n    \n    \\eqref{lec13:eqn:kernelcalc} implies that as $m \\to \\infty$, $\\Phi \\Phi^\\top$ converges to a constant matrix denoted by \n    \\begin{align}\n        K^\\infty = \\lim_{m \\to \\infty} \\Phi\\Phi^\\top \n    \\end{align}\n    This is precisely the NTK with $m=\\infty$.  Though we omit the proof of this claim, it can be shown that $K^\\infty$ is full rank. Then, let \\begin{align}\n        \\sigma_{\\min} \\triangleq \\sigma_{\\min} (K^\\infty) > 0.\n    \\end{align}\n    We can show that \n    \\begin{align}\n        \\sigma^2 = \\sigma_{\\min} \\left( \\Phi \\Phi^\\top \\right) > \\frac{1}{2}\\sigma_{\\min} \n    \\end{align} \n    Intuitively, $\\Phi \\Phi^\\top \\to K^\\infty$, so the spectrum of the matrix should also converge. Thus, in some sense, we have shown that $\\sigma$ is constant in the limit. \n    \n    Now what about $\\beta$? If we can show $\\beta \\to 0$ as $m \\to \\infty$, we are done. We begin by analyzing this key expression:  \n    \\begin{align}\n        \\nabla_\\theta f_\\theta(x) - \\nabla_\\theta f_{\\theta'} (x) = \\left[ \\frac{1}{\\sqrt{m}} \\left( \\sigma' \\big( w_i^\\top x \\big) - \\sigma' \\big({w_i'}^\\top x \\big) \\right) \\cdot x \\right]_{i=1}^m \\label{lec13:eqn:lipschitzmatrix}\n    \\end{align}\n    Note that \\eqref{lec13:eqn:lipschitzmatrix} above consists of matrices, as $\\theta$ is a vectorized matrix. 
Then,\n    \\begin{align}\n        \\norm{\\nabla_\\theta f_\\theta(x) - \\nabla_\\theta f_{\\theta'}(x)}_2^2 & = \\frac{1}{m}\\sum_{i=1}^m \\norm{x}_2^2 \\left( \\sigma' \\big(w_i^\\top x \\big) - \\sigma' \\big({w_i}'^\\top x \\big) \\right)^2  \\\\ \n        & \\leq O \\left( \\frac{1}{m}\\sum_{i=1}^m \\norm{ x}_2^2 \\big( w_i^\\top x - {w_i'}^\\top x \\big)^2 \\right) \\\\ \n        & =  O \\left( \\frac{1}{m}\\sum_{i=1}^m \\norm{ w_i - w_i'}_2^2 \\right) \\\\ \n        & = O \\left(\\frac{1}{m} \\norm{ \\theta - \\theta' }_2^2 \\right)\n    \\end{align} \n    The first line follows from the fact that $\\frac{1}{\\sqrt{m}} \\left( \\sigma' \\big( w_i^\\top x \\big) - \\sigma' \\big({w_i'}^\\top x \\big) \\right)$ is a scalar. The second line uses the assumption that $\\sigma'$ is $O(1)$-Lipschitz. The third line uses Cauchy-Schwarz and the fact that $\\norm{x}_2^2 \\approx 1$. Taking the square root, we have that\n    \\begin{align} \n        \\norm{\\nabla_\\theta f_\\theta(x) - \\nabla_\\theta f_{\\theta'}(x)}_2 \\lesssim \\frac{1}{\\sqrt{m} } \\norm{ \\theta -\\theta' }_2\n    \\end{align} \n    Thus, the Lipschitz parameter is $\\beta = O(1/\\sqrt{m})$. Thus, our key quantity $\\beta/\\sigma^2$ goes to $0$ as $m$ grows. Namely,\n    \\begin{align} \n        \\frac{\\beta}{\\sigma^2} \\approx \\frac{1}{\\sqrt{m} }\\cdot \\frac{1}{\\sigma_{\\min}^2} \\to 0 \\quad \\text{as} \\quad m\\to\\infty.\n    \\end{align} \n    Recall here that $\\sigma_{\\min}$ does not depend on $m$. Concretely, this result tells us that our function becomes more smooth (the gradient has a smaller Lipschitz constant) as we add more neurons. \n\\end{enumerate}\n\n\\subsec{Optimizing \\texorpdfstring{$\\hat{L}(g_\\theta)$}{L(g)} vs. \\texorpdfstring{$\\hat{L}(f_\\theta)$}{L(f)}}\nWe now discuss how to establish the last of the three conditions under which we claimed a Taylor approximation is reasonable. 
We need to show that  optimizing $\\hat{L} (f_\\theta)$ is similar to optimizing $\\hat{L}(g_\\theta)$. To do so, we require two steps:\n\\begin{enumerate}[label=\\alph*]\n    \\item[(A)] Analyze optimization of $\\hat{L}(g_\\theta)$.\n    \\item[(B)] Analyze optimization of $\\hat{L}(f_\\theta)$ by re-using or modifying the proofs in (A).\n\\end{enumerate}\nThere are two approaches in the literature for (A), which implies that there exist two approaches for (B) as well. \n\\begin{enumerate}\n    \\item[(i)] We leverage the strong convexity of $\\hat{L} (g_\\theta)$, and then show an exponential convergence rate.\\footnote{Recall that a differentiable function $f$ is $\\mu$-strongly convex if \n    \\begin{align} \n        f(y) \\geq f(x) + \\nabla f(x)^\\top (y-x) + \\frac{\\mu}{2} \\norm{y-x}_2^2\n    \\end{align} for some $\\mu>0$ and all $x,y$.} \n    \\item[(ii)] Instead of strong convexity, we rely on the smoothness of $f_\\theta$ (i.e. bounded second derivative). \n\\end{enumerate}\nWe will only discuss the first of these two methods in the sequel.\n\n\\begin{remark} In either approach (i) or (ii), we will implicitly or explicitly use the following simple fact. \nSuppose at any $\\theta^t$, we take the Taylor expansion of $f_\\theta$ at $\\theta^t$:\n\\begin{align} \n    g_\\theta^t(x) = f_{\\theta^t} (x) + \\inprod{ \\nabla f_{\\theta^t} (x),\\theta-\\theta^t } \n\\end{align} \nConsider the gradient we are interested in taking: $\\nabla \\hat{L} ( f_{\\theta^t})$. Notice that: \\begin{align} \n    \\nabla \\hat{L} ( f_{\\theta^t}) = \\nabla \\hat{L} ( g_{\\theta^t}^t)\n\\end{align} \nThis is really saying that $f_\\theta$ and $g_\\theta^t$ agree up to first-order at $\\theta^t$. This implies that $L(f_\\theta)$ and $L(g_\\theta^t)$ also agree to first-order at $\\theta^t$. 
This also means that $T$ steps of gradient descent on $\\hat{L}(f_\\theta)$ is the same as performing online gradient descent\\footnote{Online gradient descent is the algorithm that takes one gradient descent step upon receiving a new objective function. See Chapter~\\ref{chap:OL} for more discussions about online learning.} on a sequence of changing objectives $L(g_\\theta^0), \\ldots, L(g_\\theta^T)$, and this online learning perspective is useful in the approach (ii). \n\\end{remark} \n\nWe will now show that under the strong convexity regime, optimizing a neural network $f_\\theta$ is equivalent to optimizing a linear model $g_\\theta$. We will also observe that this regime is not particularly practically relevant, but this analysis is nevertheless of interest to us for two reasons. First, the approach used in the subsequent exposition is of technical interest, and second, it remains quite interesting that optimizing $f_\\theta$ and optimizing $g_\\theta$ yield the same results under \\emph{any} regime. \n\n\\subsubsec{Optimizing $g_\\theta$}\nWe relate the optimization of $g_\\theta$ to performing linear regression. Recall that we can think of $\\nabla f_{\\theta^0}(x)$ as a feature map. Then, the problem of choosing $\\Delta \\theta$ to get $g_\\theta(x)$ to be close to $\\vec{y}$ is a linear regression. In particular, we use gradient descent to minimize\n\\al{\n\\norm{\\vec{y} - \\Phi\\Delta \\theta}_2^2,\n}\nwhere \n\\al{\n\\Phi =\n\\begin{bmatrix}\n\\nabla f_{\\theta^0}(x^{(1)})^\\top \\\\\n\\vdots \\\\\n\\nabla f_{\\theta^0}(x^{(n)})^\\top\n\\end{bmatrix}\n\\in \\R^{n \\times p}. \n\\quad \\quad \\vec{y} = \\begin{bmatrix} y\\sp{1} \\\\ \\vdots\\\\ y\\sp{n} \\end{bmatrix} \\in \\R^n\n}\nFor learning rate $\\eta$, the gradient descent update rule is \n\\al{\n{\\Delta \\theta}^{t+1} = \\Delta \\theta^{t} - \\eta \\Phi^\\top (\\Phi \\Delta \\theta^t - \\vec{y}). \\label{lec14:eqn:update-rule}\n}\nThis analysis considers changes in the output space. 
Define $\\hat{y}^t = \\Phi \\Delta \\theta^t$. Then, we're interested in changes in \n\\al{\n\\hat{y}^{t+1} - \\vec{y} &= \\Phi \\Delta \\theta^{t+1} - \\vec{y}\\\\\n&= \\Phi \\left( \\Delta \\theta^{t} - \\eta \\Phi^\\top (\\Phi \\Delta \\theta^t - \\vec{y})\\right) - \\vec{y} &\\text{(by \\eqref{lec14:eqn:update-rule})}\\\\\n&= \\left( \\Phi - \\eta \\Phi \\Phi^\\top \\Phi\\right)\\Delta \\theta^t - (I - \\eta \\Phi \\Phi^\\top)\\vec{y}\\\\\n&= (I - \\eta \\Phi \\Phi^\\top )\\Phi \\Delta \\theta^t - (I - \\eta \\Phi \\Phi^\\top )\\vec{y}\\\\\n&= (I - \\eta \\Phi \\Phi^\\top) (\\Phi \\Delta \\theta^t - \\vec{y})\\\\\n&= (I - \\eta \\Phi \\Phi^\\top)(\\hat{y}^t - \\vec{y}). \\label{lec14:eqn:g_decomp}\n}\nFrom this decomposition, we see that the residuals, $\\hat{y}^t - \\vec{y}$, are monotonically shrinking since $\\eta \\Phi \\Phi^\\top$, i.e. the term we are subtracting from $I$ in \\eqref{lec14:eqn:g_decomp}, is positive semidefinite. Next, we quantify how quickly we are shrinking the residuals. Define \n\\begin{align}\n    \\tau^2 &= \\sigma_{\\text{max}}(\\Phi \\Phi^\\top) \\\\\n    \\sigma &= \\sigma_\\text{min}(\\Phi) = \\sqrt{\\sigma_\\text{min}(\\Phi\\Phi^\\top)}. \\label{lec14:eqn:sigma_def}\n\\end{align}\nThen, we claim that when $\\eta \\leq \\frac{1}{\\tau^2}$,\n\\al{\n\\norm{I - \\eta \\Phi \\Phi^\\top }_{\\text{op}} \\leq 1-\\eta \\sigma^2. \\label{lec14:eqn:g_decomp_op}\n}\nWhy? Let the eigenvalues of $\\Phi \\Phi^\\top$ be (in descending order) $\\tau_1^2, \\dots , \\tau_n^2$. By definition, $\\tau_1^2 = \\tau^2$ and $\\tau_n^2 = \\sigma^2$. 
Now, given the singular value decomposition, $\\Phi = U\\Sigma V^\\top$, we obtain the eigendecomposition: \n\\al{\nI - \\eta \\Phi \\Phi^\\top &= I - \\eta U \\Sigma^2 U^\\top \\\\\n&= U U^\\top - \\eta U \\Sigma^2 U^\\top \\\\ \n&= U(I - \\eta \\Sigma^2)U^\\top \\label{lec14:eqn:g_coeff_eigendecomposition}.\n}\n\\eqref{lec14:eqn:g_coeff_eigendecomposition} is the eigendecomposition of $I - \\eta \\Phi \\Phi^\\top$, so $I - \\eta \\Phi \\Phi^\\top$ has eigenvalues $1 - \\eta \\tau_1^2, \\dots, 1 - \\eta \\tau_n^2$.\n\\tnotelong{add more backgrounds about pseudo-inverse and SVD, and linear algebra} Note that assuming $\\eta \\leq \\frac{1}{\\tau^2}$ ensures that all eigenvalues of $I - \\eta \\Phi \\Phi^\\top$ are non-negative. Thus,\n\\al{\n\\norm{I - \\eta \\Phi \\Phi^\\top}_\\text{op} &\\leq \\max_j |1-\\eta \\tau_j^2|\\\\\n&= 1 - \\eta \\tau_n^2 \\label{lec14:eqn:eigenvalue_bound}\\\\\n&= 1 - \\eta \\sigma^2,\n}\nwhere the non-negativity of $1 - \\eta \\tau_j^2$ for all $j$ implies \\eqref{lec14:eqn:eigenvalue_bound}.\n\nUsing this result, we obtain the desired convergence rate. Namely, assuming $\\eta \\leq \\frac{1}{\\tau^2}$,\n\\al{\n\\norm{\\hat{y}^{t+1} - \\vec{y}}_2 &\\leq \\norm{I - \\eta \\Phi \\Phi^\\top }_\\text{op} \\cdot \\norm{\\hat{y}^t - \\vec{y}}_2 \\\\\n&\\leq (1-\\eta\\sigma^2)\\norm{\\hat{y}^t - \\vec{y}}_2 \\\\\n&\\leq (1 - \\eta \\sigma^2 )^{t+1}\\norm{\\hat{y}^0 - \\vec{y}}_2.\n}\nThis yields the desired exponential decay in the error. Thus, after $T = O \\l( \\frac{\\log 1/\\epsilon}{\\eta \\sigma^2}\\r)$ iterations, \n\\al{\n\\norm{ \\hat{y}^T - \\vec{y} }_2 \\leq \\epsilon \\norm{\\hat{y}^0 - \\vec{y}}_2. \\label{lec14:eqn:g_exponential_decay}\n}\n\n\\subsubsec{Optimizing $f_\\theta$}\nWe now transition to an analysis of the optimization of $f_{\\theta}$. Our key result is Theorem \\ref{lec14:thm:optimization_f}. 
If we compare it against what we have in \\eqref{lec14:eqn:g_exponential_decay}, we see the claimed similarity between $f_\\theta$ and $g_\\theta$ in error decay under optimization. \n\n\\begin{theorem}\nThere exists a constant $c_0 \\in (0, 1)$ such that for $\\frac{\\beta}{\\sigma^2} \\leq \\frac{c_0}{n}$ and sufficiently small $\\eta$ (which could depend on $\\beta, \\sigma$, or $p$), $\\hat{L}\\l(f_{\\theta^T}\\r) \\leq \\epsilon$ after $T = O \\l(\\frac{\\log 1/\\epsilon}{\\eta\\sigma^2}\\r)$ steps. \\label{lec14:thm:optimization_f} \n\\end{theorem}\n\n\\begin{proof}\n\n(This is actually a proof sketch that elides a few technical details for the sake of a simpler exposition.) Our approach is to follow the preceding analysis of $g_\\theta$, making changes where necessary.\n\nLet  \n\\al{\n\\Phi^t =\n\\begin{bmatrix}\n\\nabla f_{\\theta^t}(x^{(1)})^\\top \\\\\n\\vdots \\\\\n\\nabla f_{\\theta^t}(x^{(n)})^\\top\n\\end{bmatrix}\n\\in \\R^{n \\times p}.\n}\nTo obtain our gradient descent update rule, we find, using the chain rule,\n\\begin{align}\n    \\nabla \\hat{L}\\l(f_{\\theta^t}\\r) &=  \\sum_{i=1}^n\\l(f_{\\theta^t}\\l(x^{(i)}\\r) - y^{(i)}\\r)\\nabla f_{\\theta^t}\\l(x^{(i)}\\r) \\\\ \n    &=  \\sum_{i=1}^n\\l(\\hat{y}^{(i), t} - y^{(i)}\\r)\\nabla f_{\\theta^t}\\l(x^{(i)}\\r) \\\\\n    &= (\\Phi^t)^\\top\\l(\\hat{y}^t - \\vec{y}\\r).\n\\end{align}\nThis results in the policy\n\\begin{align}\n    \\theta^{t+1} &= \\theta^t - \\eta \\nabla \\hat{L}\\l(f_{\\theta^t}\\r) \\\\\n    &= \\theta^t - \\eta (\\Phi^t)^\\top\\l(\\hat{y}^t - \\vec{y}\\r) \\\\ \n    &= \\theta^t - \\eta b^t,\n\\end{align}\nwhere we have let $b^t = (\\Phi^t)^\\top\\l(\\hat{y}^t - \\vec{y}\\r)$. Following our treatment of $g_\\theta$, we want to express $\\hat{y}^{t+1}$ as a function of $\\hat{y}^{t}$. The challenge now is that $f$ is nonlinear. 
To deal with this, we Taylor expand $f_\\theta$ at $\\theta^t$:\n\n\\begin{align}\n   f_{\\theta^{t+1}}(x^{(i)}) &= f_{\\theta^{t}}(x^{(i)}) + \\l<\\nabla f_{\\theta^t}(x^{(i)}), \\theta^{t+1} - \\theta^t \\r> + \\text{higher order terms} \\\\\n   &= f_{\\theta^{t }}(x^{(i)}) + \\l<\\nabla f_{\\theta^t}(x^{(i)}), -\\eta b^t \\r> + O\\l(\\norm{\\theta^{t+1} - \\theta^t}_2^2\\r). \\label{lec14:eqn:f_taylor_expansion}\n \\end{align}\nSince $O\\l(\\norm{\\theta^{t+1} - \\theta^t}_2^2\\r)$ is $O\\l(\\eta^2\\r)$, we can ignore this term as $\\eta \\rightarrow 0$. Vectorizing \\eqref{lec14:eqn:f_taylor_expansion} without $O\\l(\\norm{\\theta^{t+1} - \\theta^t}_2^2\\r)$,\n\\begin{align}\n    \\hat{y}^{t+1} &= \\hat{y}^t - \\eta \\Phi^t b^t \\\\\n    &= \\hat{y}^t + \\eta \\Phi^t\\l(\\Phi^t\\r)^\\top(\\vec{y} - \\hat{y}^t).\n\\end{align}\nSubtracting $\\vec{y}$ and re-arranging,\n\\begin{align}\n    \\hat{y}^{t+1} - \\vec{y} &= \\hat{y}^t - \\vec{y} + \\eta \\Phi^t\\l(\\Phi^t\\r)^\\top(\\vec{y} - \\hat{y}^t) \\\\ \n    &= \\l(I - \\eta \\Phi^t\\l(\\Phi^t\\r)^\\top\\r)\\l(\\hat{y}^t - \\vec{y}\\r). \\label{lec14:eqn:f_decomposition}\n\\end{align}\nComparing \\eqref{lec14:eqn:f_decomposition} with \\eqref{lec14:eqn:g_decomp}, we see one difference: in \\eqref{lec14:eqn:f_decomposition}, our convergence depends on $\\eta \\Phi^t\\l(\\Phi^t\\r)^\\top$, which is a matrix that changes as we iterate, whereas in \\eqref{lec14:eqn:g_decomp}, convergence is controlled by a matrix that is fixed as we iterate. \n\nTo understand the convergence implications of \\eqref{lec14:eqn:f_decomposition}, we examine the eigenvalues of $I - \\eta \\Phi^t \\l(\\Phi^t\\r)^\\top$. For now, suppose \n\\begin{equation}\n    \\norm{\\theta^t - \\theta^0}_2 \\leq \\sigma/(4\\sqrt{n}\\beta)\n\\end{equation} \nat time $t$. This implies that $\\norm{\\Phi^t - \\Phi}_F \\leq \\frac{\\sigma}{4}$ by the Lipschitzness of $\\nabla f_\\theta(x)$ in $\\theta$. 
Then, we claim that \n\\begin{align}\n    \\sigma_{\\text{min}}(\\Phi^t) \\geq 3\\sigma/4. \\label{lec14:eqn:phi_t_eigenvalue_bound}\n\\end{align}\nWhy does \\eqref{lec14:eqn:phi_t_eigenvalue_bound} hold? Observe that\n\\begin{align}\n    \\sigma_\\text{min}(\\Phi^t) &= \\underset{\\norm{x}_2=1}{\\text{min}} x^\\top \\Phi^tx \\\\\n   &\\geq \\underset{\\norm{x}_2=1}{\\text{min}} x^\\top (\\Phi^t - \\Phi)x + \\underset{\\norm{x}_2=1}{\\text{min}}  x^\\top \\Phi x. \\label{lec14:eqn:eigenbound_phi_t}\n\\end{align}\nWe can lower bound the first term of \\eqref{lec14:eqn:eigenbound_phi_t} as follows:\n\\begin{align}\n    x^\\top (\\Phi^t - \\Phi)x &\\geq -|\\l<x, (\\Phi^t - \\Phi)x\\r>| \\\\\n    &\\geq -\\norm{x}_2 \\norm{(\\Phi^t - \\Phi)x}_2 &\\text{(Cauchy-Schwarz)}\\\\ \n    &\\geq -\\norm{\\Phi^t - \\Phi}_2 &\\text{($\\norm{x}_2 = 1$)}\\\\ \n    &\\geq -\\sigma/4 &\\text{(Lipschitzness of $\\Phi$)}. \\label{lec14:eqn:termone_eigenbound}\n\\end{align}\nNext, we note that the second term of \\eqref{lec14:eqn:eigenbound_phi_t} is lower bounded by $\\sigma$ by simplifying and applying the definition of $\\sigma$ given in \\eqref{lec14:eqn:sigma_def}. 
Combining this observation with \\eqref{lec14:eqn:termone_eigenbound}, we conclude that \\eqref{lec14:eqn:phi_t_eigenvalue_bound} must hold.\n\nApplying this lower bound on the eigenvalues of $\\Phi^t$, we can use the same argument we used to establish \\eqref{lec14:eqn:g_decomp_op} to conclude that\n\\begin{align}\n    \\norm{I - \\eta \\Phi^t \\l(\\Phi^t\\r)^\\top}_{\\text{op}} \\leq 1 - 3\\eta \\sigma/4 \\label{lec14:eqn:op_norm_bound},\n\\end{align}\nand \n\\begin{align}\n    \\norm{\\hat{y}^{t+1} - \\vec{y}}_{2} \\leq \\l(1 - 3\\eta \\sigma/4\\r)^{t+1} \\norm{\\hat{y}^{0} - \\vec{y}}_{2}.\n\\end{align}\nSo, as desired, we see exponential decay in the error at each iteration and after $T = O \\l( \\frac{\\log 1/\\epsilon}{\\eta\\sigma^2}\\r)$ iterations,\n\\al{\n\\hat{L}(f_{\\theta^T}) \\leq \\epsilon.\n}\n\nTo complete our proof, observe that this argument is predicated upon the assumption that $\\norm{\\theta^t - \\theta^0}_2 \\leq \\sigma/(4\\sqrt{n}\\beta)$. This assumption is reasonable, however, given what we have already proven. Recall that in Lemma~\\ref{lec13:lma:nearest_minimum}, we proved that \n\\begin{align}\n    \\norm{\\Delta \\hat{\\theta}}_2 = \\norm{\\hat{\\theta} - \\theta^{0}}_2 \\lesssim \\sqrt{n}/\\sigma.\n\\end{align}\nThus, when $\\beta/\\sigma^2 \\rightarrow 0$, eventually, $\\sqrt{n}/\\sigma \\ll \\sigma/(4\\sqrt{n}\\beta)$. To extend this to $\\norm{\\hat{\\theta} - \\theta^t}_2$ for arbitrary $t$, we heuristically argue that since the empirical minimizer is within $\\sigma/(4\\sqrt{n}\\beta)$ of $\\theta^0$, we would not expect to have traveled more than $\\sigma/(4\\sqrt{n}\\beta)$ from $\\theta^0$ at \\emph{any} iteration. \n\nMore formally, we claim that for all $t \\in \\mathbb{N}$, \n\\al{\n\\norm{\\hat{y}^t - \\vec{y}}_2 \\leq \\cO (\\sqrt{n}).  \\label{lec14:eqn:induction}\n}\nWe proceed via induction. 
For $t=0$, because each element of $\\hat{y}$ is of order $1$, we know that: \n\\al{\n\\frac{1}{\\sqrt{n}} \\norm{\\hat{y}^0 - \\vec{y}}_2 \\leq O (1).\n}\nNow, suppose that \\eqref{lec14:eqn:induction} holds for some $t$. Then, because the errors are monotonically decreasing, (cf. \\eqref{lec14:eqn:f_decomposition} and \\eqref{lec14:eqn:op_norm_bound}), \n\\al{\n\\frac{1}{\\sqrt{n}} \\norm{\\hat{y}^{t+1} - \\vec{y}}_2 \\leq \\frac{1}{\\sqrt{n}}  \\norm{\\hat{y}^t - \\vec{y}}_2 \\leq O(1).\n}\nThus, \\eqref{lec14:eqn:induction} holds for all $t \\in \\mathbb{N}$. \n\nNext, applying Lemma~\\ref{lec13:lma:accurate_approximation} with $\\theta = \\theta^t$ and our assumption that $\\frac{\\beta}{\\sigma^2} \\lesssim \\frac{1}{n}$, we conclude that:\n\\begin{align}\n    \\frac{1}{\\sqrt{n}} \\norm{\\Phi \\theta^t - \\hat{y}^t}_2 \\leq O(1)\n\\end{align}\nUsing this result and \\eqref{lec14:eqn:induction}, we can show that $\\frac{1}{\\sqrt{n}} \\norm{\\Phi(\\theta^t - \\hat{\\theta})}_2$ is $O(1)$.\n\\al{\n    \\frac{1}{\\sqrt{n}} \\norm{\\Phi(\\theta^t - \\hat{\\theta})}_2 &= \\frac{1}{\\sqrt{n}} \\norm{\\Phi \\theta^t - \\vec{y}}_2 &\\text{($\\vec{y} = \\Phi \\hat{\\theta}$)} \\\\\n    &= \\frac{1}{\\sqrt{n}} \\norm{\\Phi \\theta^t - \\hat{y}^t + \\hat{y}^t - \\vec{y}}_2 \\\\\n    &\\leq \\frac{1}{\\sqrt{n}} \\norm{\\Phi \\theta^t - \\hat{y}^t}_2 + \\frac{1}{\\sqrt{n}}\\norm{\\hat{y}^t - \\vec{y}}_2 &\\text{(triangle ineq.)} \\\\\n    &\\leq O(1).\n}\nThen, leveraging the definition of $\\sigma$ given in \\eqref{lec14:eqn:sigma_def} and rearranging, we obtain (nearly) the desired result:\n\\al{\n\\norm{\\theta^t - \\hat{\\theta}}_2 \\leq \\frac{1}{\\sigma}\\norm{\\Phi (\\theta^t - \\hat{\\theta})}_2 &\\leq O(\\sqrt{n}/\\sigma).\n}\nRecall that in Lemma~\\ref{lec13:lma:nearest_minimum}, we proved that \n\\al{\n\\norm{\\hat{\\theta} - \\theta^0}_2 \\leq O(\\sqrt{n}/\\sigma).\n}\nIf $\\beta/\\sigma^2 \\ll 1/n$, we conclude that\n\\al{\n\\norm{\\theta^t - \\theta^0}_2 
&\\leq \\norm{\\hat{\\theta} - \\theta^0}_2 + \\norm{\\theta^t - \\hat{\\theta}}_2 &\\text{(triangle ineq.)}\\\\\n&\\leq O\\l (\\frac{\\sqrt{n}}{\\sigma} \\r ) \\leq \\frac{\\sigma}{4\\sqrt{n}\\beta}. \n}\n\\end{proof}\n\\subsec{Limitations of NTK}\n\nThe NTK approach has its limitations.\n\\begin{itemize}\n    \\item Empirically, optimizing $g_\\theta(x)$ as described in the theory does not work as well as state-of-the-art (or even standard) deep learning methods. For example, using the NTK approach (i.e., taking the Taylor expansion and optimizing $g_{\\theta}(x)$) with a ResNet generally does not perform as well as ResNet with best-tuned hyperparameters.\n    \n    \\item The NTK approach requires a specific initialization scheme and learning rate which may not coincide with what is commonly used in practice.\n    \n    \\item The analysis above was for gradient descent, while stochastic gradient descent is used in practice, introducing noise in the procedure. This means that NTK with stochastic gradient descent requires a small learning rate to stay in the initialization neighborhood. Deviating from the requirements can lead to leaving the initialization neighborhood.\n\\end{itemize}\n\nOne possible explanation for the gap between theory and practice is because NTK effectively requires a fixed kernel, so there is no incentive to select the right features. Furthermore, the minimum $\\ell_2$-norm solution is typically dense. This is similar to the difference between sparse and dense combinations of features observed in the $\\ell_1$-SVM/two-layer network versus the standard kernel method SVM (or $\\ell_2$-SVM) analyzed previously.\n\nTo make these ideas more concrete, consider the following example \\cite{wei2020regularization}. \n\\begin{example}\\label{lec12:ex:sparse123}\nLet $x \\in \\R^d$ and $y \\in \\{-1, 1\\}$. Assume that each component of $x$ satisfies $x_i \\in \\{ -1, 1\\}$. 
Define the output $y = x_1x_2$, that is, $y$ is only a function of the first two components of $x$.\n\nThis output function can be described exactly by a neural network consisting of a sparse combination of the features (4 neurons to be exact):\n\\begin{align}\n\\hat y &= \\frac{1}{2} \\left[ \\phirelu(x_1 + x_2) + \\phirelu(-x_1 - x_2)  - \\phirelu(x_1 - x_2) -  \\phirelu(x_2 - x_1)  \\right] \\\\\n&= \\frac{1}{2}\\left( |x_1 + x_2| - |x_1 - x_2| \\right) \\label{lec12:eqn:ex1} \\\\\n&= x_1x_2. \\label{lec12:eqn:ex2}\n\\end{align}\n\\eqref{lec12:eqn:ex1} follows from the fact that $\\phirelu(t) + \\phirelu(-t) = |t|$ for all $t$, while \\eqref{lec12:eqn:ex2} follows from evaluating the 4 possible values of $(x_1, x_2)$. Thus, we can solve this problem exactly with a very sparse combination of features.\n\nHowever, if we were to use the NTK approach (kernel method), the network's output will always involve $\\sigma(w^\\top x)$ where $w$ is random so it includes all components of $x$ (i.e. a dense combination of features), and cannot isolate just the relevant features $x_1$ and $x_2$. This is illustrated in the following informal theorem:\n\\begin{theorem}\nThe kernel method with NTK requires $n = \\Omega(d^2)$ samples to learn Example \\ref{lec12:ex:sparse123} well. In contrast, the neural network regularized by $\\sum_{j = 1}^m | u_j| \\| w_j\\|_2$ only requires $n = O(d)$ samples.\n\\end{theorem}\n\\end{example}\n\n\n"
  },
  {
    "path": "tex/collection/07-05-ntk-limitation.tex",
    "content": "\\subsec{Limitations of NTK}\n\nThe NTK approach has its limitations.\n\\begin{itemize}\n    \\item Empirically, optimizing $g_\\theta(x)$ as described in the theory does not work as well as state-of-the-art (or even standard) deep learning methods. For example, using the NTK approach (i.e., taking the Taylor expansion and optimizing $g_{\\theta}(x)$) with a ResNet generally does not perform as well as ResNet with best-tuned hyperparameters.\n    \n    \\item The NTK approach requires a specific initialization scheme and learning rate which may not coincide with what is commonly used in practice.\n    \n    \\item The analysis above was for gradient descent, while stochastic gradient descent is used in practice, introducing noise in the procedure. This means that NTK with stochastic gradient descent requires a small learning rate to stay in the initialization neighborhood. Deviating from the requirements can lead to leaving the initialization neighborhood.\n\\end{itemize}\n\nOne possible explanation for the gap between theory and practice is because NTK effectively requires a fixed kernel, so there is no incentive to select the right features. Furthermore, the minimum $\\ell_2$-norm solution is typically dense. This is similar to the difference between sparse and dense combinations of features observed in the $\\ell_1$-SVM/two-layer network versus the standard kernel method SVM (or $\\ell_2$-SVM) analyzed previously.\n\nTo make these ideas more concrete, consider the following example \\cite{wei2020regularization}. \n\\begin{example}\\label{lec12:ex:sparse123}\nLet $x \\in \\R^d$ and $y \\in \\{-1, 1\\}$. Assume that each component of $x$ satisfies $x_i \\in \\{ -1, 1\\}$. 
Define the output $y = x_1x_2$, that is, $y$ is only a function of the first two components of $x$.\n\nThis output function can be described exactly by a neural network consisting of a sparse combination of the features (4 neurons to be exact):\n\\begin{align}\n\\hat y &= \\frac{1}{2} \\left[ \\phirelu(x_1 + x_2) + \\phirelu(-x_1 - x_2)  - \\phirelu(x_1 - x_2) -  \\phirelu(x_2 - x_1)  \\right] \\\\\n&= \\frac{1}{2}\\left( |x_1 + x_2| - |x_1 - x_2| \\right) \\label{lec12:eqn:ex1} \\\\\n&= x_1x_2. \\label{lec12:eqn:ex2}\n\\end{align}\n\\eqref{lec12:eqn:ex1} follows from the fact that $\\phirelu(t) + \\phirelu(-t) = |t|$ for all $t$, while \\eqref{lec12:eqn:ex2} follows from evaluating the 4 possible values of $(x_1, x_2)$. Thus, we can solve this problem exactly with a very sparse combination of features.\n\nHowever, if we were to use the NTK approach (kernel method), the network's output will always involve $\\sigma(w^\\top x)$ where $w$ is random so it includes all components of $x$ (i.e. a dense combination of features), and cannot isolate just the relevant features $x_1$ and $x_2$. This is illustrated in the following informal theorem:\n\\begin{theorem}\nThe kernel method with NTK requires $n = \\Omega(d^2)$ samples to learn Example \\ref{lec12:ex:sparse123} well. In contrast, the neural network regularized by $\\sum_{j = 1}^m | u_j| \\| w_j\\|_2$ only requires $n = O(d)$ samples.\n\\end{theorem}\n\\end{example}"
  },
  {
    "path": "tex/collection/08-01-algorithmic.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{13}{Rohith Kuditipudi and Kefan Dong}{Mar 1st, 2021}\n\nOne of the miracles of modern deep learning is the phenomenon of \\textit{algorithmic regularization} (also known as \\textit{implicit regularization} or \\textit{implicit bias}): although the loss landscape may contain infinitely many global minimizers, many of which do not generalize well, in practice our optimizer (e.g. SGD) tends to recover solutions with good generalization properties.\n\nThe focus of this chapter will be to illustrate algorithmic regularization in simple settings. In particular, we first show that gradient descent (with the right initialization) identifies the minimum norm interpolating solution in overparametrized linear regression. Next, we show that for a certain non-convex reparametrization of the linear regression task where the data is generated from a sparse ground-truth model, gradient descent (again, suitably initialized) approximately recovers a sparse solution with good generalization. Finally, we discuss algorithmic regularization in the classification setting, and how stochasticity can contribute to algorithmic regularization.\n\n\\sec{Implicit regularization effect of zero initialization in overparametrized linear regression}\\label{lec13:sec:olr}\nWe prove that gradient descent initialized at the origin converges to the minimum norm interpolating solution (assuming such a solution exists). \n\nLet $X \\defeq \\l[x\\sp{1},...,x\\sp{n} \\r]^\\top \\in \\bbR^{n \\times d}$ denote our data matrix and $\\vec{y} \\defeq \\l[y\\sp{1},...,y\\sp{n} \\r]^\\top \\in \\bbR^n$ denote our label vector, where $n < d$. Assume $X$ is full rank. 
Our goal is to find a weight vector $\\beta$ that minimizes our empirical loss function $\\hatL (\\beta) \\defeq \\frac{1}{2}\\|\\vec{y} - X\\beta\\|_2^2$.\n\n%\\subsec{Analysis of algorithmic regularization}\nAs we are in the overparametrized setting with $n < d$ and $X$ full rank, there exist infinitely many global minimizers that interpolate the data and hence achieve zero loss. In fact, the following lemma shows that the set of global minimizers forms a subspace.\n\n\\begin{lemma}\\label{lec13:lem:soln-subspace}\nLet $X^+$ denote the pseudoinverse\\footnote{Since $X$ is full rank, $XX^\\top$ is invertible and so we have $X^+ = X^\\top (X X^\\top)^{-1}$. Note that $X X^+ X = X$.} of $X$. Then $\\beta$ is a global minimizer if and only if $\\beta = X^+ \\vec{y} + \\zeta$ for some $\\zeta$ such that $\\zeta \\perp x_1,...,x_n$.\n\\end{lemma}\n\n\\begin{proof}\nFor any $\\beta \\in \\R^d$, we can decompose it as $\\beta = X^+ \\vec{y} + \\zeta$ for some $\\zeta \\in \\R^d$. Since\n\\begin{equation}\nX\\beta = X (X^+ \\vec{y} + \\zeta) = \\vec{y} + X\\zeta,\n\\end{equation}\n\n$\\beta$ is a global minimizer if and only if $X\\zeta = 0$, which happens if and only if $\\zeta \\perp x_1,...,x_n$.\n\n\\end{proof}\n\nFrom Lemma~\\ref{lec13:lem:soln-subspace}, we can derive an explicit formula for the minimum norm interpolant $\\beta^\\star \\defeq \\argmin_{\\beta : \\hatL(\\beta) = 0} \\|\\beta\\|_2$.\n\\tnotelong{add some basics about SVD }\n\\begin{corollary}\n$\\beta^\\star = X^+ \\vec{y}$.\n\\end{corollary}\n\n\\begin{proof}\nTake any $\\beta$ such that $\\hatL(\\beta) = 0$, and write $\\beta = X^+ \\vec{y} + \\zeta$. 
Then from the definition of $X^+$ and the fact that $X \\zeta = 0$ (see the proof of Lemma~\\ref{lec13:lem:soln-subspace}), we have \n\\begin{align}\n    ||\\beta||_2^2 &= ||X^+ \\vec{y}||_2^2 + ||\\zeta||_2^2 + 2 \\langle X^+ \\vec{y}, \\zeta \\rangle \\\\\n    &= ||X^+ \\vec{y}||_2^2 + ||\\zeta||_2^2 + 2 \\langle X^\\top(X X^\\top)^{-1} \\vec{y}, \\zeta \\rangle \\\\\n    &= ||X^+ \\vec{y}||_2^2 + ||\\zeta||_2^2 + 2 \\langle (X X^\\top)^{-1} y, X \\zeta \\rangle \\\\\n    &= ||X^+ \\vec{y}||_2^2 + ||\\zeta||_2^2 &\\text{(because $X\\zeta = 0$)} \\\\\n    &\\geq ||X^+ \\vec{y}||_2^2,\n\\end{align}\nwith equality if and only if $\\zeta = 0$.\n\n\\end{proof}\n\nNow, suppose we learn $\\beta$ using gradient descent with initialization $\\beta^0$, where at iteration $t$ we set $\\beta^t = \\beta^{t-1} - \\eta \\nabla \\hatL(\\beta^{t-1})$ for some learning rate $\\eta$. Since $\\hatL (\\beta)$ is convex, we know from standard results in convex optimization that gradient descent will converge to a global minimizer for a suitably chosen learning rate $\\eta$ (in particular, taking $\\eta$ to be sufficiently small). Assuming $\\beta^0 = 0$, we will in fact recover the minimum norm interpolating solution.\n\\begin{theorem}\\label{lec13:thm:linear-main}\nSuppose gradient descent on $\\hatL(\\beta)$ with initialization $\\beta^0 = 0$ converges to a solution $\\hat \\beta$ such that $\\hatL(\\hat \\beta) = 0$. Then $\\hat \\beta = \\beta^\\star$.\n\\end{theorem}\n\nThe main idea of the proof is that the iterates of gradient descent always lie in the span of the $x\\sp{i}$'s (see Figure \\ref{lec13:fig:1} for an illustration).\n\n\\begin{figure}\n\\centering\n\\includegraphics[width=.35\\linewidth]{figures/subspace-global-min.png}\n\\caption{Visualization of proof intuition for Theorem~\\ref{lec13:thm:linear-main}. 
The solution $\\beta^\\star$ is the projection of the origin onto the subspace of global minima.}\n\\label{lec13:fig:1}\n\\end{figure}\n\n\\begin{proof}\nWe first show via induction that $\\beta^t \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$ for all $t$. For the induction base case, note that $\\beta^0 = 0 \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$. Now suppose $\\beta^{t-1} \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$. Recall that $\\beta^t = \\beta^{t-1} - \\eta \\nabla \\hatL(\\beta^{t-1})$. Since left-multiplying any vector by $X^\\top$ amounts to taking a linear combination of the rows of $X$, it follows that $\\eta \\nabla \\hatL(\\beta^{t-1}) = \\eta X^\\top(X\\beta^{t-1} - \\vec{y}) \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$, and so $\\beta^t = \\beta^{t-1} - \\eta \\nabla \\hatL(\\beta^{t-1}) \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$. This proves the induction step.\n\nNext, we show that $\\hat \\beta \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$ and $\\hatL(\\hat \\beta) = 0$ implies $\\hat \\beta = \\beta^\\star$. By definition, $\\hat \\beta \\in \\text{span}\\l\\{ x\\sp{1}, \\dots,x\\sp{n} \\r\\}$ implies $\\hat \\beta = X^\\top v$ for some $v \\in \\bbR^n$. Since $\\hatL(\\hat \\beta) = 0$, we have $0 = X\\hat \\beta - \\vec{y} = X X^\\top v - \\vec{y}$. This implies $v = (X X^\\top)^{-1}\\vec{y}$, and so $\\hat \\beta = X^\\top v = X^\\top (X X^\\top)^{-1} \\vec{y} = X^+ \\vec{y} = \\beta^\\star$.\n\\end{proof}\n\n\\sec{Implicit regularization of small initialization in nonlinear models} \\label{sec:small_init_nonlinear}\nWe give another example of implicit regularization effect of small initialization in a non-convex version of the overparametrized linear regression task considered in the previous section. 
The results in this subsection are largely simplifications of the paper~\\citet{li2017algorithmic} which studies over-parameterized compressed sensing and two-layer neural nets with quadratic activation. \n\nWe assume $x\\sp{1},...,x\\sp{n} \\iid \\cN(0,I_{d \\times d})$ and $y\\sp{i} = f_{\\beta^\\star}(x\\sp{i})$, where the ground truth vector $\\beta^\\star$ is $r$-sparse (i.e. $\\|\\beta^\\star\\|_0 = r$). For simplicity, we assume $\\beta_i^\\star = \\mathbf{1} \\{i \\in S\\}$ for some $S \\subset [d]$ such that $|S| = r$. We again analyze the overparametrized setting, where this time $n \\ll d$ but also $n \\geq \\widetilde \\Omega(r^2)$.\n\nOur goal is to find a weight vector that minimizes our empirical loss function\n\\begin{equation}\n\\hatL(\\beta) \\defeq \\frac{1}{4n}\\sum_{i=1}^n \\left(y\\sp{i} - f_\\beta(x\\sp{i})\\right)^2, \\label{lec13:eqn:hadamard_model_1}\n\\end{equation}\nwhere $f_\\beta(x) \\defeq \\langle \\beta \\odot \\beta, x\\rangle$. The operation $\\odot$ denotes the Hadamard product: for $u,v \\in \\bbR^d$, $u \\odot v \\in \\bbR^d$ is defined by $(u \\odot v)_i \\defeq u_i v_i$ for $i = 1, \\dots, d$.\n\n\n\\subsec{Main results of algorithmic regularization}\nNote that while $f_\\beta$ is still linear over $x$, our loss is no longer convex over $\\beta$. (To see this, suppose $\\beta \\neq 0$ is a global minimizer. Then we have $\\hatL(0) > \\hatL(\\beta) = \\hatL(-\\beta)$.) Thus, the effect of algorithmic regularization induced by gradient descent will be much different from the overparametrized linear regression setting. \n\nIn the previous setting of linear regression, solutions with low $\\ell_2$ norm are desirable as they tend to generalize well. In the present setting, we know our ground-truth parameter $\\beta^\\star$ is sparse. Thus, we want to learn a sparse solution $\\hat \\beta$, avoiding non-sparse solutions that may not generalize well. 
One approach to finding sparse solutions, called \\textit{lasso regression}, is to minimize the $\\ell_1$-regularized proxy loss\n\\begin{equation}\n\\sum_{i=1}^n \\left(\\langle \\theta, x\\sp{i} \\rangle - y\\sp{i} \\right)^2 + \\lambda \\| \\theta \\|_1\n\\end{equation}\nwith respect to $\\theta$, where $\\theta = \\beta \\odot \\beta$. However, it turns out that we can equivalently learn a sparse solution by running gradient descent from a suitable initialization on the original \\textit{unregularized} loss.\n\nTo be specific, let $\\beta^0=\\alpha \\mathbf{1} \\in \\R^d$ be the initialization where $\\alpha$ is a small positive number. The update rule of gradient descent algorithm is given by $\\beta^{t+1}=\\beta^t-\\eta\\nabla \\hatL(\\beta^{t}).$ The next theorem shows that when $n=\\widetilde{\\Omega}(r^2)$, gradient descent on $\\hatL(\\beta)$ converges to $\\beta^\\star.$\n\n\\begin{theorem}\\label{lec13:thm:non-linear-main}\nLet $c$ be a sufficiently large universal constant. Suppose $n\\ge cr^2\\log^2(d)$ and $\\alpha\\le 1 / d^c$, then when $\\dfrac{\\log(d/\\alpha)}{\\eta}\\lesssim T\\lesssim \\dfrac{1}{\\eta\\sqrt{d\\alpha}},$ we have\n\\begin{equation}\\label{lec13:eqn:non-linear-main}\n    \\l\\|\\beta^{T}\\odot\\beta^{T}-\\beta^\\star\\odot\\beta^\\star \\r\\|_2^2\\le O \\l( \\alpha\\sqrt{d} \\r).\n\\end{equation}\n\n(Here, $T$ indexes the gradient descent steps.)\n\\end{theorem}\n\nWe make several remarks about Theorem~\\ref{lec13:thm:non-linear-main} before presenting the proof.\n\n\\begin{remark}\nIn this problem we do not use $\\beta^0=0$ as the initialization point because $\\beta=0$ is a critical point, that is, $\\nabla\\hatL(0)=0$. Note that the lower bound on $T$ depends logarithmically on $1/\\alpha$, so we can take $\\alpha$ to be a small inverse polynomial on $d$ and the lower bound won't change much. 
Also, the upper bound depends polynomially on $1/\\alpha$ (which is considered very big when $c$ is sufficiently large), so we do not need to use early stopping in a serious way.\n\\end{remark}\n\n\\begin{remark}\nTheorem~\\ref{lec13:thm:non-linear-main} is a simplified version of Theorem 1.1 in \\cite{li2018algorithmic}.\n\\end{remark}\n\n\\begin{remark}\n$\\hatL(\\beta)$ has many global minima. To see this, observe that the number of parameters is $d$ and the number of constraints to fit all the examples is $O(n)$ because there are only $n$ examples. Recall that for overparameterized model we have $d\\gg n$; consequently, there exists many global minima of $\\hatL(\\beta)$.\n\\end{remark}\n\n\\begin{remark}\n$\\beta^\\star$ is the min-norm solution in this case. That is,\n    \\begin{align}\\label{lec13:eqn:opt}\n        \\beta^\\star=\\argmin \\|\\beta\\|_2^2\\qquad \\text{s.t. }\\hatL(\\beta)=0.\n    \\end{align}\n    Informally, this is because we can view $\\beta\\odot \\beta$ as a vector $\\theta\\in \\R^{d}$, which leads to $\\|\\beta\\|_2^2 =\\|\\theta\\|_1.$ Then in the $\\theta$ space (and with a little abuse of notation), the optimization problem~\\eqref{lec13:eqn:opt} becomes\n    \\begin{align}\\label{lec13:eqn:opt-theta}\n        \\theta^\\star=\\argmin \\|\\theta\\|_1 \\qquad \\text{s.t. }\\hatL(\\theta)=0,\n    \\end{align}\n    which is a lasso regression, whose solution is sparse.\n\\end{remark}\n\n\\begin{remark}    \nIn this non-linear case and the linear case before, gradient descent with small initialization converges to minimum $\\ell_2$-norm solution. Similarly, in the NTK regime, gradient descent converges to a solution that is very close to the initialization. Therefore, it seems conceivable that GD generally prefers global minima nearest to the initialization. However, we do not have a general theorem for this phenomenon (and the instructor also believes that this is not universally true without other conditions). 
\n\\end{remark}\n\n\\subsec{Ground work for proof and the restricted isometry property}\\label{lec13:sec:rip}\n\nIn this section we prepare the ground work for the proof of Theorem~\\ref{lec13:thm:non-linear-main}.\n\nWe start by showing several basic properties about $\\hatL(\\beta)$. Note that for any fixed vector $v\\in\\R^{d}$ and $x\\in \\R^{d}$, when $x$ is drawn from $\\cN(0,I)$, we have\n\\begin{equation}\\label{lec13:eqn:gaussian-product}\n    \\Exp \\l[\\langle x, v\\rangle^2 \\r]=\\Exp \\l[ v^\\top xx^\\top v \\r]=v^\\top\\Exp \\l[ xx^\\top \\r]v=\\|v\\|_2^2.\n\\end{equation}\n\nIt follows that \n\\begin{align}\n    L(\\beta)&=\\frac{1}{4}\\Exp_{x\\sim \\cN(0,I)} \\l[(y-\\langle \\beta\\odot\\beta,x\\rangle^2 \\r] \\\\\n    &=\\frac{1}{4}\\Exp_{x\\sim \\cN(0,I)} \\l[\\langle \\beta^\\star\\odot\\beta^\\star-\\beta\\odot\\beta,x\\rangle^2 \\r] &\\text{(by definition of $y$)} \\\\\n    &=\\frac{1}{4} \\l\\| \\beta^\\star\\odot\\beta^\\star-\\beta\\odot\\beta \\r\\|_2^2.\\label{lec13:eqn:loss-form} &\\text{(by \\eqref{lec13:eqn:gaussian-product})}\n\\end{align}\nNote that \\eqref{lec13:eqn:loss-form} is the metric that we use to characterize how close $\\beta$ is to the ground-truch parameter $\\beta^\\star$ (see \\eqref{lec13:eqn:non-linear-main}).\n\nIn the following lemma we show that $\\hatL(\\beta) \\approx L(\\beta)$ by uniform convergence. Generally speaking, uniform convergence of the loss function for all $\\beta$ requires $n\\ge \\Omega(d)$ samples, so in our setting (where $n\\ll d$) $\\hatL(\\beta) \\approx L(\\beta)$ does not always hold. However, since we assume $\\beta^\\star$ is sparse, the analysis only requires uniform convergence for sparse vectors.\n\n\\begin{lemma}\\label{lec13:lem:RIP}\nAssume $n\\ge \\widetilde\\Omega(r^2)$. 
With high probability over the randomness in $x^{(1)},\\cdots,x^{(n)}$, $\\forall v$ such that $\\|v\\|_0\\le r$ we have\n\\begin{equation}\\label{lec13:eqn:RIP}\n(1-\\delta)\\|v\\|_2^2\\le \\frac{1}{n}\\sum_{i=1}^{n}\\langle v,x^{(i)}\\rangle^2\\le (1+\\delta)\\|v\\|_2^2.\n\\end{equation}\n\\end{lemma}\n\nLemma~\\ref{lec13:lem:RIP} is a special case of Lemma 2.2 in \\cite{li2018algorithmic} so the proof is omitted here. We say the set $\\l\\{ x^{(1)},\\cdots,x^{(n)} \\r\\}$ (or $X=[x^{(1)},\\cdots,x^{(n)}]$) satisfies $(r,\\delta)$\\textit{-RIP condition} (\\textit{restricted isometric property}) if \\eqref{lec13:eqn:RIP} holds.\n\nBy algebraic manipulation, \\eqref{lec13:eqn:RIP} is equivalent to \n\\begin{align}\\label{lec13:eqn:RIP-2}\n(1-\\delta)\\|v\\|_2^2\\le v^\\top \\left(\\frac{1}{n}\\sum_{i=1}^{n}x^{(i)}(x^{(i)})^\\top\\right)v\\le (1+\\delta)\\|v\\|_2^2.\n\\end{align}\nIn other words, from the point of view of a sparse vector $v$ we have $\\sum_{i=1}^{n}x^{(i)}(x^{(i)})^\\top\\approx I$. (Note however that $\\sum_{i=1}^{n}x^{(i)}(x^{(i)})^\\top$ is not close to $I_{d\\times d}$ in other notions of closeness. For example, $\\sum_{i=1}^{n}x^{(i)}(x^{(i)})^\\top$ is not close to $I_{d\\times d}$ in spectral norm. Another way to see this is that $\\sum_{i=1}^{n}x^{(i)}(x^{(i)})^\\top$ is a $d \\times d$ matrix but only has rank $n \\ll d$.)\n\nAs a result, with the RIP condition we have $\\hatL(\\beta)\\approx L(\\beta)$ if $\\beta$ is sparse. With more tools we can also get $\\nabla \\hatL(\\beta)\\approx \\nabla L(\\beta)$. Let us define the set $S_r=\\{\\beta:\\|\\beta\\|_0\\le O(r)\\}$, the set where we have uniform convergence of $\\hatL$ and $\\nabla \\hatL$. Informally, as long as we are in the set $S_r$, $\\hatL$ and $\\nabla\\hatL$ have similar behavior to their population counterparts. 
(Note, on the other hand, that there exists a dense $\\beta\\not\\in S_r$ such that $\\hatL(\\beta)=0$ but $L(\\beta)\\gg 0.$)\n\nThe RIP condition also gives us the following lemma which will be needed for the proof of Theorem \\ref{lec13:thm:non-linear-main}.\n\n\\begin{lemma}\\label{lec14:lem:rip}\n    Suppose $x^{(1)}, x^{(2)}, \\dots x^{(n)}$ satisfy the $(r, \\delta)$-RIP condition. Then, $\\forall v, w$ such that $\\Norm{v}_{0} \\leq r$ and $\\Norm{w}_{0} \\leq r$, we have that\n    \\begin{align}\n        \\left| \\frac{1}{n} \\sum_{i=1}^{n} \\langle x^{(i)}, v \\rangle \\langle x^{(i)}, w \\rangle  - \\langle v, w \\rangle \\right| &= \\left|  v^{T} \\l(\\frac{1}{n} \\sum_{i=1}^{n}  x^{(i)} (x^{(i)})^\\top \\r)  w  - \\langle v, w \\rangle \\right| \\\\\n        &\\leq 4 \\delta \\Norm{v}_{2} \\cdot \\Norm{w}_{2}.\n    \\end{align} \n\\end{lemma}\n\n\\tnotelong{To add proof of this lemma in the future.}\n\\begin{corollary}\\label{lec14:cor:rip}\n    Taking $w = e_1, \\dots, e_d$ in Lemma~\\ref{lec14:lem:rip}, we can conclude that\n    \\begin{align}\n        \\Norm{ \\frac{1}{n} \\sum_{i=1}^n \\langle x^{(i)}, v\\rangle x^{(i)} - v }_\\infty &= \\Norm{ \\l(\\frac{1}{n} \\sum_{i=1}^n x^{(i)}(x^{(i)})^\\top \\r)v - v }_\\infty \\\\\n        &\\leq 4\\delta \\Norm{v}_2.\n    \\end{align}\n\\end{corollary}\n\n\\subsec{Warm-up for analysis: Gradient descent on population loss}\n\nThe main intuition for proving Theorem~\\ref{lec13:thm:non-linear-main} is to leverage the uniform convergence when $\\beta$ belongs to the set $S_r$ (see Figure~\\ref{lec13:fig:uc-sr}). Note that the initialization $\\beta^0$ is not exactly $r$-sparse, but taking $\\alpha$ to be sufficiently small, $\\beta^0$ is approximately $0$-sparse. 
The proof is decomposed into the following steps:\n\n\\begin{enumerate}\n    \\item Gradient descent on $L(\\beta)$ converges to $\\beta^\\star$ without leaving $S_r$, and\n    \\item Gradient descent on $\\hatL(\\beta)$ is similar to gradient descent on $L(\\beta)$ inside $S_r$.\n\\end{enumerate}\n\nCombining the two steps we can show that gradient descent on $\\hatL(\\beta)$ does not leave $S_r$ and converges to $\\beta^\\star.$\n\n\\begin{figure}\n\\centering\n\\includegraphics[width=.7\\linewidth]{figures/uc-sr.png}\n\\caption{Visualization of proof intuition for Theorem~\\ref{lec13:thm:non-linear-main}.}\n\\label{lec13:fig:uc-sr}\n\\end{figure}\n\nAs a warm up, we prove the following theorem for gradient descent on $L(\\beta).$\n\\begin{theorem}\nFor sufficiently small $\\eta$, gradient descent on $L(\\beta)$ converges to $\\beta^\\star$ in $\\Theta\\left(\\dfrac{\\log (1/ (\\epsilon\\alpha) )}{\\eta}\\right)$ iteration with $\\epsilon$-error in $\\ell_2$-distance.\n\\end{theorem}\n\n\\begin{proof}\n\nSince\n\\begin{equation}\n\\nabla L(\\beta) = (\\beta\\odot \\beta-\\beta^\\star\\odot\\beta^\\star)\\odot\\beta,\n\\end{equation}\n\nthe gradient descent step is\n\\begin{equation}\n\\beta^{t+1} = \\beta^t - \\eta (\\beta^t \\odot \\beta^t -\\beta^\\star \\odot \\beta^\\star)\\odot\\beta^t.\n\\end{equation}\n\nRecall that $\\beta^\\star=\\mathbf{1} \\{i \\in S \\}$ and $\\beta^0=\\alpha \\mathbf{1}$, and the update rule above decouples across the coordinates of $\\beta^t$. 
Thus, we only need to show that $| \\beta_i^\\star - \\beta_i^t | \\leq \\epsilon$ for the number of iterations stated in the Theorem.\n\n\\underline{Case 1: $i\\in S$.} For $i \\in S$, the update rule for coordinate $i$ is\n\\begin{align}\n\\beta_i^{t+1} &= \\beta_i^t - \\eta (\\beta_i^t \\cdot \\beta_i^t - 1 \\cdot 1) \\cdot\\beta_i^t \\\\ \n&= \\beta_i^t - \\eta \\l[ \\left(\\beta_i^t\\right)^2 - 1 \\r] \\beta_i^t.\n\\end{align}\n\nConsider the following two cases:\n\n\\begin{itemize}\n\\item If $\\beta_i^t\\le 1/2$, we have\n\\begin{align}\n\\beta_i^{t+1}&=\\beta_i^{t} \\l[ 1+\\eta \\l(1- \\l(\\beta_i^t \\r)^2 \\r) \\r] \\\\\n&\\ge \\beta_i^t \\l( 1+\\frac{3}{4}\\eta \\r).\n\\end{align}\n\nConsequently, $\\beta_i^{t+1}$ grows exponentially, and it takes $\\Theta\\left(\\dfrac{\\log (1/\\alpha)}{\\eta}\\right)$ iterations for $\\beta_i^t$ to grow from $\\alpha$ to at least $1/2.$\\footnote{This is because $(1+\\eta)^{1/\\eta}\\approx e$, so $(1+\\eta)^{c/\\eta}\\approx e^{c}.$} This will bring us into the second case.\n    \n\\item if $\\beta_i^t\\ge 1/2$, we have\n\\begin{align}\n1-\\beta_i^{t+1}&=1-\\beta_i^t+\\eta \\l[ \\l(\\beta_i^t \\r)^2-1 \\r] \\beta_i^t\\\\\n&=1-\\beta_i^{t}-\\eta \\l( 1-\\beta_i^t \\r) \\l(1+\\beta_i^t \\r)\\beta_i^t\\\\\n&\\le 1-\\beta_i^t-\\eta \\l( 1-\\beta_i^t \\r)\\beta_i^t &\\text{(because $1+\\beta_i^t\\ge 1$)} \\\\\n&= \\l(1-\\beta_i^t \\r) \\l( 1-\\eta \\beta_i^t \\r) \\\\\n&\\le \\l( 1-\\beta_i^t \\r) \\l(1-\\eta/2 \\r). &\\text{(because $\\beta_i^t\\ge 1/2$)}\n\\end{align}\n\nTherefore it takes $\\Theta\\left(\\dfrac{\\log (1/\\epsilon)}{\\eta}\\right)$ iterations to achieve $1-\\beta_i^t\\le \\epsilon.$\n\\end{itemize}\n\n\\underline{Case 2: $i \\notin S$.} For all $i \\notin S$, we claim (informally) that it is sufficient to show that when $t \\leq 1 / (10 \\eta \\alpha^{2})$, $\\beta_{i}^{t} \\leq 2\\alpha$. 
This is because when $i \\notin S$, $\\beta_{i}$ stays small and will take many iterations before it even gets to $2\\alpha$, which is close to $0$ since $\\alpha$ is chosen to be small.\n\nFor a coordinate $i\\notin S$, the gradient descent update for this problem becomes\n\\begin{align}\n    \\beta_i^{t+1} &= \\left[ \\beta^{t} - \\eta (\\beta^{t} \\odot \\beta^{t} - \\beta^\\star \\odot \\beta^\\star) \\odot \\beta^{t} \\right]_i \\\\\n    &= \\beta_i^{t} - \\eta (\\beta_i^{t} \\cdot \\beta_i^{t}) \\cdot \\beta_i^{t} & (\\text{since } \\beta_{i}^\\star = 0 \\ \\forall i \\notin S) \\\\\n    &= \\beta_i^{t} - \\eta (\\beta_i^{t})^{3}.\n\\end{align}\n\nSince our initialization $\\beta^{0}$ was small, the update to these coordinates will be even smaller because $(\\beta_{i}^{t})^{3}$ is small. We can prove the desired claim using strong induction. Suppose $\\beta_{i}^{s} \\leq 2\\alpha$ for all $s \\leq t$ and $i \\notin S$, and that $t+1 \\leq 1 / (10\\eta \\alpha^{2})$. Then, for all $s \\leq t$,\n\\begin{align}\n\\beta_{i}^{s+1} %&= \\beta^{s}_{i} - \\eta (\\beta_{i}^{s})^{3} \\\\\n    &= (1 - \\eta (\\beta_{i}^{s})^{2})\\beta_{i}^{s} \\\\\n    &\\leq (1 + \\eta (\\beta_{i}^{s})^{2}) \\beta_{i}^{s} \\\\\n    &\\leq (1 + 4\\eta \\alpha^{2}) \\beta_{i}^{s}. & (\\text{since } \\beta_{i}^{s} \\leq 2\\alpha)\n\\end{align}\n\nWith strong induction, we can repeatedly apply this gradient update starting from $t=0$ to obtain\n\\begin{align}\n    \\beta_{i}^{t+1} &\\leq \\beta_{0} \\cdot (1 + 4 \\eta \\alpha^{2})^t \\\\\n    &\\leq \\beta_{0} ( 1 + 4 \\eta \\alpha^{2})^{\\frac{1}{10 \\eta \\alpha^{2} }} \\\\\n    &\\leq \\beta_{0} \\exp \\bigg(\\frac{4\\eta \\alpha^{2}}{10 \\eta \\alpha^{2}} \\bigg) \\\\\n    &=  \\beta_{0} \\cdot e^{2/5} \\\\\n    &\\leq 2 \\alpha,\n \\end{align}\n which completes the inductive proof of the claim.\n\n\\end{proof}"
  },
  {
    "path": "tex/collection/08-02-algorithmic.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{14}{Roshni Sahoo and Sarah Wu}{Mar 3rd, 2021}\n\n\\subsec{Proof of main result: gradient descent on empirical loss}\n\nAnalyzing gradient descent on the empirical risk $\\empL$ is more complicated than analyzing gradient descent on the population risk, so we focus on the case when $\\beta^\\star$ is $1$-sparse, i.e. $r=1$. (When $r>1$, the main idea is the same but requires some more advanced analysis techniques.) \n\nNote that in our setup, i.e. when $x^{(1)} \\ldots x^{(n)} \\iid \\mathcal{N}(0, I_{d\\times d})$ and when $n\\geq \\widetilde{\\Omega}(r/\\delta^2)$, with high probability the data satisfy the $(r, \\delta)$-RIP condition. It follows that when $r=1$ and $\\delta = \\tilO(1/\\sqrt{n})$, the data are $(1, \\delta)$-RIP. This will allow us to use the lemmas involving the RIP condition for the proof.\n\nWe restate the case of $r=1$ in the following theorem.\n \n\\begin{theorem} \\label{lec14:thm:main}\nSuppose $\\eta \\geq \\widetilde{\\Omega}(1).$ Then, gradient descent on $\\empL$ with $t = \\Theta \\l(\\frac{\\log (1/(\\alpha\\delta))}{\\eta}\\r)$ steps satisfies \n\\begin{equation}\n\\Norm{\\beta^{t} \\odot \\beta^{t} - \\beta^\\star \\odot \\beta^\\star}_{2}^{2} \\leq \\tilO\\l(\\frac{1}{\\sqrt{n}}\\r).\n\\end{equation} \n\\end{theorem}\n\n\\begin{remark}\nNote that Theorem~\\ref{lec14:thm:main} is a slightly weaker version of Theorem~\\ref{lec13:thm:non-linear-main} for $r=1$, since the bound on the RHS depends on the number of examples and not the initialization $\\alpha$. 
In Theorem~\\ref{lec13:thm:non-linear-main}, we could take $\\alpha$ as small as we like to drive the bound to zero; we cannot do this for Theorem~\\ref{lec14:thm:main}.\n\\end{remark}\n\nWe proceed to prove Theorem~\\ref{lec14:thm:main} with the follow steps:\n\\begin{enumerate}\n\\item Computing the gradient update $\\nabla \\empL$,\n\\item Dynamics analysis of noise $\\zeta_t$, \n\\item Dynamics analysis of signal $r_t$, and\n\\item Putting it all together.\n\\end{enumerate}\n\n\\underline{Computing the gradient update $\\nabla \\empL$}\n\nWLOG, assume that $\\beta^\\star = e_{1}.$ We can decompose the gradient descent iterate $\\beta^{t}$ as\n\\begin{equation}\n    \\beta^{t} = r_{t} \\cdot e_{1} + \\zeta_{t},\n\\end{equation}\nwhere $\\zeta_t \\perp e_1$. The idea is to prove convergence to $\\beta^\\star$ by showing that (i) $r_{t} \\rightarrow 1$ as $t \\rightarrow \\infty$, and (ii) $\\norm{\\zeta_{t}}_{\\infty} \\leq O(\\alpha)$ for $t \\leq \\tilO\\big(1/\\eta).$ In other words, the \\textit{signal} $r_{t}$ converges quickly to $1$ while the \\textit{noise} $\\zeta_t$ remains small for some number of initial iterations. One may be concerned that it is possible for the noise to amplify after many iterations, but we will not have to worry about this scenario if we can guarantee that $\\beta^{t}$ converges to $\\beta^\\star$ first.\n\nWe can compute the gradient of $\\empLt$ as follows. 
Since $y\\sp{i} = \\langle \\beta^\\star \\odot \\beta^\\star, x\\sp{i} \\rangle$ and $\\beta^{t} = r_{t}e_{1} + \\zeta_{t} = r_{t}\\beta^\\star + \\zeta_{t}$,\n\\begin{align}\n    \\nabla \\empLt &= \\frac{1}{n} \\sum_{i=1}^{n} (\\langle \\beta^{t} \\odot \\beta^{t}, x\\sp{i} \\rangle - y\\sp{i} ) x\\sp{i} \\odot \\beta^{t} \\\\\n    &= \\frac{1}{n} \\sum_{i=1}^{n} ( \\langle \\beta^{t} \\odot \\beta^{t} - \\beta^\\star \\odot \\beta^\\star, x\\sp{i} \\rangle ) x\\sp{i} \\odot \\beta^{t} \\\\\n    &= \\frac{1}{n} \\sum_{i=1}^{n} \\langle r_{t}^{2} \\beta^\\star \\odot \\beta^\\star + \\zeta_{t} \\odot \\zeta_{t} - \\beta^\\star \\odot \\beta^\\star, x\\sp{i}  \\rangle x\\sp{i} \\odot \\beta^{t} \\\\\n    &= \\underbrace{\\frac{1}{n} \\sum_{i=1}^{n} \\Big\\langle \\big(r_{t}^{2} - 1\\big) \\beta^\\star \\odot \\beta^\\star + \\zeta_{t} \\odot \\zeta_{t}, x\\sp{i}  \\Big\\rangle x\\sp{i}}_{m_t} \\odot \\beta^{t}.\n\\end{align}\n\nTo simplify the analysis, we can rearrange some of the terms that are part of the gradient. 
Define $m_{t} $ such that $\\nabla \\empLt = m_{t} \\odot \\beta^{t}.$ Also, let $X = \\frac{1}{n} \\sum_{i=1}^{n} x\\sp{i}(x\\sp{i})^\\top.$ Then,\n\\begin{align}\n    m_{t} &= \\frac{1}{n} \\sum_{i=1}^{n} \\Big\\langle \\big(r_{t}^{2} - 1\\big) \\beta^\\star \\odot \\beta^\\star + \\zeta_{t} \\odot \\zeta_{t}, \\ x\\sp{i}  \\Big\\rangle \\ x\\sp{i} \\\\\n    &= \\l( \\frac{1}{n} \\sum_{i=1}^{n} x\\sp{i}\\big(x\\sp{i}\\big)^\\top \\r) \\l(r_{t}^{2} - 1\\r) \\cdot \\l(\\beta^\\star \\odot \\beta^\\star\\r) + \\l( \\frac{1}{n} \\sum_{i=1}^{n} x\\sp{i}(x\\sp{i})^\\top \\r) \\l(\\zeta_{t} \\odot \\zeta_{t}\\r) \\\\\n    &= \\underbrace{X \\big(r_{t}^{2} - 1\\big) \\cdot \\big(\\beta^\\star \\odot \\beta^\\star\\big)}_{\\text{part of } u_t} + \\underbrace{X \\big(\\zeta_{t} \\odot \\zeta_{t}\\big)}_{v_t}.\n\\end{align}\n\nNow, define $u_{t} \\defeq (r_{t}^{2} - 1) (\\beta^\\star \\odot \\beta^\\star) - X (r_{t}^{2} - 1) (\\beta^\\star \\odot \\beta^\\star)$ and $v_{t} \\defeq X \\big(\\zeta_{t} \\odot \\zeta_{t}\\big)$. Then we can rewrite the gradient as\n\n\\begin{equation}\n    \\nabla \\empLt = m_{t} \\odot \\beta^{t} = [(r_{t}^{2} -1) \\beta^\\star \\odot \\beta^\\star - u_{t} + v_{t}] \\odot \\beta^{t}. \\label{lec14:eqn:emp-gradient}\n\\end{equation}\n\nOur goal is to show that both $u_t$ and $v_t$ are small, so that $\\nabla \\empLt$ is close to its population version $\\nabla L(\\beta^t)$. Observe that $X$ appears in both $u_{t}$ and $v_{t}$. This matrix is challenging to deal with mathematically because it does not have full rank (because $n < d$). Instead, we rely on the RIP condition to reason about the behavior of $X$: the idea is that $X$ behaves like the identity for sparse vector multiplication. 
Applying Corollary~\\ref{lec14:cor:rip}, we can bound $\\Norm{u_{t}}_{\\infty}$ as\n\\begin{equation} \\label{lec14:eqn:u-inf-norm}\n    \\Norm{u_{t}}_{\\infty} \\leq 4\\delta \\Norm{(r_t^2 - 1)  \\beta^\\star \\odot \\beta^\\star }_{2} \n    \\leq 4\\delta ||\\beta^\\star \\odot \\beta^\\star||_{2} \\leq 4\\delta.\n\\end{equation}\n\n(In the second inequality, we assume that $|r_t| < 1$. We can do this because $r_t$ starts out at $\\alpha$ which is small; if $r_t \\geq 1$, then we are already in the regime where gradient descent has converged.) We can bound $\\Norm{v_{t}}_{\\infty}$ in a similar manner: since Corollary~\\ref{lec14:cor:rip} implies $\\Norm{v_t - \\zeta_t \\odot \\zeta_t}_\\infty \\leq 4\\delta \\Norm{\\zeta_{t} \\odot \\zeta_{t}}_{2}$,\n\\begin{align}\n    \\Norm{v_{t}}_{\\infty} &\\leq \\Norm{\\zeta_{t} \\odot \\zeta_{t}}_{\\infty} + 4\\delta \\Norm{\\zeta_{t} \\odot \\zeta_{t}}_{2} &(\\text{by the triangle inequality}) \\\\\n    &\\leq \\Norm{\\zeta_{t}}_{\\infty}^{2} + 4\\delta \\Norm{\\zeta_{t} \\odot \\zeta_{t}}_{1} &(\\text{since } \\zeta_t \\text{ very small}) \\\\\n    &= \\Norm{\\zeta_{t}}_{\\infty}^{2} + 4\\delta \\Norm{\\zeta_{t}}_{2}^{2}. \\label{lec14:eqn:v-inf-norm}\n\\end{align}\n\nNote that the size of $v_t$ depends on the size of the noise $\\zeta_t$. Thus, by bounding $\\zeta_t$ (e.g. with a small initialization), we can ensure that $v_t$ is also small. (Ensuring bounds on $u_t$ is more difficult because it depends only on $\\delta$.) 
In the next two subsections, we analyze the growth of $\\zeta_t$ and $r_t$.\n\n\\underline{Dynamics analysis of $\\zeta_t$}\n\nFirst, we analyze the dynamics of the noise $\\zeta_t$, which we want to ensure does not grow too fast.\n\n\\begin{lemma} \\label{lec14:lem:dynamics_noise}\n    For all $t\\leq 1 / (c\\eta\\delta)$ with sufficiently large constant $c$, we have\n    \\begin{equation} \\label{lec14:eqn:dynamics_noise}\n        \\Norm{\\zeta_t}_\\infty \\leq 2\\alpha, \\quad \\quad \\Norm{\\zeta_t}_2^2 \\leq \\frac{1}{2}, \\quad \\quad \\text{and} \\quad \\Norm{\\zeta_{t+1}}_\\infty \\leq \\big(1 + O(\\eta\\delta)\\big) \\Norm{\\zeta_t}_\\infty.\n    \\end{equation}\n\\end{lemma}\nNote that this result is weaker than what we were able to show for the population gradient (exponential growth with a small fixed rate), but we will ultimately show that the growth of the signal will be even faster.\n\n\\begin{proof}\nRecall that the empirical gradient \\eqref{lec14:eqn:emp-gradient} is $\\nabla \\hat{L}(\\beta) = \\big[(r_{t}^{2} - 1) \\beta^\\star \\odot \\beta^\\star - u_{t} + v_{t} \\big] \\odot \\beta^{t}$. Hence, the gradient update to $\\beta^{t}$ is\n\n\\begin{align}\n\\beta^{t+1} &= \\beta^{t} - \\eta \\l[\\l(r_{t}^{2} - 1\\r) \\beta^\\star \\odot \\beta^\\star - u_{t} + v_{t} \\r] \\odot \\beta^{t} \\\\\n&= \\underbrace{\\beta^{t} - \\eta \\l(r_{t}^{2} - 1\\r) \\beta^\\star  \\odot \\beta^\\star \\odot \\beta^{t}}_{\\text{GD update for population loss}} - \\eta \\l(- u_{t} + v_{t}\\r) \\odot \\beta^{t}. \\label{lec14:eqn:gd-update}\n\\end{align}\n    \nRecall that $\\zeta_{t+1}$ is simply $\\beta^{t+1}$ except for the first coordinate (where it has a zero instead of $r_{t+1}$), i.e. $\\zeta_{t+1}$ is the projection of $\\beta^{t+1}$ onto the subspace orthogonal to $e_1$. 
Hence,\n\\begin{align}\n\\zeta_{t+1} &= \\l(I - e_{1} e_{1}^\\top\\r) \\beta^{t+1} \\\\\n&= \\l(I - e_{1} e_{1}^\\top\\r) \\cdot \\beta^{t} - \\eta \\l(I - e_{1} e_{1}^\\top\\r) (v_{t} - u_{t}) \\odot \\beta^{t} &\\text{(by \\eqref{lec14:eqn:gd-update}, second term = 0)} \\\\\n&= \\zeta_{t} - \\eta \\l[\\l(I - e_{1}e_{1}^{T}\\r) (v_{t} - u_{t}) \\odot \\l(I - e_{1}e_{1}^{T}\\r) \\beta^{t}\\r] &(\\text{by distribution law for $\\odot$}) \\\\\n&= \\zeta_t - \\eta \\underbrace{\\l[ \\l(I - e_{1}e_{1}^{T}\\r) \\l(v_t - u_t\\r)\\r]}_{\\rho_t} \\odot \\zeta_t.\n\\end{align}\n    \nIf we define $\\rho_t$ such that $\\zeta_{t+1} = \\zeta_t - \\eta \\rho_t \\odot \\zeta_t$, then the growth of $\\zeta_t$ is dictated by the size of $\\rho_t$. We can bound this as\n\\begin{equation}\n\\Norm{\\zeta_{t+1}}_{\\infty} \\leq (1 + \\eta \\Norm{\\rho_{t}}_{\\infty}) \\Norm{\\zeta_{t}}_{\\infty}. \\label{lec14:eqn:zeta-growth-bd}\n\\end{equation}\n\nNow, we will prove the lemma by using strong induction on $t$. Suppose that the first two pieces of \\eqref{lec14:eqn:dynamics_noise} hold for all iterations up to $t$. We can show that\n\\begin{align}\n\\Norm{\\rho_{t}}_{\\infty} &\\leq \\Norm{u_{t}}_{\\infty} + \\Norm{v_{t}}_{\\infty}  \\\\\n&\\leq 4\\delta + \\Norm{\\zeta_t}_{\\infty}^{2} + 4\\delta \\Norm{\\zeta_t}_{2}^{2} &(\\text{by \\eqref{lec14:eqn:u-inf-norm} and \\eqref{lec14:eqn:v-inf-norm}}) \\\\\n&\\leq  4\\delta + (2\\alpha)^2 + 4\\delta \\cdot \\frac{1}{2} &(\\text{by the inductive hypothesis})\\\\\n\\label{lec14:eqn:diff-inf-norm}\n&\\leq 8\\delta,\n\\end{align}\nwhere the last step holds because we can take $\\alpha$ to be arbitrarily small (e.g. $\\alpha \\leq \\text{poly}(1/n) \\leq O(\\delta)$). 
Plugging this into \\eqref{lec14:eqn:zeta-growth-bd}, we have\n\\begin{equation}\n\\Norm{\\zeta_{t+1}}_\\infty \\leq (1 + 8\\eta \\delta) \\Norm{\\zeta_t}_{\\infty} = \\big(1 + O(\\eta\\delta)\\big) \\Norm{\\zeta_t}_\\infty,\n\\end{equation}\nwhich proves the third piece of the lemma. Using this piece, we can show that\n\\begin{equation}\n\\Norm{\\zeta_{t+1}}_{\\infty} \\leq \\l(1 + 8 \\eta \\delta\\r)^{t+1} \\Norm{\\zeta_{0}}_{\\infty} \\leq \\big(1 + 8\\eta \\delta\\big) ^{1/(c\\eta \\delta)} \\cdot \\alpha  \\leq 2\\alpha\n\\end{equation}\nfor a sufficiently large constant $c$, which proves the first piece. Finally, we show that\n\\begin{equation}\n\\Norm{\\zeta_{t+1}}_{2}^2 \\leq \\big(1 + 8\\eta \\delta\\big)^{2(t+1)}\\Norm{\\zeta_{0}}_{2}^2 \\leq \\big(1 + 8\\eta \\delta\\big)^{2/(c\\eta \\delta)} \\cdot \\alpha^2 d \\leq \\frac{1}{2},\n\\end{equation}\nif $\\alpha \\leq \\frac{1}{n^{O(1)}}$, which proves the second piece.\n\n\\end{proof}\n\n\\underline{Dynamics analysis of $r_t$}\n\nNext, we analyze the dynamics of the signal $r_t$, which we want to show converges to 1.\n\n\\begin{lemma} \\label{lec14:lem:dynamics_signal}\n    For all $t\\leq 1 / (c\\eta\\delta)$ with sufficiently large constant $c$, we have that\n    \\[ r_{t+1} = \\big(1 + \\eta\\big( 1 - r_t^2 \\big) \\big) r_t + O\\big(\\eta\\delta\\big) r_t. 
\\]\n\\end{lemma}\nNote that the first term on the RHS is $r_{t+1}$ during gradient descent on the population loss, and the second term captures the error.\n\n\\begin{proof}\n    Recall that the gradient descent update from the empirical gradient~\\eqref{lec14:eqn:emp-gradient} is\n    \\begin{equation}\n        \\beta^{t+1} = \\beta^t - \\eta \\big[\\big(r_{t}^{2} -1\\big) \\beta^\\star \\odot \\beta^\\star - u_{t} + v_{t}\\big] \\odot \\beta_{t}.\n    \\end{equation} \n    We have that\n    \\begin{align}\n        r_{t+1} &= \\big\\langle \\beta^{t+1}, e_1\\big\\rangle \\\\\n        &= \\big\\langle \\beta^t, e_1\\big\\rangle - \\eta \\big(r_{t}^{2} -1\\big)\\big\\langle \\beta^t, e_1\\big\\rangle - \\eta \\big\\langle v_t-u_t, e_1\\big\\rangle \\big\\langle \\beta^t, e_1 \\big\\rangle \\\\\n        &= r_t - \\eta \\big(r_{t}^{2} -1\\big) r_t - \\eta \\big\\langle v_t-u_t, e_1\\big\\rangle r_t \\\\\n        &= \\Big(1 + \\eta\\big( 1 - r_t^2 \\big) \\Big) r_t + \\eta \\big\\langle u_t-v_t, e_1\\big\\rangle r_t\n    \\end{align}\n    so all we need to do is bound the second term as follows:\n    \\begin{align}\n        |\\eta \\langle v_t - u_t, e_1\\rangle r_t| &\\leq \\eta \\cdot r_t \\Norm{v_t-u_t}_\\infty \\\\\n        &\\leq \\eta \\cdot r_t \\cdot 8\\delta &(\\text{by \\eqref{lec14:eqn:diff-inf-norm}}) \\\\\n        &= O(\\eta\\delta) \\cdot r_t.\n    \\end{align}\n\\end{proof}\n\n\\underline{Putting it all together}\nFinally, we return to the proof of Theorem~\\ref{lec14:thm:main}. By Lemma~\\ref{lec14:lem:dynamics_signal}, we know that as long as $r_t \\leq 1/2$ it will grow exponentially fast, since\n\\begin{equation}\n    r_{t+1} \\geq \\Big(1 + \\eta\\big(1-r_t^2\\big) - O(\\eta\\delta) \\Big) \\cdot r_t \\geq \\bigg(1 + \\frac{\\eta}{2}\\bigg)\\cdot r_t.\n\\end{equation} \nThis implies that at some $t_0 = O\\Big(\\frac{\\log (1/\\alpha)}{\\eta}\\Big)$, we'll observe $r_{t_0} > 1/2$ for the first time. 
Consider what happens after this point.\n\n\\begin{itemize}\n    \\item When $1/2 < r_t \\leq 1$, we have that\n    \\begin{align}\n        1 - r_{t+1} &\\leq 1 - r_t - \\eta \\big(1 - r_t^2\\big) r_t + O(\\eta\\delta) \\cdot r_t \\\\\n        &\\leq 1 - r_t - \\frac{\\eta \\big(1 - r_t^2\\big)}{2} + O(\\eta\\delta) \\\\\n        &\\leq 1 - r_t - \\frac{\\eta \\big(1 - r_t\\big)}{2} + O(\\eta\\delta) \\\\\n        &= \\bigg(1 - \\frac{\\eta}{2}\\bigg) (1 - r_t) + O(\\eta\\delta).\n    \\end{align}\n    Thus, we can achieve $1 - r_{t+1} \\leq 2 \\cdot \\frac{O(\\eta\\delta)}{\\eta/2} = O(\\delta)$ in $\\Theta\\Big(\\frac{\\log(1/\\delta)}{\\eta}\\Big)$ iterations.\n    \n    \\item When $r_t > 1$, we can show in a similar manner that\n    \\begin{equation}\n        r_{t+1} - 1 \\leq (1 - \\eta) (r_t - 1) + O(\\eta\\delta) \\leq O(\\delta),\n    \\end{equation} \n    implying that $r_t$ remains very close to 1 after the same order of iterations.\n\\end{itemize}\n\nThis completes the proof of Theorem~\\ref{lec14:thm:main}, bounding the number of iterations needed for gradient descent on the empirical loss to converge to $\\beta^\\star$.\n\\qed \n"
  },
  {
    "path": "tex/collection/08-03-algorithmic-new.tex",
    "content": "\\metadata{16}{Leah Reeder and Trevor Maxfield}{Nov 10th, 2021}\n\n\\sec{From small to large initialization: a precise characterization}\n\nWe have previously discussed how certain initializations of gradient descent converge to minimum-norm solutions. In the sequel, we characterize the effect of initialization more precisely---we will show that in a variant of the settings in Section~\\ref{sec:small_init_nonlinear}, we can precisely compute the corresponding regularizer induced by any initialization. We will see that when the initialization is small, we obtain the bias towards minimum norm solution (in the parameter space used in optimization), whereas when the initialization is large, we are in the NTK regime (Section~\\ref{sec:ntk_approach}) where the implicit bias is towards the min norm solution under the NTK kernel. The materials in this subsection are simplifications of results in the recent paper~\\citet{woodworth2020kernel}.\n\n\\subsection{Preparation: gradient flow}\nTo simplify the analysis, we will consider the concept of gradient flow, i.e. gradient descent with an infinitesimal learning rate.  This allows us omit the second order effect from the learning rate and simplify the analysis. \n\nWe begin by recalling the gradient descent update formula. In our previous description of gradient descent, we indexed the updated parameters by $t = 1,2,\\dots$. Anticipating our generalization to infinitesimal steps, we will index the updated parameters using parentheses instead of subscripts. 
In particular, the standard gradient descent update given a loss function $L(w)$ is\n\\al{\nw(t+1) = w(t) - \\eta \\nabla L(w(t)).\n}\nIf we scale the time by $\\eta$ so that each update by gradient descent corresponds to a time step of size $\\eta$ (rather than size 1), the update becomes\n\\al{\nw(t + \\eta) = w(t) - \\eta \\nabla L(w(t)).\n}\nTaking $\\eta \\to 0$ yields a differential equation, which can be thought of as a continuous process rather than discrete updates:\n\\al{\nw(t+dt) = w(t) - dt \\cdot \\nabla L(w(t)).\n}\nThis can also be written as:\n\\al{\n\\dot{w}(t) = -\\nabla L(w(t)) \\quad \\text{ with } \\quad \\dot{w}(t) = \\frac{\\partial w(t)}{\\partial t}\n}\nThis allows us to ignore the $\\eta^2$ term (alternatively the $(dt^2)$ term), which will simplify some of the technical details that follow.\n\n\\subsec{Characterizing the implicit bias of initialization}\nThe results in this section are a slight simplification of the recent paper by~\\citet{woodworth2020kernel}. The model is a variant of the one we introduced in \\eqref{lec13:eqn:hadamard_model_1}. Recalling that $x^{\\odot 2} = x \\odot x$, let\n\\al{\nf_w(x) = \\left(w_+^{\\odot 2} - w_-^{\\odot 2}\\right)^\\top x.\n}\nwhere $w_+, w_- \\in \\R^d$. Let $w$ denote the concatenation of the two parameter vectors, i.e. $w = (w_+, w_-)$.  In \\eqref{lec13:eqn:hadamard_model_1}, we defined $f_\\beta(x) = (\\beta \\odot \\beta)^\\top x$; this model can only represent positive linear combinations of $x$.  By contrast, $f_w(x)$ can represent any linear model. Moreover, if we choose our initialization for $w$ such that $w_+(0) = w_-(0)$, we obtain $f_{w(0)}(x) \\equiv 0$ for all $x$. 
Similar to our analysis of the NTK, this initialization will simplify the subsequent derivations.\n\nNext, we define the following loss function,\n\\al{\n\\hatL(w) = \\frac{1}{2} \\sum_{i=1}^n \\left( y\\sp{i} - f_w(x\\sp{i})\\right)^2,\n}\nand consider the initialization\n\\al{\nw_+(0) = w_-(0) = \\alpha \\cdot \\vec{\\mathbf{1}}\n}\nwhere $\\vec{\\mathbf{1}}$ denotes the all-ones vector. The analysis technique still applies to any general initializations as long as all the dimensions are initialized to be non-zero, but the initialization scale is the most important factor, and therefore we chose this simplification for the ease of exposition. \n\nNote that every $w = (w_+, w_{-})$ corresponds to a de facto linear function of $x$. We denote the resulting linear model as $\\theta_w$:\n\\al{\n\\theta_w = w_+^{\\odot 2} - w_-^{\\odot 2}.\n}\nNote that $\\theta_w^\\top x = f_w(x)$. \n\nLet $w(\\infty)$ denote the limit of the gradient flow, i.e.\n\\al{\nw(\\infty) = \\lim_{t \\to \\infty} w(t).\n}\nThen, the converged linear model in the $\\theta$ space is defined by $\\theta_\\alpha(\\infty) = \\theta_{w(\\infty)}$---we are interested in understanding its properties.  For simplicity, we will omit the $\\infty$ index and refer to this quantity as $\\theta_\\alpha$. 
We assume throughout that the limit exists and all corresponding regularity conditions are met.\n\nLet\n\\al{\nX = \\begin{bmatrix} x^{(1)^\\top} \\\\ \\vdots \\\\ x^{(n)^\\top} \\end{bmatrix} \\in \\R^{n \\times d} \\quad \\text{ and } \\quad \\vec{y} = \\begin{bmatrix} y^{(1)} \\\\ \\vdots \\\\ y^{(n)} \\end{bmatrix}.\n}\nIn the sequel, we formally state our result relating the complexity of the solution discovered by gradient flow to the size of the initialization.\n\\begin{theorem}[Theorem 1 in \\citet{woodworth2020kernel}]\n\t \\label{lec16:thm:interpolatingAlpha}\nFor any $0 < \\alpha < \\infty$, assume that gradient flow with initialization $w_+(0) = w_-(0) = \\alpha \\cdot \\vec{\\mathbf{1}}$ converges to a solution that fits the data exactly: $X \\theta_{\\alpha} = \\vec{y}$.\\footnote{This assumption can likely be proved to be true and thus not required. Here we still include the condition because the original paper~\\citet{woodworth2020kernel} assumed it.}  Then, the solution satisfies the following notion of minimum complexity:\n\\al{ \n\\theta_\\alpha = \\argmin_\\theta Q_\\alpha(\\theta)\\\\\n \\quad \\textup{ s.t. 
} \\quad X \\theta = \\vec{y} \\label{lec16:eqn:constrained_complexity}\n}\nwhere\n\\al{\nQ_\\alpha(\\theta) = \\alpha^2 \\cdot \\sum_{i=1}^d q\\left(\\frac{\\theta_i}{\\alpha^2} \\right)\n}\nand\n\\al{\nq(z) = 2 - \\sqrt{4 + z^2} + z \\cdot \\textup{arcsinh}\\left(\\frac{z}{2}\\right)\n}\n\\end{theorem}\nIn words, Theorem~\\ref{lec16:thm:interpolatingAlpha} claims that $\\theta_\\alpha$ is the minimum complexity solution for the complexity measure $Q_\\alpha$.\n\n%23 minutes.\n\\begin{remark}\nIn particular, when $\\alpha \\to \\infty$ we have that \n\\begin{align}\n    q(\\theta_i /\\alpha^2) \\asymp \\theta_i^2/\\alpha^4\n\\end{align}\nand so \n\\begin{align}\n    Q_\\alpha(\\theta) \\asymp \\frac{1}{\\alpha^2} \\Norm{\\theta}_2^2.\n\\end{align}\nThis means that if $\\alpha \\to \\infty$ then the complexity measure $Q_\\alpha$ is the $\\ell_2$-norm, $||\\theta||_2$.  If $\\alpha \\to 0$, then the complexity measure becomes\n\\al{\nq\\left(\\frac{\\theta_i}{\\alpha^2}\\right) &\\asymp \\frac{\\left|\\theta_i\\right|}{\\alpha^2} \\log\\left(\\frac{1}{\\alpha^2}\\right) \\quad\\text{(by Taylor expansion)}\n}\nand so,\n\\al{\nQ_\\alpha\\left(\\theta\\right) &\\asymp \\frac{\\Norm{\\theta}_1}{\\alpha^2} \\log\\left(\\frac{1}{\\alpha^2}\\right)\n}\nTo summarize, for $\\alpha \\to \\infty$, the constrained minimization problem we solve in \\eqref{lec16:eqn:constrained_complexity} yields the minimum $\\ell_2$-norm solution of $\\theta$ (i.e. the $\\ell_4$-norm for $w$).  When $\\alpha \\to 0$, solving \\eqref{lec16:eqn:constrained_complexity} yields the minimum $\\ell_1$-norm $\\theta$ (which is the $\\ell_2$-norm for $w$).  For $0 < \\alpha < \\infty$, we obtain some interpolation of $\\ell_1$ and $\\ell_2$ regularization of the optimum.\n\\end{remark}\n\n%27.30 minutes\n\\begin{remark}\nNote that when $\\alpha \\to 0$, the intuition is similar to what we had observed in previous analyses; in particular, the solution is the global minimum closest to the initialization. 
 Note however, that when $\\alpha \\neq 0$, the solution discovered by gradient descent will not \\textit{exactly} correspond to the solution closest to the initialization.\n\\end{remark}\n\n\\begin{remark}\nWhen $\\alpha \\to \\infty$, we claim that the model optimization is in the neural tangent kernel (NTK) regime.  Recall that we had two parameters, $(\\sigma, \\beta)$, that determined if we could treat the optimization problem as a kernel regression. Further recall that $\\sigma$ denotes the minimum singular value of $\\Phi$ and $\\beta$ is the Lipschitzness of the gradient. Let us now compute $\\sigma$ and $\\beta$ for large $\\alpha$ initializations of our model.\n\nFor $w_-(0) = w_+(0) = \\alpha \\vec{\\mathbf{1}}$,\n\\al{\n\\nabla f_{w(0)}(x) = 2 \\begin{bmatrix} w_{+}(0) \\odot x \\\\ -w_{-}(0) \\odot x \\end{bmatrix} = 2 \\alpha \\begin{bmatrix} x \\\\ -x \\end{bmatrix}\n}\nby the chain rule.  It is clear then that both $\\sigma$ and $\\beta$ linearly depend on $\\alpha$.  This implies that\n\\al{\n\\frac{\\beta}{\\sigma^2} \\to 0 \\quad \\text{ as } \\alpha \\to \\infty\n}\nsince the denominator is $O(\\alpha^2)$, while the numerator is $O(\\alpha)$.  In particular, the features used in this kernel method are:\n\\al{\n\\phi(x) = \\nabla f_{w(0)} (x) = 2 \\alpha \\begin{bmatrix} x \\\\ - x \\end{bmatrix}\n}\nThe neural tangent kernel perspective then gives an alternative proof of this complexity minimization result for $\\alpha \\to \\infty$. In the NTK regime, the solution (to our convex problem) is always the minimum $\\ell_2$-norm solution for the feature matrix, which in this case equals $\\begin{bmatrix} X \\\\ - X \\end{bmatrix}$. \n\nNote that practice tends not to follow the assumptions made here. Often, people either do not use large initializations or do not use infinitesimally small step sizes. 
But this is a good thing  because we do not want to be in the NTK regime; being in the NTK regime implies that we are doing no different or better than just using a kernel method.\n\\end{remark}\n\nWe can now prove Theorem~\\ref{lec16:thm:interpolatingAlpha}, which is similar to the overparametrized linear regression proof of Theorem~\\ref{lec13:thm:linear-main}.\n\nThis proof follows in two steps:\n\\begin{enumerate}\n\\item We find an invariance maintained by the optimizer. In the overparametrized linear regression proof of Theorem~\\ref{lec13:thm:linear-main}, we required $\\theta \\in \\text{span}\\{x\\sp{i}\\}$.  For this proof, we will use a slightly more complicated invariance.\n\\item We characterize the solution using this invariance.  The invariance, which depends on $\\alpha$, will tell us which zero error solution the optimization converges to.\n\\end{enumerate}\nNote also that all of these conditions only depend upon the empirically observed samples. The invariance and minimum is not defined with respect to any population quantities.\n\\begin{proof}  \nLet\n\\al{\n\\tilde{X} = \\begin{bmatrix}X & -X\\end{bmatrix} \\in \\R^{n \\times 2d} \\quad \\text{ and } \\quad w(t) = \\begin{bmatrix} w_+(t) \\\\ w_-(t) \\end{bmatrix} \\in \\mathbb{R}^{2d}.\n}\nThen, the model output on $n$ data points can be described in matrix notation as follows:\n\\al{\n\\tilde{X} w(t)^{\\odot 2} = \\begin{bmatrix}X & -X\\end{bmatrix} \\begin{bmatrix} w_+(t)^{\\odot 2} \\\\ w_-(t)^{\\odot 2} \\end{bmatrix} = \\begin{bmatrix} f_{w(t)} (x\\sp{1}) \\\\ \\vdots \\\\ f_{w(t)}(x\\sp{n})\\end{bmatrix} \\in \\R^n.\n}\nGiven the loss function,\n\\al{\nL(w(t)) = \\frac{1}{2} \\Norm{\\tilde{X} w(t)^{\\odot 2} - \\vec{y}}_2^2,\n}\nthe gradient of $w(t)$ can be computed as\n\\al{\n\\dot{w}(t) &= -\\nabla L(w(t)) \\\\\n&= - \\nabla \\left( \\Norm{\\tilde{X} w(t)^{\\odot 2} - \\vec{y}}_2^2 \\right) \\\\\n&= \\left(\\tilde{X}^\\top r(t)\\right) \\odot w(t) \\quad \\quad \\quad \\text{(chain 
rule)}\\label{lec16:eqn:Xtrtwt}\n}\nwhere $r(t) = \\tilde{X} w(t)^{\\odot 2} - \\vec{y}$ denotes the residual vector.  We see that the $\\tilde{X}^\\top r(t)$ term in \\eqref{lec16:eqn:Xtrtwt} is reminiscent of linear regression for which it would correspond to the gradient, although the $\\odot w(t)$ reminds us that this problem is indeed quadratic.\n\nWe cannot directly solve this differential equation, but we claim that\n\\al{ \\label{lec16:eqn:w_claim}\nw(t) = w(0) \\odot \\text{exp}\\left(-2\\tilde{X}^\\top \\int_0^t r(s) ds \\right) \\quad \\text{(exp is applied entry-wise)}\n}\nwhich is not quite a closed form solution of equation \\ref{lec16:eqn:Xtrtwt} since $r(s)$ is still a function of $w(t)$.  To understand how we obtained this ``solution,'' we consider a more abstract setting. Suppose that\n\\al{\n\\dot{u}(t) &= v(t) u(t)\n}\nWe can then ``solve'' this differential equation as follows. Rearranging, we observe that\n\\al{\n\\frac{\\dot{u}(t)}{u(t)} &= v(t) \\\\\n\\frac{d \\log u(t)}{dt} &= v(t) \\quad \\text{(chain rule)} \\\\\n\\log u(t) - \\log u(0) &= \\int_0^t v(s) ds \\quad \\text{(integration)} \\\\\n\\frac{u(t)}{u(0)} &= \\text{exp} \\left( \\int_0^t v(s) ds\\right)\n}\nIn our problem, $u \\leftrightarrow w_i$ and $v \\leftrightarrow -2(\\tilde{X}^\\top r(t))_i$.\n\nWe have characterized $w$, but we want to transform this to a characterization that involves $\\theta$.\nRecall that $w_+(0) = \\alpha \\vec{\\mathbf{1}}$ and $w_-(0) = \\alpha \\vec{\\mathbf{1}}$ so that $w(0) = \\alpha \\vec{\\mathbf{1}} \\in \\R^{2d}$. Additionally, we have that $\\theta(t) = w_+(t)^{\\odot 2} - w_-(t)^{\\odot 2} $.\nWe can now apply \\eqref{lec16:eqn:w_claim} to expand $w(t)$ and simplify. 
\n\nNote that if we have $\\tilde{X}^\\top = \\begin{bmatrix} X^\\top \\\\ -X^\\top \\end{bmatrix} \\in \\R^{2d\\times n}$, then for some vector $v$,\n\\al{\n    \\left(\\exp(-2\\tilde{X}^\\top v) \\right)^{\\odot 2} &=\n    \\begin{bmatrix}\n    \\exp(-2X^\\top v) \\\\\n    \\exp(2X^\\top v)\n    \\end{bmatrix}^{\\odot 2} \\\\\n    &= \\begin{bmatrix}\n    \\exp(-4X^\\top v) \\\\\n    \\exp(4X^\\top v)\n    \\end{bmatrix}.\n}\nApplying this result for $v = \\int_0^t r(s) ds$, we obtain that:\n\\al{\n    \\theta(t) &= w_+(t)^{\\odot 2} - w_-(t)^{\\odot 2} \\\\\n    &= \\alpha^2 \\left[ \\exp \\left( -4 X^\\top \\int_0^t r(s) ds \\right) - \\exp \\left( 4 X^\\top \\int_0^t r(s) ds \\right)\\right] \\\\\n    &= 2 \\alpha^2 \\sinh \\left(-4 X^\\top \\int_0^t r(s) ds \\right).\n}\nLetting $t \\to \\infty$, we have that\n\\al{\\label{lec16:eqn:theta_infty}\n    \\theta_\\alpha = 2 \\alpha^2 \\sinh \\left(-4X^\\top \\int_0^\\infty r(s) ds \\right).\n}\nLastly, we also know \n\\al{\n    X \\theta_\\alpha = \\vec{y} \\label{lec16:eqn:theta_constraint}\n } \n since this is the assumption by the theorem (which can presumably be proven because the optimization should converge to a zero-error solution). We next show that \\eqref{lec16:eqn:theta_infty} and \\eqref{lec16:eqn:theta_constraint} are also sufficient conditions for a solution to the constrained optimization problem given by \\eqref{lec16:eqn:constrained_complexity}. In particular, \\eqref{lec16:eqn:theta_infty} and \\eqref{lec16:eqn:theta_constraint} correspond to the Karush-Kuhn-Tucker (or KKT) conditions of \\eqref{lec16:eqn:constrained_complexity}.\n\nA KKT condition is an optimality condition for constrained optimization problems. 
While these conditions can have a variety of formulations and typically one can invoke some off-the-shelf theorems to use them, we can motivate the conditions we encountered by considering the following general optimization program:\n\\al{\n    \\argmin \\quad &Q(\\theta) \\\\\n    \\text{s.t.} \\quad &X\\theta = \\vec{y}.\n}\nWe say that $\\theta$ satisfies the (first order) KKT conditions if\n\\begin{align}\n    \\nabla Q(\\theta) &= X^\\top \\nu \\text{ for some } \\nu \\in \\R^n \\\\\n    X\\theta &= \\vec{y}\n\\end{align}\nMore intuitively, we know that optimality implies that there are no first order local improvements that satisfy the constraint (up to first order). Then, consider a perturbation $\\Delta \\theta$. In order to satisfy the constraint, we must enforce the following:\n\\begin{align}\n\\Delta \\theta \\perp \\text{row-span}\\{X\\}  \\quad \\text{ so } \\quad X \\Delta \\theta = 0\n\\end{align}\nSo, if we look at $\\theta + \\Delta \\theta $ satisfying the constraint, we can use a Taylor expansion to show that\n\\al{\nQ(\\theta + \\Delta \\theta) = Q(\\theta) + \\langle \\Delta \\theta, \\nabla Q(\\theta) \\rangle \\leq Q(\\theta)\n}\nbecause if $ \\langle \\Delta \\theta, \\nabla Q(\\theta) \\rangle$ is positive it violates the optimality assumption.\nIn fact, it is very easy to make the sign flip for $ \\langle \\Delta \\theta, \\nabla Q(\\theta) \\rangle$ because you can flip $\\Delta \\theta$ to be the opposite direction. 
This means that\n\\al{\n    \\forall \\, \\Delta \\theta \\perp \\text{row-span}\\{X\\}, \\quad \\langle \\Delta \\theta, \\nabla Q(\\theta) \\rangle = 0\n}\nbecause if it is negative, you can equivalently flip it to be positive which violates optimality.\nThis means that $Q(\\theta) \\subseteq \\text{row-span}\\{X\\}$, or $Q(\\theta) = X^\\top \\nu$ for some $\\nu$.\n\nReturning to our problem, the KKT condition gives\n\\al{\n    \\nabla Q(\\theta) = X^\\top \\nu\n}\nand the invariance gives us\n\\al{\n    \\theta_\\alpha &= 2 \\alpha^2 \\sinh\\left(-4X^\\top \\int_0^\\infty r(s) ds \\right) \\\\\n    &= 2\\alpha^2 \\sinh \\left( -4X^\\top v'\\right)\n}\nwhere we let $v' = \\int_0^\\infty r(s) ds$ for simplicity.\nTaking the gradient of $Q$ gives\n\\al{\n    \\nabla Q_\\alpha (\\theta) = \\operatorname{arcsinh}\\left(\\frac{1}{2\\alpha^2} \\theta \\right)\n}\nPlugging in $\\theta_\\alpha$, we get\n\\al{\n    \\nabla Q(\\theta_\\alpha) = \\operatorname{arcsinh}\\left (\\frac{1}{2\\alpha^2} \\theta_\\alpha \\right ) = -4 X^\\top v'\n}\nThus, $\\theta_\\alpha$ satisfies both KKT conditions. Even further, since our optimization problem~\\eqref{lec16:eqn:constrained_complexity} is convex (we do not formally argue this), we conclude that $\\theta_\\alpha$ is a global minimum.\n\\end{proof}\n\n\\sec{Implicit regularization towards max-margin solutions in classification}\nWe now switch our focus to classification problems. We consider linear models (though these results also apply to nonlinear models with a weaker version of the conclusion). We assume that our data is separable and will prove that gradient descent converges to the max-margin solution. This result holds for any initialization and does not require any additional regularization; we only require the use of gradient descent and the standard logistic loss function. 
The results in this subsection are originally given by~\\citet{soudry2018implicit}, and our exposition heavily depends on those in~\\cite{ji2018risk,mjt_dlt}. \n\nAssume we have data $\\{(x\\sp{i}, y\\sp{i}) \\}_{i=1}^n $, where $x\\sp{i} \\in \\R^d$ and $y\\sp{i} \\in \\{\\pm 1 \\}$. We consider the linear model $ h_w(x) = w^\\top x$ and the cross entropy loss function $\\hatL (w) = \\sum_{i=1}^n \\ell\\left(y\\sp{i}, h_w\\l (x\\sp{i} \\r )\\right)$, where $ \\ell(t) = \\log(1 + \\exp(-t))$ is the logistic loss.\n\nAs we have separable data, there can be multiple global minima, as you can trivially take an infinite number of separators. More formally, there are an infinite number of unit vectors $\\bar{w}$ such that $\\bar{w}^\\top x\\sp{i} y\\sp{i} > 0$ for all $i$ as one can perturb any strict separator  while still maintaining a separation of classes. Then, we can scale the separator to make the loss arbitrarily small---we have that $ \\hatL(\\alpha \\bar{w}) \\to 0$ as $ \\alpha \\to \\infty$. Thus, informally, for any unit vector $\\bar{w}$ that separate the data, $\\infty \\cdot \\bar{w}$ is a global minimum. %Thus, even if we arbitrarily scale the unit vector, you still have that the loss goes to zero as $\\ell(t)$ approaches zero as $t$ gets large. Thus, all choices of $w$ correspond to global minima, as the loss function goes to zero for infinite scalings.\n\nWe would like to understand which global minimum gradient descent converges to. We will now show that it finds the max-margin solution. Before we can do so, we recall/introduce the following definitions.\n\n\\begin{definition}[Margin]\nLet $\\{(x\\sp{i}, y\\sp{i}) \\}_{i=1}^n $ be given data. 
Assuming $\\{(x\\sp{i}, y\\sp{i}) \\}_{i=1}^n$ is linearly separable, the \\textit{margin} is defined as\n\\al{\n    \\min_{i \\in [n]} y\\sp{i} w^\\top x\\sp{i}\n}\n\\end{definition}\n\n\\begin{definition}[Normalized Margin]\\label{lec16:def:norm_margin}\nLet $\\{(x\\sp{i}, y\\sp{i}) \\}_{i=1}^n $ be given data. Assuming $\\{(x\\sp{i}, y\\sp{i}) \\}_{i=1}^n$ is linearly separable, the \\textit{normalized margin} is defined as\n\\al{\n    \\gamma(w) = \\frac{\\min_{i \\in [n]} y\\sp{i} w^\\top x\\sp{i}}{\\norm{w}_{2}}\n}\n\\end{definition}\n\n\\begin{definition}[Max-Margin Solution]\nUsing the normalized margin $\\gamma$ defined in Definition~\\ref{lec16:def:norm_margin}, we define a \\textit{max-margin solution} as\n\\al{\n    \\bar{\\gamma} = \\max_{w} \\gamma(w)\n}\nand let $w^*$ be the unit-norm maximizer. \\footnote{The normalized margin $\\bar{\\gamma}$ is scale-invariant. For $c \\neq 0$, $\\gamma(cw) = \\min_{i \\in [n]} \\frac{y\\sp{i} cw^\\top x\\sp{i}}{\\norm{cw}_2} = \\min_{i \\in [n]} \\frac{y\\sp{i} w^\\top x\\sp{i}}{\\norm{w}_2} = \\gamma(w)$.}\n\\end{definition}\n\nUsing these definitions, we claim the following result.\n\\begin{theorem} \\label{lec16:thm:maxmargin_gd}\nGradient flow converges to the direction of max-margin solution in the sense that\n\\al{\n    \\gamma(w(t)) \\to \\bar{\\gamma} \\text{  as  } t \\to \\infty\n}\nwhere $w(t)$ is the iterate at time $t$.\n\\end{theorem}\n\nThe following observations provide some intuition for Theorem~\\ref{lec16:thm:maxmargin_gd}.\n\\begin{enumerate}\n    \\item $\\hatL(w(t)) \\to 0$ by a standard optimization argument. Namely, if the objective is monotone decreasing at each iteration, $\\hatL(w(t)) \\approx 0$ for large enough $t$.\n    \\item Using a Taylor expansion, we can show that $ \\ell(z) = \\log(1 + \\exp(-z)) \\approx \\exp(-z)$ for large $z$. 
Thus, logistic loss is close to exponential loss when $z$ is very large.\n    \\item Using observation 1, we see that $\\norm{w(t)}_{2} \\to \\infty$ because if $\\norm{w(t)}_{2}$ were instead bounded, then the loss $\\hatL (w(t))$ will be bounded below by a constant that is strictly greater than zero, contradicting observation 1. Formally, if\n    $\\norm{w(t)}_{2} \\leq B,$\n    then\n    \\al{\n        |y\\sp{i} w^t x\\sp{i}| \\leq B \\norm{x\\sp{i}},\n    }\n    and therefore we get\n    \\al{\n        \\hatL(w(t)) \\geq \\sum_{i=1}^n \\exp\\left(-B\\norm{x\\sp{i}}_{2} \\right)> 0.\n    }\n    \\item Suppose we have $w$ such that $\\norm{w}_{2} = q $ is very big. Then, using observation 2, we see that\n    \\al{\n        \\hatL(w) &= \\sum_{i=1}^n \\ell(y\\sp{i} w^\\top x\\sp{i}) \\\\\n        &\\approx \\sum_{i=1}^n \\exp\\left(-y\\sp{i} w^\\top x\\sp{i} \\right) \\\\\n        \\log \\hatL(w) &\\approx \\log \\sum_{i=1}^n \\exp\\left(-y\\sp{i} w^\\top x\\sp{i} \\right) \\\\\n        &= \\log \\sum_{i=1}^n \\exp \\left(-q y\\sp{i} \\bar{w}^\\top x\\sp{i} \\right) \\\\\n        &\\approx \\max_{i \\in [n]} -q y\\sp{i} \\bar{w}^\\top x\\sp{i}\n    }\n    where $ \\bar{w} = \\frac{w}{\\norm{w}_{2}}$ and the last step holds because the log of a sum of exponentials (\\textit{log-sum-exp}) is a smooth approximation to the maximum function. 
To motivate this claim, observe that:  \n    \\al{\n         \\log \\sum_{i=1}^n \\exp(q u_i) &\\geq q \\max_i u_i  \\\\\n        \\log \\sum_{i=1}^n \\exp(q u_i) &\\leq \\log \\left(n \\exp(q \\max_i u_i)\\right) \\\\\n        &= \\log n + q \\max_i u_i \\\\\n        &\\approx q \\max_{i \\in [n]} u_i + o(q) \\text{ as } q \\to \\infty\n    }\n    Thus, minimizing the loss is the same as\n    \\al{\n    \\min_w \\max_{i \\in [n]} -qy\\sp{i} \\bar{w}^\\top x\\sp{i}\n    }\n    which can be reformulated as\n    \\al{\n    \\max_w \\min_{i \\in [n]} qy\\sp{i} \\bar{w}^\\top x\\sp{i}\n    }\n\n\\end{enumerate}\n\nThe above observations heuristically demonstrate that minimizing the logistic loss with gradient descent is equivalent (in the limit) to maximizing the margin. Below, we prove Theorem~\\ref{lec16:thm:maxmargin_gd} rigorously for the exponential loss function $\\ell(t) = \\exp(-t)$, which is nearly the same as the logistic loss.\n\n\\begin{proof}[Proof of Theorem~\\ref{lec16:thm:maxmargin_gd}]\nWe begin by defining the smooth margin as\n\\begin{align}\n    \\tilde \\gamma (w) &\\defeq \\frac{-\\log \\hat{L}(w)}{\\|w\\|_2} \\label{lec18:eqn:smooth_margin} \\\\ \n    &=\n    \\frac{-\\log\\l( \\sum_{i=1}^{n} \\exp(-y^{(i)}w^\\top x^{(i)})\\r)}{\\|w\\|_2}.\n\\end{align}\nNote that $\\tilde\\gamma(w)$ approximates $\\gamma(w)$ by the log-sum-exp approximation. To make this precise, recall that $\\gamma(w) \\geq \\tilde\\gamma(w)$ because $y^{(i)}w^\\top x^{(i)} \\geq \\gamma(w) \\|w\\|_2$ for all $i$.\n\nThen, since $\\gamma(w) \\leq \\bar\\gamma$ by definition, it suffices to show that\n\\begin{align}\n    \\lim_{t\\to\\infty}\\tilde\\gamma(w(t)) = \\bar\\gamma. \\label{lec18:eqn:target}\n\\end{align}\n\nLet $\\dot{w}(t)= -\\nabla \\hat L(w(t))$. 
Then,\n\\begin{align}\n    \\diffp{}{t} \\l (-\\log \\hat{L}(w(t)) \\r ) &= \\inprod{\\nabla\\l ( -\\log  \\hat{L}(w(t)) \\r ),\\dot{w}(t)} \\\\\n    &=\\left \\langle -\\frac{\\nabla\\hat{L}(w(t))}{\\hat{L}(w(t))},\\dot{w}(t) \\right \\rangle \\\\\n    &=\\frac{\\norm{\\nabla\\hat{L}(w(t))}_2^2}{\\hat{L}(w(t))} \\\\\n    &=\\frac{\\norm{\\dot{w}(t)}_2^2}{\\hat{L}(w(t))} \\geq 0 \\label{lec18:eqn:maxmargin_gf_deriv}\n\\end{align}\nThis result tells us that the log loss is decreasing at each infinitesimal step of the gradient flow. By integrating \\eqref{lec18:eqn:maxmargin_gf_deriv}, we can also evaluate the log loss at time $T$:\n\\begin{align}\n    -\\log\\hat{L} (w(T) ) &= -\\log\\hat{L}(w(0)) + \\int_0^T \\diffp{}{t}\\log\\hat{L}(w(t)) dt \\label{lec18:eqn:gf_wt_expansion} \\\\\n    &= -\\log \\hat{L}(w(0)) + \\int_0^T \\frac{\\|\\dot{w}(t)\\|_2^2}{\\hat{L}(w(t))} dt.\n\\end{align}\nWhile the derivation above tells us how the numerator of \\eqref{lec18:eqn:smooth_margin} is changing, we have yet to relate this to the denominator, i.e. the norm of $w$. Recall that $w^*$ is the direction of the max-margin solution. Then, we have\n\\begin{align}\n    \\|\\dot{w}(t)\\|_2 &\\geq \\inprod{\\dot{w}(t), w^*} &\\text{(Cauchy-Schwarz)} \\label{lec18:eqn:cs-dotw} \\\\\n    & = \\inprod{-\\nabla \\hat L(w(t)), w^*} \\\\\n    &= \\inprod{ \\sum_{i=1}^n y^{(i)}\\exp(-y^{(i)}w^\\top x^{(i)})\\cdot x^{(i)}, w^* } \\\\\n    &= \\sum_{i=1}^n y^{(i)}\\exp(-y^{(i)}w^\\top x^{(i)})\\cdot \\inprod{w^*,  x^{(i)}} \\\\\n    &\\geq \\bar{\\gamma}\\sum_{i=1}^n \\exp(-y^{(i)}w^\\top x^{(i)}) \\\\\n    &=\\bar{\\gamma} \\cdot \\hat{L}(w(t)).\n\\end{align}\nThis shows that $\\dot{w}(t)$ is correlated to $w^*$, and that this correlation depends on $\\bar{\\gamma}$ and the loss. 
In addition, $\\dot{w}(t)$ is not too small compared to the loss.\n\nNext, we substitute \\eqref{lec18:eqn:cs-dotw} into the second term of the right-hand-side of \\eqref{lec18:eqn:gf_wt_expansion}:\n\\begin{align}\n    \\int_{0}^T\\frac{\\norm{\\dot{w}(t)}_2^2}{\\hat{L}(w(t))}dt &\\geq \\bar{\\gamma}\\cdot\\int_{0}^{T}\\norm{\\dot{w}(t)}_2dt \\\\\n    &\\geq \\bar{\\gamma}\\cdot\\l \\|\\int_{0}^{T}\\dot{w}(t)dt \\r \\|_2 \\\\\n    &= \\bar{\\gamma}\\norm{w(T)}_2.\n\\end{align}\nApplying this bound to the RHS of \\eqref{lec18:eqn:gf_wt_expansion}, we obtain\n\\begin{align}\n    -\\log \\hat{L}(w(T)) \\geq -\\log \\hat{L}(w(0)) + \\bar{\\gamma}\\norm{w(T)}_2.\n\\end{align}\nDividing both sides by $\\norm{w(T)}_2$,\n\\begin{align}\n    -\\frac{\\log \\hat{L}(w(T))}{\\norm{w(T)}_2} \\geq -\\frac{\\log \\hat{L}(w(0))}{\\norm{w(T)}_2} + \\bar{\\gamma}. \\label{lec18:eqn:gf_wt_expansion_lb}\n\\end{align}\nSince $\\lim_{T \\to \\infty} \\|w(T)\\|_2 = \\infty$, we know that the first term on the RHS of \\eqref{lec18:eqn:gf_wt_expansion_lb} goes to $0$ in the limit. Thus,\n\\begin{align}\n    \\lim_{T\\to \\infty} - \\frac{\\log\\hat{L}(w(T))}{\\|w(T)\\|_2} \\geq \\bar{\\gamma}.\n\\end{align}\nRecognizing the LHS as the definition of the smooth margin, i.e. \\eqref{lec18:eqn:smooth_margin}, we conclude that\n\\begin{align}\n    \\lim_{T\\to \\infty} \\tilde{\\gamma} (w(T)) \\geq \\bar{\\gamma}.         \n\\end{align}\nMeanwhile, since we know that  \n\\begin{align}\n    \\bar{\\gamma} \\geq \\gamma(w(T)) \\geq \\tilde{\\gamma}(w(T)),\n\\end{align}\nwe conclude by the squeeze theorem that \n\\begin{align}\n    \\lim_{T\\to \\infty} \\gamma (w(T)) = \\lim_{T\\to \\infty} \\tilde{\\gamma} (w(T)) = \\bar{\\gamma}.\n\\end{align}\n\\end{proof}\n\n\\sec{Implicit regularization effect of noise in SGD}\n\nIn the previous section, we discussed implicit regularization via initialization and the implicit regularization of gradient descent for logistic loss-minimizing classifiers. 
\n%These methods were based on a specific model setup and limited to gradient flow. \nIn the sequel, we will move forward to the implicit regularization effect of SGD noise. Starting from the quadratic case, we analyze how the SGD noise will affect the optimization solution, and present (heuristically) a result for non-quadratic loss functions. In particular, the main (heuristic) results are:\n\\begin{enumerate}\n\\item On the one dimensional quadratic function, the iterate can be disentangled into a contraction part and a stochastic part, the latter of which is characterized by the Ornstein–Uhlenbeck (OU) process. The noise makes the iterate bounce around the global minimum.\n\\item On the multi-dimensional quadratic function, the iterate can be disentangled into multiple separate 1-D OU processes. The noise makes the iterate bounce around the global minimum, while the fluctuation is closely related to the shape of the noise.\n\\item On non-quadratic functions, SGD with \\textit{label noise} on empirical loss $\\hat{L}(\\theta)$ converges to a stationary point of the regularized loss $\\hat{L}(\\theta) + \\lambda \\textup{tr}(\\nabla^2\\hat{L}(\\theta))$, which is mainly due to the accumulation of a third order effect. \n\\end{enumerate}\n \nGiven the scope of the lectures,  we will only be able to discuss some of these results informally and heuristically. For example, we refer to the paper~\\citet{damian2021label} for a concrete, formal version of the result for the third bullet. \n\nFor the remainder of this section, let $g(x)$ denote the general loss function. Then, the formulation of SGD is: for $t$ in $[0,T]$,\n\\begin{align}\nx_{t+1} = x_{t} - \\eta(\\nabla g(x_{t}) + \\xi_t),\n\\end{align} \nwhere $\\eta > 0$ is the learning rate, $\\xi_t$ denotes the SGD noise, and $\\Exp[\\xi_t] = 0$. 
Note that in the most general case, $\\xi_t$ can depend on $x_t$.\n\t\n\\subsec{Warmup: SGD on the one dimensional quadratic function}\nIn this section, we consider the one dimensional function $g(x) = \\frac{1}{2} x^2$. Suppose the noise $\\xi_t$ are independent Gaussians, i.e. $\\xi_t \\sim \\mathcal{N}(0,1)$,\n\\begin{align}\nx_{t+1} &= x_t - \\eta(\\nabla g(x_{t}) + \\sigma\\xi_t)\\\\\n&= x_t - \\eta(x_{t} + \\sigma\\xi_t)\\\\\n&= \\underbrace{(1 - \\eta)x_t}_{\\text{contraction}} - \\underbrace{\\eta\\sigma\\xi_t}_{\\text{stochastic}}\\label{lec17:eqn:ou}.\n\\end{align}\n$(1 - \\eta)x_t$ is called the contraction because $\\eta > 0$, which means that this term will shrink after each iteration. The random noise term $\\eta\\sigma\\xi_t$ will accumulate over time, and the scale of $\\eta\\sigma\\xi_t$ remains unchanged. When $x_t$ is large, the contraction term will dominate. When $x_t$ is small, the noise term will dominate. Without the noise term, as $x_t$ continues its contraction, we approach the global minimum $x = 0$. However, with the presence of the noise $\\sigma\\xi_t$, $x_t$ will not stay at $0$, but instead bounce around it. \n\nTo characterize this intuition more precisely, we have \n\\begin{align}\nx_{t+1} &= (1 - \\eta)x_t - \\eta\\sigma\\xi_t\\\\\n&= (1 - \\eta) ((1 - \\eta) x_{t - 1}  - \\eta \\sigma \\xi_{t - 1}) - \\eta \\sigma \\xi_t \\\\\n&= (1 - \\eta)^2 x_{t - 1} - (1 - \\eta) \\eta \\sigma \\xi_{t - 1} - \\eta \\sigma \\xi_{t} \\\\\n&= (1 - \\eta)^3 x_{t - 2} - (1 - \\eta)^2 \\eta \\sigma \\xi_{t - 2} - (1 - \\eta) \\eta \\sigma \\xi_{t - 1} - \\eta \\sigma \\xi_t \\\\\n&\\quad \\vdots \\\\\n&= (1 - \\eta)^{t+1} x_0 - \\eta\\sigma\\sum_{k=0}^{t} \\xi_{t-k} (1 - \\eta)^{k}. \\label{lec17:eqn:warmup_expansion}\n\\end{align}\nThe first term in \\eqref{lec17:eqn:warmup_expansion} becomes negligible when $\\eta t \\gg 1$ (since $(1 - \\eta)^{t} \\approx e^{-\\eta t}$). 
The second term in \\eqref{lec17:eqn:warmup_expansion} is the accumulation of noise, which is the sum of Gaussians. Leveraging the properties of Gaussian distributions, we know that its variance equals $\\eta^2\\sigma^2\\sum_{k=0}^{t} (1 - \\eta)^{2k}$.\n\nFrom the analysis above, we know that as $t \\rightarrow \\infty$, $\\Var(x_t) \\approx \\eta^2\\sigma^2\\sum_{k=0}^{\\infty} (1 - \\eta)^{2k} = \\frac{\\eta^2\\sigma^2}{2\\eta - \\eta^2} = {\\Theta}(\\eta\\sigma^2)$. Therefore, as $t \\rightarrow \\infty$, $x_t \\sim \\mathcal{N}(0, {\\Theta}(\\eta\\sigma^2))$.\n\n\\paragraph{Interpretation.} In the one dimensional case, the noise only makes it harder to converge to the global minimum. Classical convex optimization tells us: (1) noisy GD leads to a less accurate solution and (2) noisy GD is faster than GD. What we do in practice is achieve a balance between (1) and (2). This does \\textit{not} lead to implicit regularization since $\\Exp[x_t] \\rightarrow 0$ as $t \\rightarrow \\infty$. However, this case is important for further analysis because \\eqref{lec17:eqn:ou} corresponds to the Ornstein–Uhlenbeck (OU) process which we use more extensively in the multi-dimensional cases.\n\n\\subsec{SGD on multi-dimensional quadratic functions}\nConsider a PSD matrix $A \\in \\R^{d\\times d}$. In this section, $g(x) = \\frac{1}{2}x^\\top A x$. Suppose $\\xi_t \\sim \\mathcal{N}(0, \\Sigma)$. For ease of presentation, assume that $A$ and $\\Sigma$ are simultaneously diagonizable (they have the same set of eigenvectors). We use $K$ to denote the span of the eigenvectors of $A$/$\\Sigma$. 
Then, consider the following SGD iterate:\n\\begin{align}\nx_{t+1} &= x_t - \\eta(\\nabla g(x_{t}) + \\xi_t)\\\\\n&= x_t - \\eta(Ax_t + \\xi_t)\\\\\n&= (I- \\eta A)x_t - \\eta\\xi_t\\\\\n&= \\underbrace{(I- \\eta A)^{t+1} x_0}_{\\text{contraction}} - \\underbrace{\\eta\\sum_{k=0}^{t} (I- \\eta A)^{k}\\xi_{t-k}}_{\\text{noise accumulation}}.\n\\end{align}\nSimilar to the analysis in the 1-D case above, we have $x_t \\sim \\mathcal{N}(0, \\eta^2\\sum_{k=0}^{\\infty} (I- \\eta A)^{k}\\Sigma (I- \\eta A)^{k})$ as $t \\rightarrow \\infty$. \\footnote{For random variable $\\xi\\in \\R^d$, $\\Exp[(W\\xi)(W\\xi)^\\top] = W\\Exp[\\xi\\xi^\\top]W^\\top$}\n\nSince $A$ and $\\Sigma$ are simultaneously diagonalizable, we can easily disentangle the iterates into $d$ separate OU processes in the eigencoordinate system. Concretely, by eigendecomposition, suppose that $A = U \\text{diag}(d_i) U^\\top$ and $\\Sigma = U \\text{diag}(\\sigma_i^2) U^\\top$, where $U$ is the orthogonal matrix consisting of the eigenvectors of $A$ and $\\Sigma$. We can express the covariance of the stationary distribution as\n\\begin{align}\n\\eta^2\\sum_{k=0}^{\\infty} (I- \\eta A)^{k}\\Sigma (I- \\eta A)^{k} &= \\eta^2 U\\text{diag}\\left(\\sum_{k=0}^{\\infty}\\sigma_i^2(1-\\eta d_i)^{2k}\\right)U^\\top\\\\\n&= \\eta U\\text{diag}\\left(\\frac{\\sigma_i^2}{d_i}\\right)U^\\top.\n\\end{align}\n\\paragraph{Interpretation.} Intuitively, $\\frac{\\sigma_i^2}{d_i}$ here is the iterate fluctuation in the direction of the $i$-th eigenvector. This result tells us that the fluctuation of the iterates depends on the shape of $\\Sigma$ and $A$. If $\\Sigma$ is not full rank, the fluctuations will be limited to the subspace $K$. Also note that $\\Exp[\\|x_t\\|_2] = \\Theta(\\sqrt{\\eta})$. This reflects the noise accumulation since the scale of noise in each step is $\\Theta({\\eta})$. However, we still do not have any implicit regularization effect. This is because the Hessian of the quadratic objective is unchanged. 
When we have the change in Hessian, SGD noise will exert an implicit bias on the iterate. See Figure~\\ref{lec17:fig:bias} for an example.\n\n\\begin{figure}[ht]\n\\includegraphics[width=1.00\\textwidth]{figures/bias2.png}\n\\centering\n\\caption{The effect of SGD noise with the change in Hessian when $x=0$. Consider the objective $F(x) = x^2$ when $x \\le 0$ and $F(x) = \\frac{1}{10}x^2$ when $x > 0$. Suppose we initialize SGD at $x=0$ and run 1024 steps of SGD with step size $0.01$. We plot the probability density of the iterate after various steps of SGD. Note that the density function and the mean gradually move to the left.} \n\\label{lec17:fig:bias}\n\\end{figure}\n\nIn the sequel, we separately analyze the second order and third order effects of SGD on a general non-quadratic function. The second order effect exactly corresponds to this section's analysis when $A$ equals the Hessian of the general non-quadratic function.\n\n\\subsec{SGD on non-quadratic functions}\nIn this section, we analyze SGD on non-quadratic functions based on \\cite{damian2021label}. Due to the complexity of the analysis, we provide heuristic derivations to convey the main insights. \n\nWithout loss of generality, suppose a global minimum of $g(x)$ is $x=0$. Therefore, $\\nabla_x g(0) = 0$ and $\\nabla_x^2 g(0)$ is PSD. We also assume the iterates $x_t$ are close to $0$, so we can Taylor expand around $0$.\n\\begin{align}\nx_{t+1} &= x_t - \\eta(\\nabla g(x_t) + \\xi_t)\\\\\n&= x_t - \\eta(\\nabla g(0) + \\nabla^2g(0)(x_t - 0) + \\nabla^3g(0)[x_t,x_t] + \\text{higher order terms} + \\xi_t). \\label{lec17:eqn:full_gradient_update}\n\\end{align}\n\nLet $H = \\nabla^2_x g(0)$ and $T = \\nabla^3_x g(0)$. Since $T$ is a tensor, we first clarify our notation. 
First, for $T \\in \\R^{d\\times d\\times d}$, $x,y \\in \\R^{d}$, $T[x,y]\\in \\R^d$, and \n\\begin{align}\\label{lec17:eqn:tensor}\n    T[x,y]_i \\defeq \\sum_{j,k\\in[d]}T_{ijk}x_jy_k.\n\\end{align} \nFor $S\\in \\R^{d\\times d}$, $T(S)\\in \\R^d$, and \n\\begin{align} \\label{lec17:eqn:tensor1}\n    T(S)_i \\defeq \\sum_{j,k\\in[d]}T_{ijk}S_{jk}\n\\end{align} \n\nNow returning to \\eqref{lec17:eqn:full_gradient_update}, after dropping the higher order terms, we obtain the following third-order Taylor expansion:\n\\begin{align}\nx_{t+1} &\\approx x_t - \\eta Hx_t - \\eta\\xi_t - \\eta T[x_t,x_t]\\\\\n&= (I-\\eta H)x_t - \\eta \\xi_t - \\eta T [x_t,x_t].\\label{lec17:eqn:iterate}\n\\end{align}\n\nIf we don't consider the third order term $\\eta T [x_t,x_t]$, the update reduces to the one we studied in the previous subsection. Next, recall that $\\|x_t\\|_2 \\approx \\sqrt{\\eta}$. Therefore, $\\eta T[x_t,x_t] \\approx \\eta^2$. This quantity is dominated by both $\\eta \\xi_t$ and $\\eta Hx_t \\approx {\\eta}^{1.5}$. \n\nSo, when $H$ is positive definite, the third order term can be negligible. However, in overparametrized models, $H$ is typically low-dimensional. For instance, if the NTK matrix is full rank, then the manifold of interpolators has dimension $d-n$. Then, in the direction orthogonal to the span of $H$, the contraction term disappears. Letting $\\Pi_{A}$ denote projections onto the subspace $A$, we see that $\\eta H \\Pi_{K^\\perp}(x_t) = 0$ and $T[x_t,x_t] \\approx \\eta^2$ will dominate the update in that direction.\n\nConsider the case in which both $H$ and $\\Sigma$ are not full rank. When the loss is quadratic as in the previous section, we know that the iterate $x_t$ bounces in the subspace $K$ and remains stable in the subspace $K^\\perp$. What happens when the loss is not quadratic, i.e. $T[x_t,x_t]$ affects the gradient update? 
\n\nTo answer this question, we decompose the effect of the update in \\eqref{lec17:eqn:iterate} between the two subspaces of interest, $K$ and $K^\\perp$. First, observe that $(I-\\eta H)x_t - \\eta \\xi_t$ is working in $K$, and $- \\eta T [x_t,x_t]$ is only working in $K^\\perp$ because in $K$ the effect of $\\eta T [x_t,x_t]$ is dominated by $(I-\\eta H)x_t - \\eta \\xi_t$. In the previous section, we already well-characterized the effect of optimization without a third order effect. To refine our analysis of the gradient update, we define an iterate $u_{t+1} = (I - \\eta H)u_t - \\eta \\xi_t$ in which we do not have the third order effect.\\footnote{Note that $\\xi_t$ is the same for each $u_t$ and $x_t$.} Then, to analyze what the implicit regularization effect is, we study $r_t = x_t - u_t$.\n\\begin{align*}\nr_{t + 1} &= x_{t+1} - u_{t+1}\\\\\n&= (I-\\eta H)(x_t - u_t) - \\eta T[x_t,x_t]\\\\\n&= (I-\\eta H)r_t - \\eta T[x_t,x_t]\\\\\n&\\approx (I-\\eta H)r_t - \\eta T[u_t,u_t].\n\\end{align*}\nNote that we only have the contraction and the bias terms for the $r_t$ iterate. The stochasticity term $\\eta \\xi_t$ is canceled out. \n\nIn the subspace $K = \\text{span}(H)$, the effect of $\\eta T [x_t,x_t]$ is again dominated by $(I-\\eta H)x_t - \\eta \\xi_t$, so no meaningful regularization occurs. But letting $\\Pi_{A}$ denote the projection onto the subspace $A$, we have that in $K^\\perp$,\n\\begin{align}\n\\Pi_{K^\\perp}r_{t+1} &= \\Pi_{K^\\perp}r_t - \\eta \\Pi_{K^\\perp} T[u_t,u_t]\\\\\n&=\\Pi_{K^\\perp}r_0 - \\eta \\sum_{k=0}^{t}\\Pi_{K^\\perp}T[u_k,u_k].\n\\end{align}\nNamely, the effect of $T[u_k,u_k]$ is slowly accumulating in ${K^\\perp}$. In Figure~\\ref{lec17:fig:noise}, an illustration of this phenomenon is provided.\n\nNote that the OU process is a Markov chain and a Gaussian process. Here we assume that $H$ is constructed such that $u_t$ converges to its stationary distribution. Suppose the Markov chain $u_t$ mixes as $t\\rightarrow \\infty$. 
Then, $\\sum_{k=0}^{t}\\Pi_{K^\\perp}T[u_k,u_k] \\approx t \\Exp [T[u_\\infty,u_\\infty]]$. By equation~\\eqref{lec17:eqn:tensor} and equation~\\eqref{lec17:eqn:tensor1},\n\n\\begin{align}\n\\Exp [T[u,u]]_i &= \\Exp [\\sum_{j,k}T_{ijk}u_ju_k]\\\\\n&= \\sum_{j,k}T_{ijk}\\Exp[uu^\\top]_{jk} \\\\\n&= T(\\Exp[uu^\\top])_i.\n\\end{align}\n\nTherefore $\\sum_{k=0}^{t}\\Pi_{K^\\perp}T[u_k,u_k] \\approx tT(S)$ where $S \\defeq \\Exp[u_{\\infty}u_{\\infty}^\\top]$ is the covariance of the stationary distribution.\n\n\\begin{figure}[ht]\n\\includegraphics[width=0.50\\textwidth]{figures/labelnoise-static.png}\n\\centering\n\\caption{The effect of SGD noise on non-quadratic functions. $K$ is the span of the noise covariance $\\Sigma$. In the quadratic case, the iterates will fluctuate in $K$, but remain unchanged in $K^\\perp$. When the function is non-quadratic, the third order effect slowly accumulates in $K^\\perp$, resulting in implicit regularization. } \n\\label{lec17:fig:noise}\n\\end{figure}\n\n\\paragraph{Interpretation.} Intuitively, the direction of the implicit regularization is $T(S) = \\nabla_x \\left(\\langle\\nabla_x^2g(0), S\\rangle\\right)$. In other words, the implicit bias $-T(S)$ is trying to make $\\langle\\nabla^2_x g(0), S\\rangle$ small. \\cite{damian2021label} further prove that SGD with label noise on loss $\\hat{L}(\\theta)$ converges to a stationary point of the regularized loss $\\hat{L}(\\theta) + \\lambda \\textup{tr}(\\nabla^2_\\theta \\hat{L}(\\theta))$. In the next subsection, we will heuristically explain why this regularization term is useful.\n\n\n\\subsec{SGD with label noise}\nWe previously claimed that SGD with label noise minimizes the regularized loss \n\\begin{equation}\n    \\hat{L}(\\theta) + \\lambda \\textup{tr}(\\nabla^2_\\theta \\hat{L}(\\theta)).\n\\end{equation} \n\nBut why is $\\textup{tr}(\\nabla^2_\\theta \\hat{L}(\\theta))$ a useful regularizer? 
This question has been the subject of recent study in the implicit regularization literature. \\cite{wei2019improved} show that the complexity of neural networks can be controlled by its Lipschitzness. Indeed, we will see that $\\textup{tr}(\\nabla^2\\hat{L}(\\theta))$ is intimately related to the Lipschitzness of the networks. \\cite{foret2020sharpness} also discover empirically that regularizing the sharpness of the local curvature leads to better generalization performance on a wide range of tasks. In the sequel, we will unpack some of these arguments to justify regularizing by $R(\\theta) \\defeq \\textup{tr}\\left (\\nabla^2 \\hat{L}(\\theta) \\right )$.\n\nWe first consider the case of one data point, i.e. $\\hat{L}(\\theta) = \\ell(f_\\theta(x), y)$. For notational simplicity, let $f \\defeq f_{\\theta}(x)$ denote the model output, $p$ be the number of parameters, and $\\ell(f,y)$ be the loss function. Then,\n\\begin{align}\n    \\nabla^2\\hat{L}(\\theta) &= \\nabla_\\theta \\left(\\diffp{\\ell}{f}  \\cdot \\diffp{f}{\\theta}\\right) \\\\\n    &= \\nabla_\\theta \\left(\\diffp{\\ell}{f} \\cdot \\nabla_\\theta f_\\theta(x) \\right) \\\\\n    &= \\diffp[2]{\\ell}{f} \\cdot \\nabla_\\theta f_{\\theta}(x)\\nabla_\\theta f_{\\theta}(x)^\\top + \\diffp{\\ell}{f}\\underbrace{\\nabla_\\theta^2 f_{\\theta}(x)}_{\\in \\R^{p \\times p}}.\n\\end{align}\nSuppose the loss function is $\\ell(f,y) = \\frac{1}{2}(f-y)^2$. Then, observing that $\\ell$ is simply a quadratic function of $f$, we have\n\\begin{align}\\label{lec18:eq:gn-decom}\n    \\nabla^2\\hat{L}(\\theta) = 1 \\cdot \\nabla_\\theta f(x) \\nabla_\\theta f_\\theta(x)^\\top + (f-y)\\cdot\\nabla^2_\\theta f_\\theta (x),   \n\\end{align}\nNote that the first term of \\eqref{lec18:eq:gn-decom} is positive semi-definite (PSD), while the second term is not necessarily PSD. In general, \\eqref{lec18:eq:gn-decom} is referred to as the Gauss-Newton decomposition. 
Note also that for convex losses $\\ell$, \n\\begin{align}\n    \\diffp[2]{\\ell}{f} \\geq 0,\n\\end{align}\nwhich further implies that\n\\begin{align}\n    \\diffp[2]{\\ell}{f}\\nabla f_\\theta (x) \\nabla f_\\theta (x)^\\top \\succcurlyeq 0.   \n\\end{align}\n\nEmpirically, we observe that the second term $(f-y)\\nabla^2f_{\\theta}(x)$ is generally smaller. This is especially evident when $\\theta$ is at a global minimum for which $\\ell(f_\\theta, y) = 0$. In this case, $(f-y)\\nabla^2f_{\\theta}(x)=0$ because $f_{\\theta}(x)=y$. These two observations suggest that we can ignore the second term. In that case,\n\\begin{align}\n    \\mathrm{tr}\\l (\\nabla^2\\hat{L}(\\theta) \\r ) &\\approx \\diffp[2]{\\ell}{f} \\cdot \\mathrm{tr}\\l (\\nabla f(x)\\nabla f(x)^\\top\\r ) \\\\\n    &=\\diffp[2]{\\ell}{f} \\cdot \\norm{\\nabla  f_{\\theta}(x)}_2^2\n\\end{align}\nThus, minimizing $\\mathrm{tr}\\l (\\nabla^2\\hat{L}(\\theta)\\r )$ is approximately equivalent to minimizing the Lipschitzness of the model output with respect to $\\theta$, which is approximately equivalent to minimizing the Lipschitzness of the model output with respect to hidden variables.\n\nFor example, let $\\theta = (w_1, \\dots, w_r)$, then we have \n\\begin{align}\n    \\diffp{f}{{w_i}} = \\diffp{f}{{h_{i+1}'}} \\cdot h_i^\\top,\n\\end{align}\nwhere $h'_{i+1} = w_i h_i$, and $h_i$ denotes the hidden variables of the $i$-th layer and  $h'_{i+1}$ is the pre-activation of the $(i+1)$-th layer. Then,\n\\begin{align}\n    \\l \\|\\diffp{f}{{w_i}}\\r \\|_F = \\l \\|\\diffp{f}{{h_{i+1}}}\\r \\|_2\\cdot \\|h_i\\|_2.\n\\end{align}\nThis validates our claim that minimizing the Lipschitzness of the model output with respect to the parameters is (approximately) equivalent to minimizing the Lipschitzness of the model output with respect to the hidden variables. We have previously connected the latter concept to generalization of deep neural networks. 
See Section~\\ref{sec:all_layer_margin} for a discussion of the all-layer margin, a measure of Lipschitzness of the model with respect to hidden layer variables that can be directly used to bound generalization error of a deep net."
  },
  {
    "path": "tex/collection/08-03-algorithmic.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{17}{Jeff Z. HaoChen and Carrie Wu}{Mar 15th, 2021}\n\n\\sec{Algorithmic regularization for classification}\n\nIn this section, we will discuss algorithmic regularization for classification problem. In particular, we consider binary classification with logistic loss. Let $\\{(x_i, y_i)\\}_{i=1}^n$ be a separable dataset with $y_i\\in \\{\\pm 1\\}$, $x_i\\in \\R^d$. We have a linear model $h_w(x) = w^\\top x$, and we minimize the empirical logistic loss \n\\begin{align}\n\t\\hatL (w) &= \\frac{1}{n} \\sum_{i=1}^{n} \\ell(y_i h_w(x_i))\\\\\n\t&= \\frac{1}{n} \\sum_{i=1}^{n} \\ell(y_i w^\\top x_i),\n\\end{align}\nwhere $\\ell(t) = \\log(1+\\exp(-t))$ is the logistic loss.\n\nIn order to observe algorithmic regularization, we need to ensure that there exist multiple global minima for this setup. This is the case here: because the dataset is linearly separable, there exists some $w$ such that $y_i w^\\top x_i > 0$ for all $i$. Clearly any $w'$ in a small neighborhood of $w$ also classifies all the data correctly; hence, there exists an infinite number of separating classifiers $\\overline{w}$ with unit norm. For any of these $\\overline{w}$, note that $\\hatL(\\alpha\\overline{w}) \\rightarrow 0$ as $\\alpha \\rightarrow \\infty$, hence intuitively all of ``$\\infty \\overline{w}$'' classifiers are global minima.\n\nHaving shown the existence of multiple global minima, we now show that gradient descent will actually converge to the solution which maximizes the \\textit{margin}. We first define the \\textit{normalized margin} for a separating classifier $w$ as\n\\begin{align}\n\t\\gamma(w) = \\frac{\\min_{i\\in [n]} y_iw^\\top x_i }{\\|w\\|_2}.\n\\end{align}\nWe call $\\overline{\\gamma} = \\max_w \\gamma(w)$ the \\textit{max margin}. 
Now we are ready to state the theorem:\n\n\\begin{theorem}[\\cite{soudry2018implicit}]\n\tGradient descent with iterates $w_t$ converges to the direction of a max-margin solution:\n\t\n\t\\begin{align}\n\t\t\\gamma(w_t) \\rightarrow \\overline{\\gamma} \\quad  \\text{as} \\quad t \\rightarrow \\infty.\n\t\\end{align}\n\tIn other words, gradient descent on logistic loss is equivalent to the SVM.\\footnote{This result is still very limited it only works without regularization, and one needs to run gradient descent for a long time before this convergence in direction happens. Also, SVM is not always the best possible solution.}\n\\end{theorem}\n\nHere, we provide the intuition behind the proof. The proof of this theorem follows these steps:\n\\begin{enumerate}\n\t\\item By standard convex optimization arguments, $\\hatL (w_t) \\rightarrow 0$ as $t\\rightarrow \\infty$.\n\t\\item For sufficiently large $t$, $\\|w_t\\|_2 \\rightarrow \\infty$.\n\t\\item For sufficiently large $t$, $w_t$ will separate the data (since the loss goes to 0).\n\t\\item As $z \\rightarrow \\infty$, $l(z) = \\log(1 + \\exp (-z)) \\approx \\exp(-z)$ (i.e. logistic loss is similar to exponential loss). \n\t\\item When $\\|w\\|_2 = q $ is big, the loss $\\hatL(w)$ mainly depends on supporting data $\\{(x_i, y_i) : y_i\\overline{w}^\\top x_i = \\gamma(w)\\}$.\n\\end{enumerate}\n\nTo see the last bullet point: letting $\\overline{w} = w / \\|w\\|_2$, we notice that \n\\begin{align}\n\t\\hatL &= \\frac{1}{n} \\sum_{i=1}^{n} \\ell(y_i w^\\top x_i)\\\\\n\t&\\approx \\frac{1}{n} \\sum_{i=1}^{n} \\exp\\left(-qy_i \\overline{w}^\\top x_i\\right) \\label{lec17:eqn:reg-approx1} \\\\\n\t&\\approx \\frac{1}{n} \\sum_{i=1}^{n} \\exp\\left(-qy_i \\overline{w}^\\top x_i\\right) 1\\left[y_i \\overline{w}^\\top x_i = \\gamma(w)\\right] \\label{lec17:eqn:reg-approx2} \\\\\n\t&= \\frac{1}{n} \\sum_{i=1}^{n} \\exp\\left(-q\\gamma(w) \\right) 1\\left[y_i \\overline{w}^\\top x_i = \\gamma(w)\\right]. 
\\label{lec17:eqn:reg-approx3}\n\\end{align}\nHere the first approximation~\\eqref{lec17:eqn:reg-approx1} is because of the logistic loss vs. exponential loss approximation, while the second approximation~\\eqref{lec17:eqn:reg-approx2} is because for any data $x_i, y_i$ that is not a support vector, i.e.\n\\begin{align}\n\t\\overline{w}^\\top x_i y_i \\ge \\gamma(w)+\\epsilon,\n\\end{align}\nfor $\\epsilon>0$, then\n\\begin{align}\n\t\\exp(-qy_i\\overline{w}^\\top x_i) \\le \\exp(-q\\gamma(w))\\exp(-q\\epsilon),\n\\end{align}\nand as $q\\rightarrow \\infty$ the term $\\exp(-q\\epsilon)\\rightarrow 0$, making such terms negligible.\n\nIn conclusion, minimizing the (approximate) loss~\\eqref{lec17:eqn:reg-approx3} is (informally) equivalent to maximizing the margin. (Note that if you examine \\eqref{lec17:eqn:reg-approx3}, there are actually two ways to make the loss small: maximizing the margin or making $q$ large. The rigorous proof shows that when $q$ is large, the margin is already very close to the max margin. These are technical details that we will not concern ourselves with.)\n\n\\sec{Stochasticity in algorithmic regularization}\nFinally, we note that in general, when the loss has multiple global minima, any decisions we make about the optimization algorithm will make a difference. Another important source of algorithmic regularization (possibly the most important) comes from the \\textit{stochasticity} in the stochastic gradient descent (SGD) algorithm, where the parameters are optimized by updates of the form\n\\begin{equation}\n\\theta_{t+1} = \\theta_t - \\eta(\\nabla\\hat{L}(\\theta_t) + \\xi_t),\n\\end{equation}\n\nwhere $\\xi_t$ is a random noise term, typically with $\\Exp\\left[\\xi_t\\right]=0$ so the noise will not affect the result too much. The variance of $\\xi_t$ can sometimes be time-dependent: for example, $\\xi_t$ could be dependent on the parameter $\\theta_t$. 
\n\nIn practice, it turns out that larger gradient noise can lead to better generalization performance, as long as the algorithm can optimize under such a level of noise. The intuition behind this phenomenon is that SGD converges to a ``flat'' global minimum, i.e. one with small curvature and small noise covariance. On the other hand, if you have a ``sharp'' local/global minimum with a large amount of noise, SGD will not converge to it stably. There are a number of works on this topic~\\cite{haochen2020shape,blanc2020implicit}, but a lot of questions in this space remain to be answered.\n"
  },
  {
    "path": "tex/collection/09-01-data-dependent.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{18}{Kaidi Cao, Ruocheng Wang}{Mar 17th, 2021}\n\n% ===============================================\n\nIn this chapter, we discuss the Lipschitzness of models and why they seem to generalize better than arbitrary networks. To do so, we introduce a refined notion of uniform convergence that is data-dependent, and use it to derive a generalization bound for generalized margin. We end by introducing the \\textit{all-layer margin}, a specific instance of generalized margin that captures model Lipschitzness, thus allowing us to use the data-dependent generalization bound.\n\n\\sec{Lipschitzness of models and generalization} \\label{sec:all_layer_margin}\nIt has been found that the Lipschitzness of the model plays an important role in algorithmic regularization. As an illustration, note that the curvature (Hessian) of the loss function, the Lipschitzness of the model, and the noise level in SGD are all closely-related. To give a sense of the connections between them, suppose we have a model $f(x; \\theta)$, a single example $(x, y)$, and a loss function $L(\\theta) = \\ell (f(x), y) = (f(x) - y)^2$. In this setup, we have the decomposition\n\\begin{align}\n\\nabla^2 L(\\theta) = \\underbrace{{\\frac{\\partial^2 \\ell}{\\partial f^2}}}_{\\text{scalar}} \\cdot \\underbrace{\\frac{\\partial f}{\\partial \\theta}}_{\\mathbb{R}^p} \\cdot \\underbrace{\\frac{\\partial f}{\\partial \\theta}^\\top}_{\\mathbb{R}^p} + \\frac{\\partial \\ell}{\\partial f} \\cdot  \\underbrace{\\frac{\\partial^2 f}{\\partial \\theta^2}}_{\\mathbb{R}^{p\\times p}}.\n\\end{align}\n\nThis decomposition is useful because it has been found empirically that the second term is relatively small. This implies that the Hessian is somewhat dominated by the first term. 
The first term, especially $\\frac{\\partial  f}{\\partial \\theta}$, relates to the Lipschitzness of the model with respect to the parameter. (There are similar connections between other quantities.)\n\nOur algorithmic choices (e.g. SGD) seem to prefer Lipschitz models\\footnote{By this we mean the Lipschitz constant of the model is small. Also, we are not distinguishing between the Lipschitz constant w.r.t to the input and that w.r.t. to the parameter because they are actually related (not covered in the lecture).}, which implies that such models generalize better. It remains to answer the question: \\textbf{Why do Lipschitz models generalize better than arbitrary networks?} We want to theoretically analyze the relationship between Lipschitzness and generalization performance, and derive some generalization bounds w.r.t to the Lipschitzness of the models.\n\nFirst, we note that the idea of using Lipschitzness to obtain generalization bounds is not new: it is the core of non-parametric statistics. However, such bounds suffer from the ``curse of dimensionality'', that is, the sample complexity grows exponentially as the data dimension $d$. Thus, using only Lipschitzness property is not enough to explain the generalization performance of neural networks: we need the help of parameterization. \n\nConsider a deep neural network $f(x) = \\sigma(W_r\\sigma(W_{r-1}\\cdots \\sigma(W_1x)))$ for binary classification. 
Recall, \\cite{bartlett2017} showed that:\n\\begin{equation}\nL(\\theta) \\leq \\frac{R_S (\\cF) }{\\gamma},\n\\end{equation}\nwhere $\\gamma$ is the margin of the model, and $R_S (\\cF)$ is some complexity that satisfies\n\\begin{equation}\nR_S (\\cF) \\leq \\underbrace{\\l(\\prod_{i=1}^r\\|W_i\\|_{\\textup{op}} \\r)}_{\\text{relatively large}} \\cdot \\underbrace{\\l( \\sum_{i=1}^r\\frac{\\|W_i^\\top\\|^{2/3}_{2,1}}{\\|W_i\\|_{\\textup{op}}^{2/3}}\\r)^{3/2}}_{\\text{relatively small}}.\n\\end{equation}\nThe first term is essentially the upper bound on the Lipschitzness of the model w.r.t. the input over the entire space. \n\nThe limitation of this bound is that if $\\norm{W_i}_{\\textup{op}} >1$, then it grows exponentially in depth. On the other hand, if $\\norm{W_i}_{\\textup{op}} < 1$, then $f_\\theta(x)$ is exponentially small. Thus, it is very hard to make the spectral norm small while keeping the margin large. In the typical case, we have\n\\begin{align}\n    \\norm{W_1x} &\\approx \\norm{W_1}_{\\textup{op}}\\norm{x}, \\\\\n    \\norm{\\sigma(W_1x)} &\\approx \\frac{1}{\\sqrt{2}}\\norm{W_1}_{\\textup{op}}\\norm{x}.\n\\end{align}\n(The second approximation comes from the heuristic that the ReLU function $\\sigma$ will zero out about half of the entries.) Thus, heuristically the output shrinks by a factor of $\\sqrt{2}$ when passing through each layer. To make the output $f(x)\\approx \\Theta(1)$, we need $\\norm{W_i}_{\\textup{op}}\\approx \\sqrt{2}$, which makes the generalization bound very large. \n\nThe deeper cause of this problem is that $\\prod_i \\norm{W_i}_{\\textup{op}}$ is a worst-case bound on the Lipschitzness of models, since it is data-independent and assumes that the input spans over the entire space. Thus one way to improve the bound is by replacing ``worst-case Lipschitzness'' with the Lipschitzness at the data points $x^{(1)}, \\cdots, x^{(n)}$. 
This also allows us to estimate the Lipschitzness on the empirical data, and gives us a regularizer roughly in accordance with what SGD prefers.\n\nWe want to prove a bound of the form\n\\begin{equation}\\label{lec18:eqn:data_dependent}\nL(w)\\leq \\text{poly}(\\text{Lipschitzness of $f_w$ on $x^{(1)}, \\cdots, x^{(n)}$}, \\text{norms of $W_i$'s}).\n\\end{equation}\nThe RHS of \\eqref{lec18:eqn:data_dependent} can be used as an explicit regularizer in model training to improve the generalization performance. \n\n\\sec{Proving data-dependent generalization bounds}\nBefore we prove a bound of the form \\eqref{lec18:eqn:data_dependent}, we first discuss why classical uniform convergence does not work. Note that the RHS of \\eqref{lec18:eqn:data_dependent} is dependent on random variables $x^{(1)}, \\cdots, x^{(n)}$. But a typical bound of uniform convergence using Rademacher complexity is in the form\n\\begin{equation}\n\\forall f \\in \\cF,\\quad L(f) \\leq \\text{comp}(\\cF, n),\n\\end{equation}\nwhere $\\text{comp}$ is some complexity measure, or in the form\n\\begin{equation}\n\\forall f \\in \\cF, \\quad L(f) \\leq \\text{comp}(f, n).\n\\end{equation}\nThe second bound can be achieved by defining $\\cF_C= \\{f: \\text{comp}(f, n)\\leq C\\}$ first, applying the first type of bound for the class $\\cF_C$, then performing a union bound over $C$. However, this approach does not work for obtaining a bound like the RHS of~\\eqref{lec18:eqn:data_dependent} because the corresponding hypothesis class is\n\\begin{equation}\n\\cF_C = \\l\\{f: \\text{comp}(f, \\{ (x\\sp{i}, y\\sp{i}) \\}_{i=1}^n, n) \\leq C \\r\\}.\n\\end{equation}\nThere are random variables in the definition of the hypothesis class, which is not allowed for Rademacher complexity. Hence, we cannot leverage such techniques directly.\n\nTo tackle this issue, we introduce a refined version of uniform convergence. 
Suppose we can decompose the complexity measure into the sum of a property related to each data point and the function we care about:\n\\begin{align}\n\\text{comp} \\l(f, \\{x^{(i)}, y^{(i)} \\}_{i=1}^n, n \\r) = \\sum_{i=1}^n g((x^{(i)}, y^{(i)}), f).\n\\end{align}\n\nWe can define the \\textit{augmented loss} as\n\n\\begin{align}\n\\Tilde{l} (f) = l(f) \\cdot \\bm{1} (g((x, y), f) \\le C).\n\\end{align}\n\nThis means that we are changing the loss function to include the data-dependent term. An intuitive example can be found in Figure~\\ref{lec18:fig:surrogate_loss}, where we have an empirical loss with very bizarre behavior, but only outside the low complexity region that we really care about. The augmented loss notices this and ``smooths out'' the irregularities outside the low complexity region by ignoring those terms.\n\n\\begin{figure}[htpb]\n    \\centering\n    \\includegraphics[width=0.7\\textwidth]{figures/surrogate_loss.jpg}\n    \\caption{The empirical loss has bizarre behavior, but only outside the region of interest. The general idea is to define a surrogate loss and prove uniform convergence over the surrogate loss so as to avoid the bizarre behavior of the empirical loss.}\n    \\label{lec18:fig:surrogate_loss}\n\\end{figure}\n\nThe difficulty with taking this approach is that the low complexity region is random: if it was fixed, we could just zoom into that region and prove something directly by uniform convergence. We deal with this difficulty by defining a \\textit{surrogate loss} which it could just be constant outside of the low complexity region. We may then apply uniform convergence over the entire space.\n\nWe have talked about the notion of surrogate losses before in this class. For example, the margin loss/ramp loss is a type of surrogate loss. There, we thought of using the surrogate loss to make the zero-one loss more continuous. Here, we use the surrogate loss to avoid dealing with the loss function in ``bad'' regions. 
Let us define a generalized version of margin loss:\n\n\\begin{definition}[Generalized margin]\nLet $f : \\R^d \\mapsto \\R$ be a classification model. We call $g_f(x, y)$ a \\textit{generalized margin} if $g_f(x,y)$ satisfies\n\\begin{align}\n    g_f(x, y) = \\begin{cases} 0 &\\text{if } f(x)\\cdot y \\le 0 \\quad \\text{(wrong prediction)}, \\\\  > 0 &\\text{if } f(x)\\cdot y > 0. \\end{cases}\n\\end{align}\n\\end{definition}\n\nGiven this definition, we have the following lemma:\n\\begin{lemma}[Generalization bound for general margin]\n\\label{lec18:lem:generalizedmargin}\nSuppose $g_f$ is a generalized margin. Let $G = \\{ g_f : f \\in \\mathcal{F} \\}$, and assume we have an $\\epsilon$-covering of $G$ under the $\\norm{\\cdot}_\\infty$ metric, $N_{\\infty} (\\epsilon, G)$, with $|N_{\\infty} (\\epsilon, G)|  \\le \\lfloor R^2 / \\epsilon^2 \\rfloor$, where $R$ is the Rademacher complexity of the model.\n\nThen with probability larger than $1 - \\delta$ over the draw of training data, $\\forall f \\in \\mathcal{F}$ that correctly predicts the labels on the training data, we have\n\\begin{align}\n    \\Err (f) \\le \\tilO \\l( \\frac{R}{\\min_i g_f(x^{(i)}, y^{(i)})} \\cdot \\frac{1}{\\sqrt{n}} \\r) + \\tilO \\l(\\frac{1}{\\sqrt{n}} \\r).\n\\end{align}\n\\end{lemma}\n\nThe proof is similar to that for the bound we proved with margin loss earlier in the class, with only a few technical details changed.\n\n\\sec{All-layer margin}\n\nTo use Lemma~\\ref{lec18:lem:generalizedmargin}, we want to design a generalized margin $g_f(x, y)$ such that $G = \\{ g_f : f \\in \\mathcal{F} \\}$ has low complexity. We want this margin to capture the Lipschitzness of the model so that the bound will not scale badly in the worst case. If we use the standard margin $g_f(x, y) = yf(x)$, then $G$ depends on $\\prod_i \\| W_i \\|_{\\textup{op}}$; our goal is to do something better than this. 
To do so, we want to somehow have the margin depend on Lipschitzness.\n\nThe \\textit{all-layer margin}~\\cite{wei2019improved} is one such margin. Consider a perturbed model, where $\\delta = (\\delta_1, ..., \\delta_r)$ is the perturbation and the original neural network model is perturbed in the following way:\n\\begin{align}\n    h_1(x, \\delta) &= \\sigma(W_1 \\cdot x) + \\delta_1 \\| x \\|_2, \\\\\n    h_2(x, \\delta) &= \\sigma(W_2 \\cdot h_1(x, \\delta)) + \\delta_2 \\|h_1(x, \\delta) \\|_2, \\\\\n    &\\vdots \\nonumber \\\\\n    f(x, \\delta) = h_r(x, \\delta) &= \\sigma(W_r \\cdot h_{r-1}(x, \\delta)) + \\delta_r \\|h_{r-1}(x, \\delta) \\|_2.\n\\end{align}\n\nWe can then define the margin of the model as \n\n\\begin{align}\n    m_f(x,y) \\overset{\\Delta}{=} \\min_{\\delta} \\sqrt{\\sum_{i=1}^r \\|\\delta_i \\|^2} \\quad \\text{s.t. } f(x, \\delta) y \\leq 0 \\quad \\text{(incorrect prediction)}.\n\\end{align}\n\n(It can be proven that $m_f$ is indeed a generalized margin.) Under this definition, $m_f(x,y)$ is large if $f(x)$ is large (i.e. correct) and $f$ is robust to perturbation of example $x$. The good property is that under this definition, the margin already captures some Lipschitzness of the model. Applying Lemma~\\ref{lec18:lem:generalizedmargin} with $m_f$ gives the following theorem.\n\n\\begin{theorem}[Generalization bound for all-layer margin]\n\\label{lec18:thm:alllayermargin}\nWith probability larger than $1-\\delta$ over the draw of training data, \n\\begin{align}\n    \\Err (f) \\le \\tilO \\l( \\frac{\\sum_{i=1}^r \\|W_i \\|_{1,1}}{\\min_i m_f(x^{(i)}, y^{(i)})} \\cdot \\frac{1}{\\sqrt{n}} \\r) + \\tilO \\l( \\frac{1}{\\sqrt{n}} \\r),\n\\end{align}\nwhere $\\| W \\|_{1,1}$ is the sum of the absolute value of entries of $W$.\n\\end{theorem}\n\nThis theorem implies that a larger all-layer-margin implies better generalization. To get a larger all-layer-margin, we should make the network more robust to perturbation, i.e. 
more Lipschitz.\n\n\\begin{proof}\nWe present just the main proof ideas here. To use Lemma~\\ref{lec18:lem:generalizedmargin}, it suffices to show that\n\\begin{equation}\nN_\\infty (\\epsilon, G) \\leq O \\l( \\frac{\\sum_{i=1}^r \\| W_i \\|_{1,1} }{\\epsilon^2} \\r),\n\\end{equation}\nwhere $G = \\{ m_f : f \\in \\mathcal{F} \\}$. Let $\\cF_1, \\dots, \\cF_r$ be a sequence of hypothesis classes (corresponding to each layer in the network), and let $\\cF = \\{ f_r \\circ f_{r-1} \\circ \\dots \\circ f_1: f_i \\in \\cF_i \\}$. \\cite{wei2019improved} prove the following lemma:\n\n\\begin{lemma}[Decomposition lemma]\n\\label{lec18:lem:decomposition}\nLet $m \\circ \\mathcal{F} = \\{ m_f : f \\in \\mathcal{F}\\}$ denote the family of all-layer margins of function compositions in $\\mathcal{F}$. Then\n\\begin{align*}\n    \\log N_\\infty \\l( \\sqrt{\\sum_{i=1}^r \\epsilon_i^2} , m\\circ \\mathcal{F} \\r) \\le \\sum_i \\log N_\\infty (\\epsilon_i, \\mathcal{F}_i).\n\\end{align*}\n\\end{lemma}\n\nThis reduces the problem to bounding the covering number for each layer which is much easier, since each layer is basically a linear transformation plus a non-linearity.\n\\end{proof}"
  },
  {
    "path": "tex/collection/09-01-unsupervised.tex",
    "content": "\\newcommand{\\jnote}[1]{{\\color{red}\\authnoteimp{JH}{#1}}}\n\n\\metadata{18}{Haoran Xu and Lewis Liu}{Nov 17th, 2021}\n\nWe venture into unsupervised learning by first studying classical (and analytically tractable) approaches to unsupervised learning. Classical unsupervised learning usually consists of specifying a latent variable model and fitting using the expectation-maximization (EM) algorithm. However, so far we do not have a comprehensive theoretical analysis for the convergence of EM algorithms because fundamentally analyzing EM algorithms involves understanding non-convex optimization. Most analysis of EM only applies to special cases (e.g., see ~\\citet{xu2016global,daskalakis2016ten}) and it is not clear whether any of the results can be extended to more realistic, complex setups, without a fundamentally new technique for understanding nonconvex optimization. \nInstead, we will analyze a family of algorithms which are broadly referred to as spectral methods or tensor methods, which are a particular application of the method of moments~\\citep{pearson1894} with the algorithmic technique of tensor decomposition~\\citep{anandkumar2015learning}. While the spectral method appears to be not as empirically sample-efficient as EM, it has provable guarantees and arguably is more reliable than EM given the provable guarantees.\n\n%\\tnote{this paragraph require updating}\nAfter discussing the basics of classical unsupervised learning, we will move on to modern applications of deep learning. In particular, we'll focus on theoretical interpretations of contrastive learning, which is a class of successful self-supervised learning algorithms in computer vision. \n\n\\sec{Method of Moments for mixture models}\n\nWe begin by formally describing the unsupervised learning problem. 
First, assume that we are studying a family of distributions $P_{\\theta}$ parameterized by $\\theta \\in \\Theta$, where $P_{\\theta}$ can be described by a latent variable model. Then, given data $x^{(1)},...,x^{(n)}$ that is sampled i.i.d. from some distribution in $\\{P_\\theta\\}_{\\theta \\in \\Theta}$, our goal is to recover the true $\\theta$. \n\nPerhaps the most well-studied latent variable model in machine learning is the mixture of Gaussians. We consider the following model for the mixture of $k$ $d$-dimensional Gaussians. Let \n\\begin{align}\n\\theta = \\l ( (\\mu_1, \\cdots, \\mu_k), (p_1, \\cdots, p_k)\\r ),\n\\end{align}\nwhere $\\mu_i\\in \\R^d$ is the mean of the $i$-th component and $p$ is a vector of probabilities belonging to the $k$-simplex, which represents the mixture coefficient for clusters. Formally, for $\\Delta(k) \\defeq \\{ p: \\|p\\|_1 = 1, p\\geq 0, p\\in\\R^k\\}$, \n\\begin{align}\n    p = (p_1, \\cdots, p_k) \\in \\Delta(k).\n\\end{align}\nWe then sample $x \\sim P_\\theta$ in a two-step approach: \n\\begin{align}\n    i &\\sim \\text{categorical}(p), \\notag\\\\\n    x &\\sim \\cN(\\mu_i, I).\n\\end{align}\nHere $i$ is called the latent variable since we only observe $x$. Here we assume the covariances of the Gaussians to be identity, but they can also be parameters that are to be learned.\n\nThere are many other latent variable models that can be defined via a similar generative process, such as Hidden Markov Models, Independent Component Analysis, which we will discuss later. %, and Expectation-Maximization, but here we focus on the so-called Moment Method.\n\n\\subsec{Warm-up: mixture of two Gaussians}\nWe first study a simple case: the mixture of two Gaussians.\nIn this case, $k=2$, and we assume $p_1=p_2=\\frac{1}{2}$. For simplicity, we also assume $\\mu_1=-\\mu_2$, that is, the means of the two Gaussians are symmetric around the origin. To simplify our notation, let $\\mu_1=\\mu$ and $\\mu_2=-\\mu$. 
These assumptions yield the following model for $x$:\n\\begin{equation}\n    x \\sim \\frac{1}{2}\\mathcal{N}(\\mu,I) + \\frac{1}{2}\\mathcal{N}(-\\mu,I).\n\\end{equation}\nTo implement the moment method, we need to complete the following two tasks:\n\\begin{enumerate}\n    \\item Estimate the moment(s) of $x$ using empirical samples.\n    \\item Recover parameters from the moment(s) of $x$.\n\\end{enumerate}\n\nThe first moment of $x$ is\n\\begin{align}\n    M_1 &\\defeq \\Exp [x] \\\\\n    &= \\frac{1}{2}\\Exp [x|i=1]+\\frac{1}{2}\\Exp[x|i=2] \\\\\n    &= \\frac{1}{2}\\mu + \\frac{1}{2}(-\\mu) \\\\\n    &= 0.\n\\end{align}\nTherefore, the first moment provides no information about $\\mu$. We compute the second moment as\n\\begin{align}\n    M_2 &\\defeq \\Exp[xx^\\top] \\\\\n    &= \\frac{1}{2}\\Exp[xx^\\top|i=1]+\\frac{1}{2}\\Exp[xx^\\top|i=2]\n\\end{align}\nTo compute these expectations, consider an arbitrary $Z \\sim \\cN(\\mu, I)$. Then,\n\\begin{align}\n    \\Exp [ZZ^\\top] &= \\Exp[Z] \\Exp[Z]^\\top + \\Cov(Z) \\\\\n    &= \\mu \\mu^\\top + I\n\\end{align}\nRecognizing that this second moment calculation is the same for both Gaussians in our mixture, we obtain:\n\\begin{align}\n    M_2 &= \\frac{1}{2}(\\mu\\mu^\\top+I)+\\frac{1}{2}(\\mu\\mu^\\top+I) \\\\\n    &=\\mu\\mu^\\top+I\n\\end{align}\nSince the second moment provides information about $\\mu$, we can complete the two tasks required for the moment method using the second moment.\n\nIf we had access to infinite data, then we can compute the exact second moment $M_2=\\mu\\mu^\\top+I$. Then, we can recover $\\mu$ by evaluating the top eigenvector and eigenvalue of $M_2$.\\footnote{This approach is known as the spectral method.} The top eigenvector and eigenvalue of $M_2$ is $\\bar{\\mu} \\defeq \\frac{\\mu}{\\norm{\\mu}_2}$ and $\\norm{\\mu}_2^2+1$, respectively. \n\nIn practice, however, we do not have infinite data. 
In that case, we need to estimate the second moment by an empirical average.\n\\begin{align}\n    \\widehat{M}_2=\\frac{1}{n}\\sum_{i=1}^nx\\sp{i} {x\\sp{i}}^\\top\n\\end{align}\nWe can then recover $\\mu$ by evaluating the top eigenvector and eigenvalue of $\\widehat{M}_2$. However, we need this algorithm to be robust to errors, i.e., similar estimates, $\\widehat{M}_2$, of the second moment should yield similar estimates of $\\mu$. Fortunately, most algorithms we might use for obtaining the top eigenvector and eigenvalue are robust, so we can limit our attention to the infinite data case. Having outlined the moment method approach to the mixture of two Gaussians problem, we study a generalization of this problem in the sequel.\n\n\\subsec{Mixture of Gaussians with more components via tensor decomposition}\n\nThe general moment method for solving latent variable models is summarized by the following steps.\n\\begin{enumerate}\n    \\item Compute $M_1=\\Exp[x]$, $M_2=\\Exp[xx^\\top]$, $M_3=\\Exp[x\\otimes x\\otimes x],$ $M_4 = \\cdots$. Note that $x\\otimes x\\otimes x$ is in $\\mathbb{R}^{d\\times d\\times d}$ and $(x\\otimes x\\otimes x)_{ijk}=x_i\\cdot x_j\\cdot x_k$. For example, $M_{3,ijk}=\\Exp[x_ix_jx_k]$.\n    \\item Design an algorithm $A(M_1, M_2, M_3,\\dots)$ that outputs $\\theta$.\n    \\item Show that $A$ is robust to errors in our moment estimates, i.e., we apply $A$ to $(\\widehat{M}_1,\\widehat{M}_2,\\widehat{M}_3,...)$ in reality.\n\\end{enumerate}\nIn the sequel, we instantiate this paradigm for mixtures of $k$ Gaussians ($k\\geq 3$). \n\nFor the simplicity of demonstrating the idea, we assume $p_1 = \\cdots = p_k =\\frac{1}{k}$, i.e. $i \\stackrel{\\text{unif}} \\sim[k]$, and $x\\sim\\mathcal{N}(\\mu_i,I)$. Equivalently, \n\\begin{equation}\n    x\\sim\\frac{1}{k}\\sum_{i=1}^k\\mathcal{N}(\\mu_i,I).\n\\end{equation}\nIn this example, we only describe steps (1) and (2) in the general algorithm described above.  
\n\nWe first evaluate the first and second moments. The first moment follows from\n\\begin{align}\n    M_1 &=\\Exp[x] \\\\\n    &=\\sum_{i=1}^k\\frac{1}{k}\\Exp[x|i] \\\\\n    &=\\frac{1}{k}\\sum_{i=1}^k\\mu_i,\n\\end{align}\nand the second moment is computed as\n\\begin{align}\n    M_2 &= \\Exp[xx^\\top] \\\\\n    &=\\sum_{i=1}^k\\frac{1}{k}\\Exp[xx^\\top|i] \\\\\n    &=\\sum_{i=1}^k\\frac{1}{k}(\\mu_i\\mu_i^\\top+I) \\\\\n    &=\\frac{1}{k}\\sum_{i=1}^k\\mu_i\\mu_i^\\top + I.\n\\end{align}\n\n\\subsubsection{Second moments do not suffice}\nCan we recover $\\mu=(\\mu_1,...,\\mu_k)$ from $\\frac{1}{k}\\sum_{i=1}^k\\mu_i$ and $\\frac{1}{k}\\sum_{i=1}^k\\mu_i\\mu_i^\\top$? Unfortunately, in most of the cases when $k\\geq 3$, the first and second moments are not sufficient to recover $\\mu$. \n\nOne reason is the so-called ``missing rotation information'' problem. Let \n\\begin{equation}\n    U =\\begin{bmatrix} \\vrule & & \\vrule \\\\ \\mu_1 & \\cdots & \\mu_k \\\\ \\vrule & & \\vrule \\end{bmatrix} \\in\\mathbb{R}^{d\\times k}\n\\end{equation}\ndenote the matrix we aim to recover. Then, consider some rotation matrix $R\\in\\mathbb{R}^{k\\times k}$. We consider $U$ versus $U R$:\n\\begin{align}\n    \\frac{1}{k}\\sum_{i=1}^k\\mu_i\\mu_i^\\top &= \\frac{1}{k}U U^\\top \\\\\n    &=\\frac{1}{k}(U R)(U R)^\\top &\\text{($RR^\\top=I$)}\n\\end{align}\nThis result proves that the second moment is invariant to rotations. 
To prove a similar claim for the first moment, we also constrain our choice of $R$ such that\n\\begin{align}\n    R\\cdot\\Vec{1}=\\Vec{1}\n\\end{align}\nThen,\n\\begin{align}\n    \\frac{1}{k}\\sum_{i=1}^k\\mu_i&=\\frac{1}{k} U \\cdot\\Vec{1} \\\\\n    &=\\frac{1}{k} U R\\cdot\\Vec{1}\n\\end{align}\nTherefore, the first and second moments of $U$ and $U R$ are indistinguishable, and we must consider the third moment in order to identify $U$.\n\n\\subsubsection{Computing the third moment}\n\nThe third moment is the tensor $\\Exp[x \\otimes x \\otimes x] \\in \\mathbb{R}^{d \\times d \\times d}$. To express this expectation in terms of tractable quantities, we can condition on the Gaussian observed and average:\n\\begin{align}\n\t\\Exp[x \\otimes x \\otimes x] = \\frac{1}{k} \\sum_{i=1}^k \\Exp[x \\otimes x \\otimes x \\mid i]\n\\end{align}\n\nEach term in the sum now corresponds to the third moment for some multivariate Gaussian. Fortunately, Lemma~\\ref{lec19:lma:gaussian_third_moment} suggests a formula for estimating its value.\n\\begin{lemma} \\label{lec19:lma:gaussian_third_moment}\nSuppose $z \\sim \\cN(v, I)$. Then, \n\\begin{align}\n\t\\Exp[z \\otimes z \\otimes z] = v \\otimes v \\otimes v +  \\sum_{l=1}^d \\Exp[z] \\otimes e_l \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes \\Exp[z] \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes e_l \\otimes \\Exp[z] \n\\end{align}\nwhere $e_1,\\dots,e_d$ denote the canonical basis vectors.\n\\end{lemma}\nThis lemma suggests that we can compute $v \\otimes v \\otimes v$ from a linear combination of $\\Exp[z \\otimes z \\otimes z]$  and $\\Exp[z]$. Also note that $\\Exp[z] = v$. \n\n\\begin{proof}\nWe compute the third moment element-wise. 
That is,\n\\begin{align}\n\t(\\Exp[z \\otimes z \\otimes z])_{ijk} &= \\Exp[z_i  z_j z_k] \\\\\n\t&= \\Exp[(v_i + \\xi_i)\\cdot (v_j + \\xi_j) \\cdot (v_k + \\xi_k)]  &\\text{$(z = v + \\xi, \\xi \\sim \\cN(0, I))$}\\\\\n\t&= v_i v_j v_k + \\underbrace{\\Exp [v_i v_j \\xi_k] +  \\Exp [v_i \\xi_j v_k] +  \\Exp [\\xi_i v_j v_k]}_{=0} \\nonumber \\\\ \n\t&\\quad + \\Exp[v_i \\xi_j \\xi_k] + \\Exp[v_j \\xi_i \\xi_k] + \\Exp[v_k \\xi_i \\xi_j] + \\Exp[\\xi_i \\xi_j \\xi_k] \\label{lec19:eqn:higher_moments} \n\\end{align}\nTo explicitly compute the last four terms in \\eqref{lec19:eqn:higher_moments}, we note that:\n\\begin{align}\n    \\Exp[\\xi_i \\xi_k] = \\begin{cases} 0 &\\text{if $i \\neq k$} \\\\ \\Exp[\\xi_i^2] = 1 &\\text{if $i = k$} \\end{cases}\n\\end{align} \nand that for any choice of $i, j,$ and $k$,\n\\begin{align}\n    \\Exp[\\xi_i \\xi_j \\xi_k] = 0.\n\\end{align}\nTherefore, \n\\begin{equation}\n\t(\\Exp[z \\otimes z \\otimes z])_{ijk} = v_i v_j v_k + v_i \\ind{j=k} +  v_j \\ind{i=k}  +  v_k \\ind{i=j} \n\\end{equation}\n\nSince $(\\sum_{l=1}^d v \\otimes e_l \\otimes e_l)_{ijk} = \\sum_{l=1}^d v_i  e_{lj}  e_{lk} = v_i \\bbI(j=k)$, we have proven that:\n\\begin{equation}\n\t\\Exp[z \\otimes z \\otimes z] = v \\otimes v \\otimes v +  \\sum_{l=1}^d v \\otimes e_l \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes v \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes e_l \\otimes v.\n\\end{equation} \n\\end{proof}\n\nWe can now apply Lemma~\\ref{lec19:lma:gaussian_third_moment} to compute the third moment of the mixture of $k$ Gaussians. 
In particular,\n\\begin{align}\n\t\\Exp[x \\otimes x \\otimes x] & =  \\frac{1}{k} \\sum_{i=1}^k \\Exp[x \\otimes x \\otimes x \\mid i] \\\\\n\t&=  \\frac{1}{k} \\sum_{i=1}^k \\l (\\mu_i \\otimes \\mu_i \\otimes \\mu_i +  \\sum_{l=1}^d \\mu_i \\otimes e_l \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes \\mu_i \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes e_l \\otimes \\mu_i \\r ) \\\\\n\t&=  \\frac{1}{k} \\sum_{i=1}^k \\mu_i \\otimes \\mu_i \\otimes \\mu_i +  \\sum_{l=1}^d \\frac{1}{k} \\l (\\sum_{i=1}^k \\mu_i \\r ) \\otimes e_l \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes \\frac{1}{k} \\l (\\sum_{i=1}^k \\mu_i \\r ) \\otimes e_l \\nonumber \\\\\n    &\\quad + \\sum_{l=1}^d  e_l \\otimes e_l \\otimes \\frac{1}{k} \\l (\\sum_{i=1}^k \\mu_i \\r) \\\\\n\t& =  \\frac{1}{k} \\sum_{i=1}^k \\mu_i \\otimes \\mu_i \\otimes \\mu_i +  \\sum_{l=1}^d \\Exp[x] \\otimes e_l \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes \\Exp[x] \\otimes e_l + \\sum_{l=1}^d  e_l \\otimes e_l \\otimes \\Exp[x]\\\\\n\\end{align} \n\nFor notational convenience, let\n\\begin{equation}\n    a^{\\otimes 3} \\defeq a \\otimes a \\otimes a.\n\\end{equation} \nSo far, we have shown how to compute $\\frac{1}{k} \\sum_{i=1}^k \\mu_i^{\\otimes 3}$ from $\\Exp[x^{\\otimes 3}]$ and $\\Exp[x]$. In the sequel, we will formalize the remaining problem, recovering $\\{\\mu_i\\}_{i = 1}^k$ from $\\frac{1}{k} \\sum_{i = 1}^k \\mu_i^{\\otimes 3}$, as the tensor decomposition problem, and discuss efficient algorithms for it.\n\n\\paragraph{Tensor decomposition}\nRecovering the Gaussian means, $\\{\\mu_i\\}_{i = 1}^k$,  from the third mixture moment, $\\frac{1}{k} \\sum_{i = 1}^k \\mu_i^{\\otimes 3}$, is a special case of the general tensor decomposition problem. That problem is set up as follows: Assume that $a_1, \\cdots a_k \\in \\bbR^d$ are unknown. Then, given $\\sum_{i=1}^k a_i^{\\otimes 3}$, our goal is to reconstruct the $a_i$ vectors. 
\n\nBefore we present some standard results on tensor decomposition, we first describe some basic facts about tensors. Much like matrices, tensors have an associated rank. For example, $a \\otimes b \\otimes c$ is a rank-1 tensor. In general, the rank of a tensor $T$ is the minimum $k$ such that $T$ can be decomposed as \n\\begin{equation}\n    T = \\sum_{i=1}^k a_i \\otimes b_i \\otimes c_i.\n\\end{equation} \nfor some $\\{a_i\\}_{i =1}^k, \\{b_i\\}_{i =1}^k, \\{c_i\\}_{i =1}^k$.\nAnother difference between tensors and matrices is that the former objects do not have the typical rotational invariance. In particular, consider applying a right rotation $R\\in \\R^{k\\times k}$ to the matrix \\begin{equation}\nA =\\begin{bmatrix} \\vrule & & \\vrule \\\\ a_1 & \\cdots & a_k \\\\ \\vrule & & \\vrule \\end{bmatrix} \\in\\mathbb{R}^{d\\times k}\n\\end{equation}\nand get \\al{\\widetilde{A} = AR = \\begin{bmatrix} \\vrule & & \\vrule \\\\ \\tilde{a}_1 & \\cdots & \\tilde{a}_k \\\\ \\vrule & & \\vrule \\end{bmatrix} \\in\\mathbb{R}^{d\\times k}}\n\nThen, \n\n\\al{\\sum_{i=1}^k a_ia_i^\\top = AA^\\top = (AR)\\cdot (AR)^\\top = \\sum_{i=1}^k \\tilde{a}_i\\tilde{a}_i^\\top}\n%In particular, \nHowever, there is no tensor analogue to the rotation invariance result above. \n%\\begin{equation}\n%    AA^\\top = A RR^\\top A^\\top.\n%\\end{equation}\nBut tensors do maintain an interesting (and useful) permutation invariance; that is, $T = \\sum_{i = 1}^k a_i^{\\otimes 3}$ is invariant to permutations of the indices of $a_i$. 
Or in other words, let $P\\in \\R^{k\\times k}$ be a permutation matrix, and let \\al{\n\t\\widetilde{A} = AP = \\begin{bmatrix} \\vrule & & \\vrule \\\\ \\tilde{a}_1 & \\cdots & \\tilde{a}_k \\\\ \\vrule & & \\vrule \\end{bmatrix} \\in\\mathbb{R}^{d\\times k}\n}\nThen, \n\\al{\n\t\\sum_{i=1}^k a_i^{\\otimes 3} =  \\sum_{i=1}^k \\tilde{a}_i^{\\otimes 3}\n}\nThe lack of rotation invariance in the sense above and the existence of permutation invariance make tensor decomposition computationally challenging as well as powerful. \n\nWe now summarize some standard results regarding tensor decomposition for $T = \\sum_{i = 1}^k a_i^{\\otimes 3}$. The results for decomposing the asymmetric version  $T = \\sum_{i = 1}^k a_i\\otimes b_i \\otimes c_i$ are largely analogous. We will not prove these results here.\n\\begin{enumerate}\n\\setcounter{enumi}{-1}\n\\tnotelong{Tengyu will add references here}\n\\item  In the most general case, recovering the $a_i$'s from $T$ is computationally hard. Any procedure will either fail to find a unique $a_i$ or it fails to find $a_i$ \\emph{efficiently}. \n\\item In the orthogonal case, i.e. $a_1,\\dots,a_k$ are orthogonal vectors, each $a_i$ is a global maximizer of \n\\begin{equation}\n    \\max_{\\|x\\|_2 = 1} T(x,x,x) = \\sum_{i,j,k} T_{ijk} x_i x_j x_k\n\\end{equation}\nWe can heuristically think of $a_i$ as eigenvectors of $T$ and there exists an algorithm to compute $a_i$ in poly-time.\n\\item In the independent case, i.e. $a_1,\\dots,a_k$ are linearly independent, Jennrich's algorithm can be used to efficiently recover $\\{a_i\\}_{i = 1}^k$.\n\\end{enumerate} \nResults 1 and 2 above both involve the so-called ``under-complete'' case ($k \\leq d$), e.g., when the number of Gaussians in the mixture is smaller than the dimension of the data. 
Next, we describe certain overcomplete cases for which efficient tensor decomposition is possible.\n\\begin{enumerate}\n\\setcounter{enumi}{2}\n\\item Suppose $a_1^{\\otimes2},\\dots,a_k^{\\otimes2}$ are linearly independent for $k \\leq d^2$. Then, applying Result 2, we can recover $a_i$ from $\\sum_{i=1}^k (a_i^{\\otimes2})^{\\otimes3} = \\sum_{i=1}^k (a_i^{\\otimes6}) \\in \\bbR^{d^6}$.\n\\item Excluding an algebraic set of measure $0$, we can use the FOOBI algorithm to recover $a_i$ from the fourth-order tensor $\\sum_{i = 1}^k a_i^{\\otimes 4}$ when $k \\leq d^2$. A robust version of the FOOBI algorithm is described in \\citet{ma2016poly}.\n\\item Assume $a_i$ are \\emph{randomly} generated unit vectors. Then, for the third order tensor, $k$ can be as large as $d^{1.5}$ \\cite{ma2016poly, schrammsteurer17}. \n\\end{enumerate}\n\nIn summary, the moment method is a recipe in which we first compute high order moments (i.e. tensors), assume that these estimates are noiseless, and decompose these tensors to recover the latent variables. Though we do not discuss these results here, there is an extensive literature analyzing the robustness of the moment method to error in the moment estimates. Last, we note that even though we only explicitly analyze the mixture of Gaussians model here, latent variable models amenable to analysis by the moment method include ICA, Hidden Markov Models, topic models, etc.\n\n\\sec{Graph Decomposition and Spectral Clustering}\\label{section:spectral_clustering}\nIntroduced by \\citet{shi2000normalized} and \\citet{ng2001spectral}, spectral clustering learns a model for the data points using a \\emph{pairwise} notion of similarity. 
Formally, assume that we are given $n$ data points $x\\sp{1}, \\dots, x\\sp{n}$ as well as a similarity matrix $G \\in \\bbR^{n\\times n}$ such that \n\\begin{equation}\n    G_{ij} = \\rho (x\\sp{i}, x\\sp{j})\n\\end{equation}\nwhere $\\rho$ is some measure of similarity that assigns larger values to more similar pairs of points. \n\nFor example, $x\\sp{i}$ could denote images for which $\\rho (x\\sp{i}, x\\sp{j})$ measures the semantic similarity. Alternatively, $x\\sp{i}$ might be users of a social network and $\\rho (x\\sp{i}, x\\sp{j}) = 1$ if $x\\sp{i}$ and $x\\sp{j}$ are friends (hence usually share similar interests / jobs / $\\cdots$). \n\nWe note that in moment methods, $\\Exp[xx^\\top]$ captures pairwise information between coordinates / dimensions, whereas matrix $G$ here captures pairwise information between datapoints.\n\nOur goal is to cluster the data points by viewing $G$ as a graph. For instance, in the social network example, we can naturally view $G$ as the adjacency matrix of an \\emph{unweighted} graph, where $G_{ij} \\in \\{0, 1\\}$ defines the edges. Then, the clustering problem is to partition the graph into distinct neighborhoods, i.e., components that are as separate from each other as possible. As we will see repeatedly in the sequel, the eigendecomposition of $G$ is closely related to this graph partitioning / clustering problem.\n\n\\begin{figure}[ht]\n\t\\centering\n\t\\includegraphics[width=2in]{figures/ssl1.pdf}\n\t\\caption{A demonstration of graph partitioning. Sets $S_1$ and $S_2$ form a good partition of the graph since there's only one edge between them.}\n%\t\\label{lec15:fig:OLgame}\n\\end{figure}\n\n\n\\subsec{Stochastic block model} \nIn the stochastic block model (SBM), $G$ is assumed to be generated randomly from two hidden communities. Formally, \n\\begin{equation}\n    \\{ 1, \\cdots, n \\} = S \\cup \\bar{S},\n\\end{equation}\nwhere $S$ and $\\bar{S}$ partition $[n]$. Assume $|S| = \\frac{n}{2}$. 
We then assume the following generative model for $G$. \nIf $i,j \\in S$ or $i,j \\in \\bar{S}$, then \n\\begin{align}\n    G_{ij} = \\begin{cases}\n        1 &\\text{w.p. $p$} \\\\\n        0 &\\text{w.p. $1-p$} \\end{cases}.\n\\end{align} \nOtherwise, for $i$ and $j$ in distinct components, \n\\begin{align}\n    G_{ij} = \\begin{cases}\n        1 &\\text{w.p. $q$} \\\\\n        0 &\\text{w.p. $1-q$} \\end{cases}\n\\end{align} \nfor $p > q$ (i.e., more likely to be connected if from the same group). For instance, $S$ and $\\bar{S}$ could mean two companies, and $i\\in[n]$ is a user of a social network. Two users $i, j$ are more likely to know each other if they are in the same company.\n\n\\begin{figure}[ht]\n\t\\centering\n\t\\includegraphics[width=2in]{figures/ssl2.pdf}\n\t\\caption{A graph generated by the stochastic block model with $p=\\frac{2}{3}$ and $q=\\frac{1}{5}$.}\n\t%\t\\label{lec15:fig:OLgame}\n\\end{figure}\n\n\nOur goal is then to recover $S$ and $\\bar{S}$ from $G$; the primary tool we use towards this goal is the eigendecomposition of $G$.\n\nIn some trivial cases, it is not necessary to eigendecompose $G$ to recover the two hidden communities. Suppose, for instance, that $p = 0.5$ and $q = 0$. Then, the graph represented by $G$ will contain two connected components that correspond to $S$ and $\\bar{S}$.\n\nAs a warm-up to motivate our approach, we eigendecompose $\\bar{G} = \\Exp[G]$. 
Observe that\n\\begin{align}\n    \\bar{G}_{ij} = \\begin{cases}\n        p &\\text{if $i,j$ from the same community} \\\\\n        q &\\text{o.w.} \\end{cases}.\n\\end{align}\nIt is then easy to see that $\\bar{G}$ is a matrix of rank $2$:\n\\begin{align}\n    \\bar{G} = \\left[\n        \\begin{array}{c|c}\n        p \\cdots p & q \\cdots q \\\\\n        \\vdots & \\vdots \\\\\n        p \\cdots p & q \\cdots q\\\\\n        \\hline\n        q \\cdots q & p \\cdots p \\\\\n        \\vdots & \\vdots \\\\\n        q \\cdots q & p \\cdots p \n        \\end{array}\n        \\right].\n\\end{align}\n\\begin{lemma} \\label{lec19:lma:sbm_eigen}\nLet $\\bar{G} = \\Exp[G]$ for the stochastic block model. Then, letting $u_i(A)$ denote the $i$-th eigenvector of the matrix $A$,\n\\begin{align}\n    u_1(\\bar{G}) &= \\vec{1} \\label{lec19:eqn:top_eig_G}\\\\\n    u_2(\\bar{G}) &= [\\underbrace{1, \\dots, 1}_{|S|}, \\underbrace{-1, \\dots, -1}_{|\\bar{S}|}]^\\top \\label{lec19:eqn:second_eig_G}\n\\end{align}\nwhere $u_2(\\bar{G})$ has $|S|$ entries of $1$ and $|\\bar{S}|$ entries of $-1$.\n\\end{lemma}\n\n\\begin{proof}\nWe begin by directly proving \\eqref{lec19:eqn:top_eig_G}.\n\\begin{align}\t\n\t\\bar{G} \\cdot \\vec{1}  &= \\begin{bmatrix}\n           \\frac{pn}{2} + \\frac{qn}{2} \\\\\n           \\vdots \\\\\n           \\frac{pn}{2} + \\frac{qn}{2}\n         \\end{bmatrix} \\\\\n         &= \\frac{p+q}{2} \\cdot n \\cdot \\vec{1}.\n\\end{align}\nMore generally, $\\vec{1}$ is the top eigenvector for any matrix with fixed row sum or any graph with uniform degree (i.e., regular graph). \n\nNext, we prove \\eqref{lec19:eqn:second_eig_G}. 
Let \n\\begin{align}\n    G' = \\left[\n        \\begin{array}{c|c}\n        r \\cdots r \\\\\n        \\vdots & \\makebox{\\text{\\huge 0}} \\\\\n        r \\cdots r \\\\\n        \\hline\n        & r \\cdots r \\\\\n        \\makebox{\\text{\\huge 0}} & \\vdots \\\\\n        & r \\cdots r\n        \\end{array}\n        \\right]\n\\end{align}\nfor $r = p - q$. To precisely define $G'$, we note that $G'$ is block diagonal with two blocks of size $|S|$ and $|\\bar{S}|$, respectively. Then, \n\\begin{align}\t\n\t\\bar{G} &= \\vec{1} \\vec{1}^\\top q + G'. \\label{lec19:eqn:barG}\n\\end{align}\nThus,\n\\begin{align}\n G' \\cdot u &= \\left[\n    \\begin{array}{c|c}\n    r \\cdots r \\\\\n    \\vdots & \\makebox{\\text{\\huge 0}} \\\\\n    r \\cdots r \\\\\n    \\hline\n    & r \\cdots r \\\\\n    \\makebox{\\text{\\huge 0}} & \\vdots \\\\\n    & r \\cdots r\n    \\end{array}\n    \\right] \\cdot \\begin{bmatrix}\n           1 \\\\ \\vdots \\\\ 1\\\\ -1 \\\\\n           \\vdots \\\\\n           -1\n         \\end{bmatrix} = r \\cdot \\frac{n}{2} \\cdot u. \\label{lec19:eqn:gprimeu}\n\\end{align}\nThen, because $u \\perp \\vec{1}$, we can combine \\eqref{lec19:eqn:barG} and \\eqref{lec19:eqn:gprimeu} to obtain\n\\begin{align}\n\\bar{G} \\cdot u =  G' \\cdot u =  r \\cdot \\frac{n}{2} \\cdot u  = \\frac{p-q}{2} \\cdot n \\cdot u\n\\end{align}\nThus, $u$ has eigenvalue $\\frac{p-q}{2} \\cdot n$. \n\\end{proof}\n\n\\begin{remark}\n    Lemma~\\ref{lec19:lma:sbm_eigen} shows that\n    \\begin{equation}\n        \\bar{G} = \\frac{p + q}{2} \\cdot \\vec{1} \\vec{1}^\\top + \\frac{p - q}{2} \\cdot u u^\\top.\n    \\end{equation}\n\\end{remark}\nMore generally, when we have more than two clusters in the graph, $G'$ is block diagonal with more than two blocks. In this setting, the eigenvectors will still align with the blocks. We illustrate this below for a generic block diagonal matrix. 
Let \n\\begin{align}\n    A = \\left[\n        \\begin{array}{c|c|c}\n        1 \\cdots 1 &&\\\\\n        \\vdots & \\makebox{\\text{\\huge 0}} & \\makebox{\\text{\\huge 0}} \\\\\n        1 \\cdots 1 &&\\\\\n        \\hline\n        & 1 \\cdots 1 \\\\\n        \\makebox{\\text{\\huge 0}} & \\vdots & \\makebox{\\text{\\huge 0}} \\\\\n        & 1 \\cdots 1 \\\\\n        \\hline\n        & & 1 \\cdots 1 \\\\\n        \\makebox{\\text{\\huge 0}}& \\makebox{\\text{\\huge 0}} & \\vdots \\\\\n        & & 1 \\cdots 1\n        \\end{array}\n        \\right]\n\\end{align}\n\nThen, the three eigenvectors of $A$ are \n\\begin{align}\n    \\begin{bmatrix}\n        1 \\\\ \\vdots \\\\ 1\\\\ 0 \\\\\n        \\vdots \\\\\n        0 \\\\ 0 \\\\ \\vdots \\\\ 0\n      \\end{bmatrix}, \\begin{bmatrix}\n        0 \\\\ \\vdots \\\\ 0\\\\ 1 \\\\\n        \\vdots \\\\\n        1 \\\\ 0 \\\\ \\vdots \\\\ 0\n      \\end{bmatrix}, \\begin{bmatrix}\n        0 \\\\ \\vdots \\\\ 0\\\\ 0 \\\\\n        \\vdots \\\\\n        0 \\\\ 1 \\\\ \\vdots \\\\ 1\n      \\end{bmatrix} \\label{lec19:eqn:eigenmatrix}\n\\end{align}\nFurthermore, the rows of the matrix formed by the three eigenvectors given by \\eqref{lec19:eqn:eigenmatrix} clearly give the cluster IDs of the vertices in $G$. Note also that permutations of $A$ will result in equivalent permutations in the coordinates of each of the three eigenvectors.\n\nNext, we relate this observation to the result in Lemma~\\ref{lec19:lma:sbm_eigen}. While there are no negative values in the eigenvectors given in \\eqref{lec19:eqn:eigenmatrix}, we observe that any linear combination of eigenvectors with the same eigenvalue is also an eigenvector, so recovering a solution that looks more like \\eqref{lec19:eqn:second_eig_G} is straightforward. Indeed, taking linear combinations of the eigenvectors defined above shows that there is an alternative eigenbasis that includes the all-ones vector, $\\vec{1}$. However, for this choice of $A$, the all-ones vector is not the unique top eigenvector. 
For that to be the case, we require background noise in $\\bar{G}$.\n\nIn reality, we only observe $G$. In the sequel, we will show that in terms of the spectrum, $G \\approx \\Exp[G]$. Formally, we will leverage earlier concentration results to prove that $\\norm{G - \\Exp[G]}_{\\text{op}}$ is small. Concretely, then,\n\\begin{align}\n    G &= (G - \\Exp[G]) + \\Exp[G] \\\\\n    &= (G - \\Exp[G]) + \\frac{p + q}{2} \\cdot \\vec{1} \\vec{1}^\\top + \\frac{p - q}{2} \\cdot u u^\\top\n\\end{align}\nRearranging, we obtain that:\n\\begin{align}\n    G - \\frac{p + q}{2} \\cdot \\vec{1} \\vec{1}^\\top &= (G - \\Exp[G]) + \\frac{p - q}{2} \\cdot u u^\\top\n\\end{align}\nWe then hope that $G - \\Exp[G]$ is a small perturbation, so that the top eigenvector of $G - \\frac{p + q}{2} \\cdot \\vec{1} \\vec{1}^\\top$ is close to $u$. Namely, it suffices to show that \n\\begin{equation}\n    \\norm{G - \\Exp[G]}_{\\text{op}} \\ll \\l \\|\\frac{p - q}{2} \\cdot uu^\\top\\r \\|_{\\text{op}}.\n\\end{equation}\n\n\\metadata{20}{Miria Feng and Christopher Wolff}{Dec 1st, 2021}\n\nWe will start by proving the following lemma.\n\\begin{lemma}\nWith high probability,\n\\begin{align}\n    \\norm{ G - \\Exp[G] }_{\\mathrm{op}} \\leq O (\\sqrt{n \\log n} ) \\;.\n\\end{align}\n\\end{lemma}\n\nNote that this concentration inequality is different from the ones we have seen in the course so far because both $G$ and $\\Exp[G]$ are matrices, not scalars. 
Our goal will be to turn the quantity on the LHS into something that we are familiar with.\n\n\\begin{proof}\nThe key idea is that we can use uniform convergence, after noting that\n\\begin{align}\n    \\norm{ G - \\Exp[G] }_{\\mathrm{op}} &= \\max_{\\norm{ v }_2 = 1} \\left\\vert v^\\top (G - \\Exp[G]) v \\right\\vert \\\\\n    &= \\max_{\\norm{ v }_2 = 1} \\left\\vert v^\\top G v - v^\\top \\Exp[G] v \\right\\vert \\\\\n    &= \\max_{\\norm{ v }_2 = 1} \\left\\vert \\sum_{i, j \\in [n]} v_i v_j G_{ij} - \\Exp \\left[ \\sum_{i, j \\in [n]} v_i v_j G_{ij} \\right] \\right\\vert \\;.\n\\end{align}\nNow, the quantity inside the $\\max$ is the difference between the sum of independent random variables and their expectation, which we are familiar with. We can use brute force discretization to deal with the $\\max$. First, note that for a fixed $v$ with $\\norm{ v }_2 = 1$, we can use Hoeffding's inequality to find that\n\\begin{align}\n    \\Pr \\left( \\left\\vert \\sum_{i, j \\in [n]} v_i v_j G_{ij} - \\Exp \\left[ \\sum_{i, j \\in [n]} v_i v_j G_{ij} \\right] \\right\\vert \\geq \\epsilon \\right) \\leq \\exp(-\\frac{\\epsilon^2}{2}) \\;.\n\\end{align}\nThen, we choose $\\epsilon = O(\\sqrt{n \\log n})$, take a discretization of the unit ball with granularity $1/n^{O(1)}$ (which yields a cover of cardinality  $\\exp(n \\log n)$), and take a union bound over this discretized set to achieve the desired result.\n\\end{proof}\n\n\\begin{remark}\nComparing this bound to $\\frac{p - q}{2} \\cdot n$, we can deduce that if $p - q \\gg \\frac{\\sqrt{\\log n}}{\\sqrt{n}}$, then we can recover the vector $u$ approximately. Via a post-processing step that we do not discuss here, $u$ can actually be recovered exactly. 
\n\\end{remark}\n\n\\subsec{Clustering the worst-case graph}\\label{subsec:clustering_worst_graph}\n\n%\\tnote{I wonder whether it's easier to use $A$ for adjacency matrix instead of overloading $G$}\n\nGiven a graph $G(V, E)$ where $V$ denotes the set of vertices and $E$ the set of edges, we define the {\\it conductance} of a component $S$ as\n\\begin{align}\n    \\phi(S) &\\defeq \\frac{\\vert E(S, \\bar{S}) \\vert}{\\operatorname{Vol}(S)}\n\\end{align}\nwhere $E(S, \\bar{S})$ is the total number of edges between $S$ and $\\bar{S}$, and $\\operatorname{Vol}(S)$ is the total number of edges connecting to $S$. To be precise, let $A$ be the adjacency matrix of $G$, \n\\begin{align}\n    E(S, \\bar{S}) &= \\sum_{i \\in S, j \\in \\bar{S}} A_{ij} \\\\\n    \\operatorname{Vol}(S) &= \\sum_{i \\in S, j \\in [n]} A_{ij} \\;.\n\\end{align}\nIntuitively, conductance captures how separated $S$ and $\\bar{S}$ are. \n\nSince $\\operatorname{Vol}(S) \\geq E(S, \\bar{S})$, it follows that $\\phi(S) \\leq 1$. Next, observe that $\\operatorname{Vol}(S) + \\operatorname{Vol}(\\bar{S}) = \\operatorname{Vol}(V)$. Then, if $\\operatorname{Vol}(S) \\leq \\operatorname{Vol}(V)/2$, it must be the case that $\\operatorname{Vol}(S) \\leq \\operatorname{Vol}(\\bar{S})$ and therefore $\\phi(S) \\geq \\phi(\\bar{S})$. In some sense, this suggests that the conductance of a set $\\bar{S}$ with volume larger than $\\operatorname{Vol}(V)/2$ could be misleading, because the conductance of the other part could be larger. Therefore, typically people only consider the conductance of a smaller part of the partition. 
\n%In subsequent analysis, we assume without loss of generality that $\\operatorname{Vol}(S) \\leq \\operatorname{Vol}(V)/2$.\n\nNext, we define $\\phi(G)$ to be the {\\it sparsest cut} of $G$:\n\\begin{align}\n    \\phi(G) &= \\min_{S: \\operatorname{Vol}(S) \\leq \\operatorname{Vol}(V)/2} \\phi(S) \\;.\n\\end{align}\nOne may wonder why we need to normalize by $\\operatorname{Vol}(S)$ in the definition of conductance. The reason is that $E(S, \\bar{S})$ itself is typically minimized when $S$ is small. Thus, without this normalization, the sparsest cut would not be very meaningful. For instance, suppose the graph $G$ contains $N$ nodes and can be divided into two halves each containing $N/2$ nodes, and every node is connected to all the other nodes in the same half, but is connected to only $2$ nodes in the other half (as shown in Figure~\\ref{fig:ssl1}). Then, we can consider two subsets $S_1$ and $S_2$, where $S_1$ contains half the nodes, and $S_2$ contains two nodes in the same half. It's easy to see that $E(S_1, \\bar{S}_1) = \\frac{N}{2}\\cdot 2 > E(S_2, \\bar{S}_2) = \\frac{N}{2}$. However, the conductance of $S_1$ is $\\phi(S_1) = \\frac{E(S_1, \\bar{S}_1)}{\\operatorname{Vol}(S_1)} = \\frac{\\frac{N}{2}\\cdot 2}{\\frac{N}{2}\\cdot(\\frac{N}{2}-1)+\\frac{N}{2}\\cdot 2}\\approx\\frac{4}{N}$, whereas the conductance of $S_2$ is $\\phi(S_2) = \\frac{E(S_2, \\bar{S}_2)}{\\operatorname{Vol}(S_2)}=\\frac{\\frac{N}{2}}{N+2} \\approx\\frac{1}{2}$. Thus, when $n$ is large, $S_1$ is a sparser cut than $S_2$ under $\\phi(\\cdot)$. \n\n\n\\begin{figure}[ht]\n\t\\centering\n\t\\includegraphics[width=2in]{figures/ssl3.pdf}\n\t\\caption{A demonstration of the sparsest cut. 
$S_1$ is a sparser cut than $S_2$.}\n\t\\label{fig:ssl1}\n\\end{figure}\n\n\nOur goal is to find an approximate sparsest cut $\\hat{S}$ such that $\\phi(\\hat{S}) \\approx \\phi(G)$.\\footnote{Finding the exact sparsest cut is an NP-hard problem.} Our main technique is eigendecomposition or spectral clustering, though in the literature much more advanced and better algorithms have been proposed, e.g., the famous ARV algorithm~\\cite{arora2009expander}. Let $d_i = \\operatorname{Vol}(\\{i\\})$ be the degree of node $i$, and let $D = \\text{diag}(\\{d_i\\})$ be the diagonal matrix that contains the degrees $d_i$ as entries. Furthermore, let \n\\begin{equation}\n    \\bar{A} = D^{-\\frac{1}{2}} A D^{-\\frac{1}{2}}\n\\end{equation}\nbe the normalized adjacency matrix. This is equivalent to normalizing each element $A_{ij}$ of the adjacency matrix by $\\frac{1}{\\sqrt{d_i d_j}}$ (i.e., $\\bar{A}_{ij} = \\frac{A_{ij}}{\\sqrt{d_i d_j}}$).\nIn most cases, it suffices to start by considering $G$ as a regular graph (whose degrees are all the same), because the proof for regular graphs can oftentimes extend to general graphs easily. Assuming $G$ is a $\\kappa$-regular graph, i.e. $d_i = \\kappa$; then, this normalization simply scales $A$ by $\\frac{1}{\\kappa}$.\n\n Let $L = I - \\bar{A}$ be the Laplacian matrix. Note that any eigenvector of $L$ is also an eigenvector of $\\bar{A}$. Suppose $L$ has eigenvalues $\\lambda_1 \\leq \\hdots \\leq \\lambda_n$ with corresponding eigenvectors $u_1 \\hdots u_n$, then $\\bar{A}$ has eigenvalues $1 - \\lambda_1 \\geq \\hdots \\geq 1 - \\lambda_n$ with the same eigenvectors.\n \n The following famous Cheeger's inequality relates the eigendecompositions to the graph partitioning. 
\n\n\\begin{theorem}[Cheeger's inequality]\nThe second eigenvalue of $L$, namely $\\lambda_2$, is related to the sparsest cut $\\phi(G)$ as follows:\n\\begin{align}\n    \\frac{\\lambda_2}{2} \\leq \\phi(G) \\leq \\sqrt{2 \\lambda_2} \\;.\n\\end{align}\nMoreover, we can find $\\hat{S}$ such that $\\phi(\\hat{S}) \\leq \\sqrt{2 \\lambda_2} \\leq 2 \\sqrt{\\phi(G)}$ efficiently by rounding the second eigenvector. Suppose $u_2 = [\\beta_1 \\cdots \\beta_n]^\\top \\in \\bbR^n$ is the second eigenvector of $L$. Then we can choose a threshold $\\tau = \\beta_i$ and consider $\\hat{S}_i = \\{ j \\in [n] : \\beta_j \\leq \\tau \\}$. At least one such $\\hat{S}_i$ satisfies $\\phi(\\hat{S}_i) \\leq 2 \\sqrt{\\phi(G)}$.\n\\end{theorem}\n\nNote that this can be viewed as a general but weaker version of the theorem that we proved for stochastic block model. There is no randomized assumption; the conclusion is weaker than those for SBM; also the rounding algorithm to recover the cluster is also more complicated---one has to try multiple thresholding instead of using threshold $1/2$. \n\nWe will refer the readers to~\\citet{chung2007four} for the proof of the theorem. Here below we give a few basic lemmas that help build up intuitions on why eigendecompositions relate to graph decomposition.\n\nFirst, one might wonder why this algorithm uses the second eigenvector of $\\bar{A}$, but not the first eigenvector. As we have seen in the SBM case, the first eigenvector captures the background in some sense. Here for general graph, we see the same phenomenon. The top eigenvector is generally not that interesting as it only captures the ``background density'' of the graph. For instance, when $A$ is $\\kappa$-regular, $\\vec{1}$ is the top eigenvector of $A$ and is thus also the top eigenvector of $\\bar{A} = \\frac{1}{\\kappa} \\cdot A$. 
More generally, we have the following lemma:\n\n\n\\begin{lemma}\nThe top eigenvector of $\\bar{A}$ (respectively, the smallest eigenvector of $L$) is $u_1 = [\\sqrt{d_1} \\cdots \\sqrt{d_n}]^\\top$.\t\n\\end{lemma}\n\\begin{proof}\n\\begin{align}\n\t(\\bar{A} \\cdot u_1)_i &= \\sum_j \\bar{A}_{ij} \\sqrt{d}_j \\\\\n\t&= \\sum_j \\frac{A_{ij}}{\\sqrt{d_i}\\sqrt{d_j}} \\sqrt{d}_j \\\\\n\t&= \\frac{1}{\\sqrt{d}_i} \\sum_j A_{ij} \\\\\n\t&= \\frac{d_i}{\\sqrt{d}_i} = \\sqrt{d}_i.\n\\end{align}\n\\end{proof}\n\n\nTo understand why the eigenvectors of the Laplacian are related to the sparsest cut, we examine the quadratic form the Laplacian. In particular, we have the following lemma:\n\\begin{lemma}\\label{lemma:laplacian_quadratic}\n\tLet $v\\in\\bbR^N$ be a vector, $L$ is the graph Laplacian. Then, \n\t\\begin{align}\n\t\tv^\\top L v = \\frac{1}{2} \\sum_{(i, j) \\in E} \\left( \\frac{v_i}{\\sqrt{d_i}} - \\frac{v_j}{\\sqrt{d_j}} \\right)^2.\n\t\\end{align}\t\n\\end{lemma}\n\\begin{proof}\n\\begin{align}\n\tv^\\top L v &= v^\\top I v - v^\\top \\bar{A} v \\\\\n\t&= \\sum_{i=1}^n v_i^2 - \\sum_{i, j = 1}^n v_i v_j \\bar{A}_{ij} \\\\\n\t&= \\sum_{i=1}^n v_i^2 - \\sum_{i, j = 1}^n v_i v_j \\frac{A_{ij}}{\\sqrt{d_i d_j}} \\\\\n\t&= \\frac{1}{2}\\cdot \\left(2\\sum_{i=1}^n v_i^2 - 2 \\sum_{(i, j) \\in E} \\frac{v_i}{\\sqrt{d_i}} \\cdot \\frac{v_j}{\\sqrt{d_j}}\\right) \\\\\n\t&= \\frac{1}{2} \\sum_{(i, j) \\in E} \\left( \\frac{v_i}{\\sqrt{d_i}} - \\frac{v_j}{\\sqrt{d_j}} \\right)^2 \\;.\n\\end{align}\n\\end{proof}\n\nIf $G$ is $\\kappa$-regular, then this becomes $v^\\top L v = \\frac{1}{2\\kappa} \\sum_{(i, j) \\in E} (v_i - v_j)^2$. Furthermore, suppose $v \\in \\{0, 1\\}$ is a binary vector with support $S = \\operatorname{supp}(v)$. 
Then, \n\\begin{align}\n    \\frac{1}{2\\kappa} \\sum_{(i, j) \\in E} (v_i - v_j)^2 &= \\frac{1}{\\kappa} E(S, \\bar{S}) \\\\\n    &= \\frac{1}{\\kappa} E(\\operatorname{supp}(v), \\overline{\\operatorname{supp}}(v)) \\;.\n\\end{align}\nIf $|\\operatorname{supp}(v)| \\leq n/2$, implying $\\operatorname{Vol}(S) \\leq \\operatorname{Vol}(V)/2$, then\n\\begin{align}\n    \\frac{v^\\top L v}{\\norm{v}^2_2} &= \\frac{\\frac{1}{\\kappa} E(S, \\bar{S})}{\\frac{1}{\\kappa} \\operatorname{Vol}(S)} = \\phi(S) \\;.\n\\end{align}\nThe term $\\frac{v^\\top L v}{\\norm{v}^2_2}$ is also called the {\\it Rayleigh quotient}. This result nicely connects the abstract linear algebraic approach to the sparsest cut approach. Note that we only achieve an approximate sparsest cut because when we compute eigenvectors, we minimize the Rayleigh quotient \\emph{without any constraints on $v$}. By contrast, the sparsest cut minimizes the Rayleigh quotient subject to the constraint that $v \\in \\{0,1\\}^n$. Proving Cheeger's inequality essentially involves controlling the difference caused by real $v$ vs binary $v$. We omit the proof of Cheeger's inequality because it's beyond the scope of these notes. \n\n\\subsec{Applying spectral clustering}\n\n%\\tnote{it's possible to merge this with the beginning of 10.2.2 if you felt that's good; please use your own assessments}\nHow do the ideas from the previous sections connect to our previous discussion of spectral clustering? Suppose that we have some raw data $x^{(1)} \\cdots x^{(n)}$ that we'd like to group into $k$ clusters. \\citet{ng2001spectral} propose to define a weighted graph $G$ such that each element \n\\begin{equation}\n    G_{ij} = \\exp \\left( -\\frac{\\norm{x^{(i)} - x^{(j)}}^2_2}{2\\sigma^2} \\right)\n\\end{equation}\nrepresents a distance between two data points. 
Then, we compute the first $k$ eigenvectors of the Laplacian $L$ and arrange them into the columns of a matrix: \n\\begin{equation}\n    \\begin{bmatrix} \\lvert &  & \\lvert \\\\ u_1 & \\cdots &  u_k \\\\ \\lvert &  & \\lvert \\end{bmatrix} \\in \\bbR^{n \\times k}.\n\\end{equation} \nThe $i$-th row of this matrix (which we denote by $v_i$) is then a $k$-dimensional embedding of the $i$-th example. Note that this is usually a much lower-dimensional representation than the raw data. Finally, we can use $k$-means to cluster the embeddings $\\{v_1,\\dots,v_n\\}$.\n\nIn high dimensions, the issue with \\citet{ng2001spectral}'s approach is that the training data points are all far away from each other. The Euclidean distance between points becomes meaningless, and so our definition of $G$ does not make sense. \n\nHow do we solve this issue? \\citet{haochen2021provable} propose a solution. They consider an infinite weighted graph $G(V, w)$, where $w$ are the edge weights, and $V = \\cX \\subseteq \\bbR^n$ is the set of all possible data points. We define $w(x, x')$ to be large only if $x$ and $x'$ are very close in $\\ell_2$ distance. Now, the graph is more meaningful, because only data points that are small perturbations of each other have high connection weights. However, we do not have explicit access to this graph, and its eigenvectors are infinite-dimensional. \n\nNow, suppose we have some eigenvector $u = (u_x)_{x \\in \\cX} $. Rather than explicitly represent $u_x$, we represent $u_x$ by $f_\\theta(x)$ where $f_\\theta$ is some parameterized model. Now, the challenge is to find $\\theta$ such that $[f_\\theta(x)]_{x \\in \\cX}$ is the second smallest eigenvector of Laplacian of $G$. 
It turns out solving this problem gives a form of contrastive learning, which we will discuss in Section~\\ref{section:analysis_contrastive_learning}.\n\n\n\\sec{Self-supervised Learning}\n\n\\subsec{Pretraining / self-supervised learning / foundation model basics}\nSelf-supervised learning is widely used for pretraining modern models. These large pretrained models are also called foundation models~\\cite{bommasani2021opportunities}. One simplified setup / workflow contains the following two stages:\n\n\\paragraph{Pretraining on unlabeled, massive data.} Let $\\{x^{(1)}, \\cdots, x^{(n)}\\}$ be a dataset where $x^{(i)}\\in\\bbR^d$ is sampled from some pretraining data distribution $x^{(i)}\\sim P_{\\text{pre}}$. The goal is to learn a pretrained model $f_{\\theta}: \\bbR^d\\rightarrow\\bbR^k$, where $k$ is the dimension of features / representations / embeddings, and $\\theta$ is the model parameter. This model can be learned by minimizing a certain pretraining loss function: $\\hat{L}_{\\text{pre}}(\\theta) = \\hat{L}_{\\text{pre}}(x^{(1)}, \\cdots, x^{(n)}; \\theta)$, which sometimes is of the form $\\hat{L}_{\\text{pre}}(\\theta)  = \\frac{1}{n}\\sum_{i=1}^n \\ell_{\\text{pre}}(x^{(i)}; \\theta)$. We use $\\hat{\\theta} = \\argmin_{\\theta}\\hat{L}_{\\text{pre}}(\\theta)$ to denote the parameter learned during pretraining.\n\n\\paragraph{Adaptation.} During adaptation, we have access to a set of labeled downstream task examples $\\{(x^{(1)}_{\\text{task}}, y^{(1)}_{\\text{task}}), \\cdots, (x^{(n_{\\text{task}})}_{\\text{task}}, y^{(n_{\\text{task}})}_{\\text{task}})\\}$, where usually $n_{\\text{task}}\\ll n$. One popular adaptation method is \\emph{linear probe}, which uses $f_{\\hat{\\theta}}(x)$ as features / representations / embeddings, and trains a linear classifier on downstream tasks. 
Concretely, the prediction on data $x$ is $w^\\top f_{\\hat{\\theta}}(x)$, where $w$ is the linear head learned from $\\min_{w} \\hat{L}_{\\text{task}}(w) = \\frac{1}{n_{\\text{task}}} \\sum_{i=1}^{n_{\\text{task}}} \\ell_{\\text{task}}(y^{(i)}_{\\text{task}}, w^\\top f_{\\hat{\\theta}}(x^{(i)}_{\\text{task}}))$. Another popular adaptation method is \\emph{finetuning}, which also updates the parameter $\\theta$. Concretely, one initializes from $\\theta = \\hat{\\theta}$, and solve $\\min_{\\theta, w} \\hat{L}_{\\text{task}}(w, \\theta) = \\frac{1}{n_{\\text{task}}} \\sum_{i=1}^{n_{\\text{task}}} \\ell_{\\text{task}}(y^{(i)}_{\\text{task}}, w^\\top f_{{\\theta}}(x^{(i)}_{\\text{task}}))$.\n\nWhy does pretraining on unlabeled data with an unsupervised (self-supervised) loss help a wide range of downstream prediction tasks? There are many open questions to be answered in this field. For instance,  we may ask: (1) how pretraining helps label efficiency of downstream tasks, (2) why pretraining can give ``universal'' representations, and (3) why does pretraining provide robustness to distribution shift. \n\n\\subsec{Analysis of contrastive learning}\\label{section:analysis_contrastive_learning}\nLet $\\bar{X}$ be the set of all natural images in the image domain, $\\bar{P}_{\\bar{X}}$ be the distribution over $\\bar{X}$. Contrastive learning learns $f_{\\theta}$ such that representations of augmentations of the same image are close, whereas augmentations of random images are far away (as visualized in Figure~\\ref{fig:ssl2}). \n\n\\begin{figure}[ht]\n\t\\centering\n\t\\includegraphics[width=3in]{figures/ssl5.pdf}\n\t\\caption{A demonstration of contrastive learning. 
Representations of augmentations of the same image are pulled close, whereas augmentations of random images are pushed far away.}\n\t\\label{fig:ssl2}\n\\end{figure}\n\n\\newcommand{\\aug}[1]{\\mathcal{A}(\\cdot|#1)}\n\nGiven a natural image $\\bar{x}\\in\\bar{X}$, one can generate augmentations by random cropping, flipping, adding Gaussian noise or color transformation. We use $x\\sim\\aug{\\bar{x}}$ to denote the conditional distribution of augmentations given the natural image. For simplicity, we consider the case where Gaussian blurring is the augmentation, we have\n\\begin{align}\n\tx\\sim\\aug{\\bar{x}} \\Leftrightarrow x=\\bar{x}+\\xi \\quad\\quad\\quad\\quad \\xi\\sim\\mathcal{N}(0, \\sigma^2\\mathcal{I}),\n\\end{align}\nwhere $\\norm{\\xi}_2\\approx\\sigma\\sqrt{d}$ should be $\\ll\\norm{\\bar{x}}$. \n\nWe say $(x, x^+)$ is a \\emph{positive pair} if they are generated as follows: first sample $\\bar{x}\\sim \\bar{P}_{\\bar{X}}$, then sample $x\\sim\\aug{\\bar{x}}$ and $x^+\\sim\\aug{\\bar{x}}$ independently. In other words, $(x, x^+)$ are augmentations of the same natural image.\n\nWe say $(x, x')$ is a \\emph{random pair / negative pair} if they are sampled as: first sample two natural images $\\bar{x}\\sim \\bar{P}_{\\bar{X}}$ and $\\bar{x}'\\sim \\bar{P}_{\\bar{X}}$, then sample augmentations $x\\sim\\aug{\\bar{x}}$ and $x'\\sim\\aug{\\bar{x}'}$.\n\nThe design principle for contrastive learning is to find $\\theta$ such that $f_{\\theta}(x)$, $f_{\\theta}(x^+)$ are close, while $f_{{\\theta}}(x), f_{{\\theta}}(x')$ are far away~\\citep{chen2020simclr, zbontar2021barlow, he2020momentum}. \n\nOne example of contrastive learning is SimCLR~\\citep{chen2020simclr}.  
Given $B$ positive pairs $(x^{(1)}, x^{(1)+}), \\cdots, (x^{(B)}, x^{(B)+})$ (note that $(x^{(i)}, x^{(j)+})$ is a random pair if $i\\ne j$), SimCLR defines the loss on the $i$-th pair as \n\\begin{align}\n\t\\text{loss}_i = -\\log\\frac{\\exp(f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{(i)+}))}{\\exp(f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{(i)+}))+ \\sum_{j\\ne i} \\exp(f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{(j)+}))}.\n\\end{align}\nSince $-\\log\\frac{A}{A+B}$ is decreasing in $A$ but increasing in $B$, the loss above encourages $f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{(i)+})$ to be large, while encouraging $f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{(j)+})$ to be small.\n\nIn the rest of this section, we consider a variant of contrastive loss, proposed in~\\cite{haochen2021provable}: \n\\begin{align}\n\tL(\\theta)  = -2\\Exp_{(x, x^+) \\sim\\text{positive}} f_{{\\theta}}(x)^\\top f_{{\\theta}}(x^+) + \\Exp_{(x, x') \\sim\\text{random}}  \\left(f_{{\\theta}}(x)^\\top f_{{\\theta}}(x')\\right)^2.\n\\end{align}\nThis contrastive loss follows the same design principle as other contrastive losses in the literature: suppose all representations have the same norm, then the first term encourages the representations of a positive pair to be closer while  the second term encourages the representations of a random pair to be orthogonal to each other (hence far away). ~\\cite{haochen2021provable} show that the loss function, though inspired by theoretical derivations, still performs competitively, nearly matching the SOTA methods. \n\nWe can also define the empirical loss on a set of tuples $(x^{(i)}, x^{+(i)}, x'^{(i)})$ sampled i.i.d. as follows: $\\bar{x}\\sim\\bar{P}_{\\bar{X}}, x^{(i)}\\sim\\aug{\\bar{x}^{(i)}}, x^{+(i)}\\sim\\aug{\\bar{x}^{(i)}}$, $\\bar{x}'\\sim\\bar{P}_{\\bar{X}}$, $x'^{(i)}\\sim\\aug{\\bar{x}'^{(i)}}$. 
The empirical loss is defined as \n\\begin{align}\n\t\\hat{L}(\\theta) = \\sum_{i=1}^n \\left[-2f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x^{+(i)}) + \\left(f_{{\\theta}}(x^{(i)})^\\top f_{{\\theta}}(x'^{(i)})\\right)^2\\right].\n\\end{align}\nThen the empirically learned parameter is $\\hat{\\theta} = \\argmin_{\\theta} \\hat{L}(\\theta)$. \n\nSuppose the downstream task is binary classification with label set $\\{1, -1\\}$. We define the downstream loss as \n\\begin{align}\n\t\\hat{L}_{\\text{task}}(w, \\theta) = \\frac{1}{n_{\\text{task}}} \\sum_{i=1}^{n_{\\text{task}}} \\frac{1}{2} \\left(y^{(i)}_{\\text{task}} - w^\\top f_{{\\theta}}(x^{(i)}_{\\text{task}})\\right)^2.\n\\end{align}\n\nWe learn the linear head $\\hat{w} = \\argmin_w \\hat{L}_{\\text{task}}(w, \\hat{\\theta})$, and then evaluate its performance on downstream population data:\n\\begin{align}\n\tL_{\\text{task}}(\\hat{w}, \\hat{\\theta}) = \\Exp\\left[\\frac{1}{2} \\left(y_{\\text{task}} - \\hat{w}^\\top f_{\\hat{\\theta}}(x_{\\text{task}})\\right)^2\\right].\n\\end{align}\n\n\\noindent {\\bf Analysis pipeline. } We give a summary of our analysis pipeline below. The key takeaway is that we only have to focus on the population distribution case (step 3). \n\\begin{enumerate}\\setcounter{enumi}{-1}\n\\item{Assume expressivity, i.e., assuming $\\exists \\theta^*$ such that $L(\\theta^*)$ is sufficiently small (the details will be quantified later).}\n\\item{For large enough $n$ (e.g., $n>\\text{Comp}({\\mathcal{F}})/\\epsilon^2$ where ${\\mathcal{F}}=\\{f_\\theta\\}$ is the function class, $\\text{Comp}(\\cdot)$ is some measure of complexity, $\\epsilon$ is the target error), show that $\\hat{L}(\\theta) = L(\\theta) \\pm \\epsilon$.}\n\\item{Let $\\hat{\\theta}$ be the parameter learned on empirical data. 
Since $\\hat{L}(\\hat{\\theta})=\\min_{\\theta}\\hat{L}(\\theta) \\le \\hat{L}(\\theta^*) \\le L(\\theta^*)+\\epsilon$, we have\n\\begin{align}\n\t\\hat{L}(\\hat{\\theta})\\le\\epsilon \\Rightarrow L(\\hat{\\theta}) \\le 2\\epsilon\n\\end{align}\n}\n\\item{\\textbf{Key step:} (infinite data case) We will prove a theorem (Theorem~\\ref{theorem:scl} below as a simplified version, or Theorem 3.8 of~\\citet{haochen2021provable}) that shows if $L(\\hat{\\theta})\\le2\\epsilon$, then there exists $w$ such that $L_{\\text{task}}(\\theta, w)\\le \\delta$, where $\\delta$ is a function of $\\epsilon$ and data distribution $\\bar{P}$.}\n\\item{When we have enough downstream data $n_{\\text{task}}\\ge\\text{poly}(k, \\frac{1}{\\epsilon'})$, for any $\\theta$, with high probability we have (via uniform convergence) that for any $w$, \n$\t\\hat{L}_{\\text{task}}(w, \\theta) \\approx L_{\\text{task}}(w, \\theta)\\pm \\epsilon'$. }\n\\item{Using the results in step 3 and step 4, we have $\\hat{L}_{\\text{task}}(\\hat{w}, \\hat{\\theta}) = \\min_{w} \\hat{L}_{\\text{task}}(w, \\hat{\\theta}) \\le \\min_w L_{\\text{task}}(w, \\hat{\\theta}) + \\epsilon' \\le \\delta + \\epsilon'$. Thus, the final evaluation loss on the downstream task is $L_{\\text{task}}(\\hat{w}, \\hat{\\theta}) \\le \\hat{L}_{\\text{task}}(\\hat{w}, \\hat{\\theta}) +\\epsilon' \\le \\delta + 2\\epsilon'$. }\n\\end{enumerate}\n\n\\noindent{\\bf Key step: the case with population pretraining and downstream data.} We will now dive into the analysis of step 3, as all the other steps are from standard concentration inequalities. Recall that \n\\begin{align}\n\t\tL(\\theta)  = -2\\Exp_{(x, x^+) } f_{{\\theta}}(x)^\\top f_{{\\theta}}(x^+) + \\Exp_{(x, x') }  \\left(f_{{\\theta}}(x)^\\top f_{{\\theta}}(x')\\right)^2.\n\\end{align}\n\nAs expected, the analysis requires structural assumptions on the data. In particular, we will use the graph-theoretic language to describe the assumptions on population data. 
Let $X$ be the set of all augmented data, $P$ be the distribution of augmented data $x\\sim\\aug{\\bar{x}}$ where $\\bar{x}\\sim\\bar{P}_{\\bar{X}}$. Let $p(x, x^+)$ be the probability density of positive pair $(x, x^+)$. We define a graph $G(V, w)$ where vertex set $V=X$ and edge weights $w(x, z) = p(x, z)$ for any $(x, z) \\in X\\times X$. In general, this graph may be infinitely large. To simplify math and avoid integrals, we assume $|X|=N$ where $N$ is the number of all possible augmented images (which can be infinite or exponential in dimension). \n\nThe degree of node $x$ is $p(x) = \\sum_{z\\in X} p(x, z)$.  Let $A\\in\\bbR^{N\\times N}$ be the adjacency matrix of this graph defined as $A_{x, z} = p(x, z)$, and let $\\bar{A}$ be the normalized adjacency matrix such that $\\bar{A}_{x, z} = \\frac{p(x, z)}{\\sqrt{p(x)p(z)}}$. \n\nThe following lemma shows that contrastive learning is closely related to the eigendecomposition of $\\bar{A}$. \n\\begin{lemma}\\label{lemma:scl_as_decomposition}\n\tLet $L(f) = -2\\Exp_{(x, x^+) } f(x)^\\top f(x^+) + \\Exp_{(x, x') } \\left(f(x)^\\top f(x')\\right)^2$.  Suppose $X=\\{x_1, \\cdots, x_N\\}$, and let the matrix \n\t\\begin{equation}\n\t    F = \\begin{bmatrix}  p(x_1)^{\\frac{1}{2}} f(x_1)^\\top  \\\\ \\vdots \\\\  p(x_N)^{\\frac{1}{2}} f(x_N)^\\top \\end{bmatrix}.\n\t\\end{equation}\n\tThen,\n\t\\begin{align}\n\t\tL(f) = \\norm{\\bar{A}-FF^\\top}_F^2 +\\text{const}.\n\t\\end{align}\n\tHence, minimizing $L(f)$ w.r.t. the variable $f$ is equivalent to eigendecomposition of $\\bar{A}$. 
\n\\end{lemma}\n\\begin{proof}\nDirectly expanding the Frobenius norm $\\norm{\\bar{A}-FF^\\top}_F^2$ as a sum over entries, we have\n\\begin{align}\n\t\\norm{\\bar{A} - FF^\\top}_F^2 &= \\sum_{x, z\\in X} \\left(\\frac{p(x, z)}{\\sqrt{p(x)}\\sqrt{p(z)}} - f(x)^\\top f(z) \\sqrt{p(x)}\\sqrt{p(z)}\\right)^2\\\\\n\t&= \\text{const} -2 \\sum_{x, z\\in X} p(x, z) f(x)^\\top f(z) + \\sum_{x, z\\in X}p(x)p(z)\\left(f(x)^\\top f(z)\\right)^2\\\\\n\t&= \\text{const} -2\\Exp_{(x, x^+) \\sim\\text{positive}} f(x)^\\top f(x^+) + \\Exp_{(x, x') \\sim\\text{random}}  \\left(f(x)^\\top f(x')\\right)^2,\n\\end{align}\nwhere the last equation uses the fact that $p(x, z)$ and $p(x)p(z)$ are the probability densities of $(x,z)$ being a positive pair and a random pair, respectively. \n\\end{proof}\n\nStandard matrix decomposition results tell us that the minimizer of $\\norm{\\bar{A}-FF^\\top}_F^2 $ satisfies $F = U \\cdot \\text{diag}(\\gamma_i^{\\frac{1}{2}})$, where $\\gamma_i$'s are the eigenvalues of $\\bar{A}$ and $U\\in\\bbR^{N\\times k}$ contains the top $k$ eigenvectors of $\\bar{A}$ as its columns. Suppose we use $v_1, \\cdots, v_N$ to represent the rows of $U$, i.e., \n\t\\begin{equation}\n\tU = \\begin{bmatrix}  v_1^\\top  \\\\ \\vdots \\\\  v_N^\\top \\end{bmatrix}.\n\\end{equation}\nThen we know $f(x_j)= p(x_j)^{-\\frac{1}{2}} \\cdot \\text{diag}(\\gamma_i^{\\frac{1}{2}}) \\cdot v_j$ is the minimizer of the contrastive loss. \n\n\\newcommand{\\id}[1]{\\mathbbm{1}\\left[#1\\right]}\n\nOne interesting thing is that $f(x_i)$ has the same separability as $v_i$. This is because for any vector $b\\in\\bbR^k$, we have $\\id{b^\\top v_i>0} = \\id{b^\\top \\text{diag}(\\gamma_i^{-\\frac{1}{2}}) f(x) >0}$, suggesting linear head $\\text{diag}(\\gamma_i^{-\\frac{1}{2}}) b$ applied on feature $f(x_i)$ would achieve the same classification accuracy as $v$ applied on $v_i$. 
Thus, it suffices to analyze $v_i$'s downstream accuracy under linear head.\n\nSince $v_i$ is exactly the feature used by the classic spectral clustering algorithm, we may ask when spectral clustering produces good features. As discussed in Section~\\ref{section:spectral_clustering}, spectral clustering is good at graph partitioning in stochastic block models. In this section, we aim to find more general settings where spectral clustering produces good features. For simplicity, let's consider a regular graph where $w(x) = \\sum_{x'\\in V} w(x, x') = \\kappa$.\\footnote{It turns out that most, if not all, spectral graph theory tools on regular graph can extend to general graph settings. Therefore, it oftentimes suffices to consider a regular graph. } %\\jnote{rewrote this para}\n\nThe following lemma shows that suppose the graph roughly contains two clusters, then the spectral clustering features can be used to accurately predict which cluster a node belongs to.\n%\\tnote{maybe say what the lemma is about in words before stating the lemma}\\jnote{added}\n\\begin{lemma}\\label{lemma:ssl1}\n\tSuppose the graph $G$ can be partitioned into $2$ clusters $S_1$, $S_2$ with size $|S_1| = |S_2| = \\frac{N}{2}$, such that $E(S_1, S_2)=\\sum_{x\\in S_1, z\\in S_2} w(x, z) \\le \\alpha \\kappa N$. Furthermore, suppose $G$ cannot be partitioned well into $3$ clusters in the sense that for all partition $ T_1, T_2, T_3$, we have $\\max\\{\\phi(T_1), \\phi(T_2), \\phi(T_3)\\} \\ge \\rho$. (Figure~\\ref{figure:ssl_thm_assumption} gives a demonstration of these assumptions.)\n\t\\begin{figure}[ht]\n\t\t\\centering\n\t\t\\includegraphics[width=3in]{figures/ssl4.pdf}\n\t\t\\caption{A demonstration of the assumptions in Lemma~\\ref{lemma:ssl1}. The left half and right half of the graph can be chosen as $S_1$ and $S_2$, since there's at most $\\alpha$ proportion of edges between them. 
Sets $T_1, T_2, T_3$ form a 3-way partition where $\\phi(T_1)\\ge \\rho$.\n%\\tnote{a bit more caption?} \n%\t\t\t\\tnote{I think $S_1, S_2$ are mentioned twice with different meanings in the lemma 10.10, so the figure is a bit ambiguous. Maybe replace $S_1, S_2, S_3$ by $T_1,..$ and change the figure.}\n\t\t}\\label{figure:ssl_thm_assumption}\n\t\\end{figure}\n\tThen, letting $g=\\mathbbm{1}(S_1)\\in \\bbR^N$ (i.e., $g_i=1$ if $i\\in S_1$) and $k\\ge 6$, there exists a linear classifier $b$ such that %\\jnote{I realize that there should be a $N$ here on RHS.}\n\t\\begin{align}\n\t\t\\norm{Ub-g}_2^2 \\lesssim \\frac{N\\alpha}{\\rho^2},\n\t\\end{align}\nwhere $U$ contains the top $k$ eigenvectors of $\\bar{A}$ as its columns.\n\\end{lemma}\n\nThe above lemma essentially says that $\\langle v_x, b \\rangle\\approx g_x$ for all data $x\\in X$, where $v_x$ is the $x$-th row of $U$. \n\nBefore proving the above lemma, we first introduce the following higher-order Cheeger inequality, which shows that when the graph cannot be partitioned well into $3$ clusters, the $6$-th smallest eigenvalue of the Laplacian cannot be too small. %\\tnote{say briefly what its about}\\jnote{added}\n\\begin{lemma}[Proposition 1.2  in \\cite{louis2014approximation}]\\label{lemma:higher_order_cheeger}\n\tLet $G=(V, w)$ be a weighted graph.  Suppose the graph cannot be partitioned into $3$ sets $S_1, S_2, S_3$ such that $\\max\\{\\phi(S_1), \\phi(S_2), \\phi(S_3)\\} \\le \\rho$. Then, we have\n\t\\begin{align*}\n\t\t \\lambda_{6} \\gtrsim\\rho^2.\n\t\\end{align*}\n%\t\\tnote{can we have the other side of the inequality stated here as well? }\\jnote{the other side of inequality requires 6-way partition rather than 3-way. Would it be worth of it to introduce that?}\n%\t\\tnote{could you state the contrapositive of this? 
[I suspect that it's easier to understand the contrapositive in this context]}\n\\end{lemma}\n\nNow we give a proof of Lemma~\\ref{lemma:ssl1}.\n\\begin{proof}[Proof of Lemma~\\ref{lemma:ssl1}]\n\tBy Lemma~\\ref{lemma:laplacian_quadratic} we know that \n\\begin{align}\n\t\\frac{2}{N} g^\\top L g &= \\frac{1}{N\\kappa} \\sum_{x, z} (g_x - g_z)^2 w(x, z)\\\\\n\t&= \\frac{1}{N\\kappa} \\left(\\sum_{x\\in S_1, z\\in S_2} w(x, z) + \\sum_{x\\in S_2, z\\in S_1} w(x, z)\\right)\\\\\n\t&= \\frac{2}{N\\kappa} E(S_1, S_2) \\\\\n\t&\\le \\alpha.\n\\end{align}\nThus, $g$ has to be mostly in the smaller eigenspace of $L$.  Suppose $L$ has eigenvalue $0=\\lambda_1\\le\\lambda_2 \\le \\cdots \\le \\lambda_N $, with corresponding eigenvectors $u_1, u_2, \\cdots u_N$. Define matrix $U=[u_1, \\cdots, u_k] \\in \\bbR^{N\\times k}$. Suppose $\\sqrt{\\frac{2}{N}}g = \\sum_{i=1}^N \\beta_i u_i$. Since $\\norm{\\sqrt{\\frac{2}{N}}g}=1$, we know $\\sum_{i=1}^N \\beta_i^2 = 1$.\n\nSince we know $g^\\top L g = \\sum_{i=1}^N \\beta_i^2 \\lambda_i \\le \\frac{N\\alpha}{2}$, we can conclude $\\sum_{i>k} \\beta_i^2 \\lambda_i \\le \\frac{N\\alpha}{2}$, which implies that $\\sum_{i>k}\\beta_i^2 \\le \\frac{N\\alpha}{2\\lambda_{k+1}} \\lesssim \\frac{N\\alpha}{\\rho^2}$. Here we used the fact $\\lambda_6 \\gtrsim \\rho^2$ by higher-order Cheeger inequality (Lemma~\\ref{lemma:higher_order_cheeger}). Thus, we have $\\norm{g-\\sum_{i=1}^k \\beta_i u_i}_2^2 = \\norm{\\sum_{i>k} \\beta_i u_i}_2^2 \\lesssim \\frac{N\\alpha}{\\rho^2}$ which finishes the proof. \n\\end{proof}\n\nWe can combine Lemma~\\ref{lemma:scl_as_decomposition} and Lemma~\\ref{lemma:ssl1} to prove the following theorem, which shows that when the graph roughly contains $2$ clusters, the feature learned from contrastive learning can be used to predict the cluster membership accurately. 
\n%\\jnote{added the theorem below so that we have something to point to in the \"key step\" bullet point}\n\\begin{theorem}\\label{theorem:scl}\n\tLet $L(f) = -2\\Exp_{(x, x^+) } f(x)^\\top f(x^+) + \\Exp_{(x, x') } \\left(f(x)^\\top f(x')\\right)^2$, and $f^*: X\\rightarrow \\bbR^k$ is a minimizer of $L(f)$ for $k\\ge 6$. Suppose the graph $G$ can be partitioned into $2$ clusters $S_1$, $S_2$ with size $|S_1| = |S_2| = \\frac{N}{2}$, such that $E(S_1, S_2)=\\sum_{x\\in S_1, z\\in S_2} w(x, z) \\le \\alpha \\kappa N$. Furthermore, suppose $G$ cannot be partitioned well into $3$ clusters in the sense that for all partition $ T_1, T_2, T_3$, we have $\\max\\{\\phi(T_1), \\phi(T_2), \\phi(T_3)\\} \\ge \\rho$. Let $y(x_i) = \\mathbbm{1}(x_i\\in S_1)$ (i.e., $y(x_i)=1$ if $x_i\\in S_1$, otherwise $y(x_i)=0$). Then, there exists linear classifier $b\\in\\bbR^k$ such that\n\t\\begin{align}\n\t\t\\frac{1}{N}\\sum_{i\\in [N]} \\left(f(x_i)^\\top b - y(x_i)\\right)^2 \\lesssim \\frac{\\alpha}{\\rho^2}.\n\t\\end{align}\n\\end{theorem}\n\\begin{proof}\n\tLet $U\\in\\bbR^{N\\times k}$ contains the top $k$ eigenvectors of $\\bar{A}$ as its columns. By Lemma~\\ref{lemma:ssl1}, we know there exists some $\\hat{b}\\in\\bbR^k$ such that $\\norm{U\\hat{b}-g}_2^2 \\lesssim \\frac{N\\alpha}{\\rho^2}$, where $g\\in\\bbR^N$ such that $g_i = y(x_i)$. Let $v_1, \\cdots, v_N$ be the rows of $U$. According to Lemma~\\ref{lemma:scl_as_decomposition}, we know that $f(x_i)= p(x_i)^{-\\frac{1}{2}} \\cdot \\text{diag}(\\gamma_j^{\\frac{1}{2}}) \\cdot v_i = \\kappa^{-\\frac{1}{2}}\\cdot \\text{diag}(\\gamma_j^{\\frac{1}{2}}) \\cdot v_i$, where $\\gamma_j$ is the $j$-th largest eigenvalue of $\\bar{A}$, and $\\text{diag}(\\gamma_j^{\\frac{1}{2}})$ is a diagonal matrix containing $\\gamma_1^\\frac{1}{2}, \\gamma_2^{\\frac{1}{2}}, \\cdots, \\gamma_k^{\\frac{1}{2}}$ as its entries. 
Thus, if we let $b=\\sqrt{\\kappa}\\cdot  \\text{diag}(\\gamma_j^{-\\frac{1}{2}}) \\cdot \\hat{b}$, we would have \n\t\\begin{align}\n\\sum_{i\\in[N]} (f(x_i)^\\top b - y(x_i))^2 = \\sum_{i\\in[N]} (v_i^\\top \\hat{b} - g_i)^2 = \\norm{U\\hat{b}-g}_2^2 \\lesssim \\frac{N\\alpha}{\\rho^2}.\n\t\\end{align}\n\\end{proof}\n"
  },
  {
    "path": "tex/collection/10-01-online.tex",
    "content": "% reset section counter\n\\setcounter{section}{0}\n\n\\metadata{15}{Tianyu Du, Xin Lu and Soham Sinha}{Mar 8th, 2021}\n\nIn this chapter, we switch gears and talk about \\textit{online learning} and \\textit{online convex optimization}. The main idea driving online learning is that we move away from the assumption that the training and test data are both drawn i.i.d from some fixed distribution. In the online setting, training data and test data come to the user in an interwoven manner, and data can be generated \\textit{adversarially}. We will describe how online learning can be reduced to online convex optimization, some important algorithms, as well as applications of these algorithms to some illustrative examples.\n\n\\sec{Online learning setup}\n\nIn classical supervised learning, we train the model with the assumption that $(x^{(i)}, y^{(i)}) \\overset{i.i.d.}{\\sim} P_{\\text{train}}$, where $P_{\\text{train}}$ is the underlying distribution of the training data. In most cases, we assume the test data, i.e., the data we want our model to predict well, comes from the same distribution (or at least one that is close to $P_{\\text{train}}$). Reality is often more complicated: data could indeed be generated in sequence, or even in an adversarial manner, so it is often the case that $P_\\text{test}$ differs from $P_\\text{train}$. The situation where $P_\\text{test}$ and $P_{\\text{train}}$ are different is known as \\textit{domain shift}. There are some theories that tackle the issue of domain shift and generalization properties of transfer learning. However, the field is still largely being developed. (See \\cite{ben2007analysis}, for example.)\n\nOnline learning is an attempt to deal with domain shift in a way that is agnostic to the relationship between the training and test data distributions (i.e. deal with ``worst-case'' domain shift). 
As an example, many recommendation systems today collect users' historical trace of shopping behavior, which are not i.i.d. samples, and make adaptive recommendations based on users' changing shopping behavior. Hence, one can see that online learning attempts to adapt to the constantly evolving reality in time. Notice that unlike the ``offline model'' (i.e., classical supervised learning), online learning learns while testing, and hence there is no rigid division in time to differentiate training and testing phase.\n\nOnline learning has several distinctive features \\cite{percynotes}:\n\\begin{enumerate}\n\\item The data may be \\textit{adversarial}. We cannot assume that samples are drawn independently from some distribution.    \n\\item The data and predictions are \\textit{sequential}. At each step, the algorithm makes a prediction after being given a single piece of data.\n\\item The feedback is \\textit{limited}. For example, in bandit problems, the algorithm only knows if it's right or wrong, but no other feedback is given. \n\\end{enumerate}\n\nOnline learning can be viewed as a game between two parties: (i) the learner/agent/algorithm/player, and (ii) the environment/nature. For simplicity, we will refer to the two parties as ``learner'' and ``environment'' in the remainder of this chapter.\n\nThe game takes place over $T$ rounds or time steps. At each step $t = 1, \\dots, T$, the learner receives an input $x_t \\in \\cX$ from the environment and makes a prediction $\\yhat \\in \\cY$ in response. The learner then receives the label $y_t$ from the environment and suffers some loss. This procedure is outlined in Algorithm \\ref{lec15:alg:gen-ol} and is illustrated in Figure \\ref{lec15:fig:OLgame}.\n\n    \\begin{algorithm}[h]\n        \\caption{General online learning problem}\n        \\label{lec15:alg:gen-ol}\n        \\For {$t = 1, ... 
T$}{\n            Learner receives $x_t \\in \\mathcal{X}$ from environment, which may be chosen adversarially\\;\n            Learner predicts $\\yhat_t \\in \\mathcal{Y}$\\;\n            Learner receives the label $y_t$ from environment, which may be chosen adversarially\\;\n            Learner suffers some loss $\\ell(y_t, \\yhat_t)$.\n        }\n    \\end{algorithm}\n\n\\begin{figure}[ht]\n    \\centering\n    \\includegraphics[width=2in]{figures/OLupdated.png}\n    \\caption{A representation of the online learning problem.}\n    \\label{lec15:fig:OLgame}\n\\end{figure}\n\nLater, we will see that the manner in which nature generates  $(x_t, y_t)$ leads to different types of online learning. In the most adversarial setting of online learning, it is possible that the ``true label'' $y_t$ is not generated at the same time as $x_t$. The environment could generate the label $y_t$ depending on the prediction $\\hat{y}_t$ made by the learner.  We can also see that Algorithm \\ref{lec15:alg:gen-ol} is a very general framework as there are very few constraints on how $x_t$ and $y_t$ are generated.\n    \n\\subsec{Evaluation of the learner}\nGiven this setup, a natural question to ask is how one can evaluate the performance of the learner. Intuitively, one could simply evaluate the learner's performance by computing the loss between the predicted label and the ``true'' label sent by the environment $\\ell(y_t, \\hat{y}_t)$. For the entire sequence of tasks, one can then evaluate in terms of the cumulative loss:\n    \\begin{align}\n        \\sum_{t=1}^T \\ell(y_t, \\yhat_t).\n    \\end{align}\n    \nHowever, as the environment can be adversarial, the task itself might be inherently hard and even the best possible learner fails to achieve a small loss. Hence, instead of using the cumulative loss for a learner by itself, we compare its performance against a suitable baseline, the ``best model in hindsight''. 
Assume that our learner comes from a set of hypotheses $\\mathcal{H}$. Let us choose the hypothesis $h \\in \\mathcal{H}$ that minimizes the cumulative loss, i.e.\n\\begin{equation}\n    h^\\star = \\argmin_{h \\in \\mathcal{H}} \\sum_{t=1}^T \\ell(y_t, h(x_t)).\n\\end{equation}\n\nNote here that in minimizing the cumulative loss, the learner gets to see all the data points $(x_t, y_t)$ at once. The cumulative loss of $h^\\star$ is the best we can ever hope to do, and so it would be better to compare the cumulative loss of the learner against it. (This approach is analogous to ``excess risk'', which tells how far the current model is away from the best we could hope for.) This measurement is denoted as \\emph{regret}, and is formally defined as:\n    \\begin{align}\n        \\text{Regret} \\overset{\\Delta}{=} \n        \\left[\\sum_{t=1}^T \\ell(y_t, \\yhat_t)\\right]\n        - \\underbrace{\n        \\left[\\min_{h \\in \\mathcal{H}} \\sum_{t=1}^T \\ell(y_t, h(x_t))\\right]\n        }_{\\text{best loss in hindsight}}\n    \\end{align}\n\nUsing this definition, if the best model in hindsight performs well, then the learner has more responsibility to learn to predict well in order to match up the performance of the baseline.\n    \n\\subsec{The realizable case}\nIn general, if the environment is too powerful, leading the learner to a large loss, it will also hinder the best model in hindsight from doing well. On the other hand, there are settings where some members of the hypothesis class can actually do well. 
Such settings/problems are usually referred to as \\textit{realizable}:\n\n\\begin{definition}[Realizable problem]\nAn online learning problem is \\textit{realizable} (for a family of predictors $\\mathcal{H}$) if there exists $h \\in \\mathcal{H}$ such that for any $T$, $\\sum_{t = 1}^T \\ell(y_t, h(x_t)) = 0$.\n\\end{definition}\n\nNote that even though zero error is possible, this is still an interesting problem to consider because the $x_t$'s are not i.i.d. as they are in classical supervised learning. Hence, standard statistical learning theory does not apply, and there is still research to be done here.\n\n\\begin{example}\nConsider a classification problem on $(x_t, y_t)$, and for simplicity assume $y_t \\in \\{0, 1\\}$. Suppose there exists $h^\\star \\in \\mathcal{H}$ such that we always have $y_t = \\yhat^\\star_t = h^\\star(x_t)$. In this case, the problem is realizable. \n    \nIn this case, the learner can adopt a ``majority algorithm''. At each time, the learner maintains a set $V_t \\subset \\mathcal{H}$ so that $\\sum_{t=1}^T \\ell (y_t, h(x_t)) = 0$ for all $h \\in V_t$, and $\\hat{y}_t$ is simply the prediction made by the majority of $h \\in V_t$. Based on the loss received, learners $h \\in V_t$ that fail for time $t + 1$ will be eliminated from future $V_t$'s.\n    \nWith this setup, we can see that for each wrong prediction made by the learner, at least half of the hypotheses $h \\in V_t$ will be eliminated. Hence, $1 \\leq |V_{t+1}| \\leq |\\mathcal{H}|2^{-M}$ where $M$ is the number of mistakes made so far. 
Thus, one has $M \\leq \\log |\\mathcal{H}|$ by taking the log on both sides of the inequality and rearranging.\n    \nNow, if one puts $\\ell$ as the zero-one loss, the regret for this example will be\n\\begin{equation}\n\\text{Regret} = \\sum_{t=1}^T \\ell(y_t, \\yhat_t) = M,\n\\end{equation}\nso in this example, one has $\\text{regret} \\leq \\log |\\mathcal{H}|$, which is a non-trivial bound when $\\mathcal{H}$ is finite.\n\\end{example}\n    \nAs one can see in the example, the realizable case usually indicates that the problem is not too far out of reach. Indeed, for finite hypothesis classes and linear models, the realizable case is considered to be straightforward to solve. This is perhaps why most of the past literature has focused on non-realizable cases. However, the realizable case is still an interesting problem and perhaps a very good starting point when the model class is beyond linear models and when the loss function is no longer convex, because the $x_t$'s are not i.i.d. as they are in classical supervised learning. Hence, standard statistical learning theory does not apply, and there is still research to be done here.\n \nIn the rest of the chapter, we will only focus on the convex loss case, where we reduce online learning to online convex optimization. \n    \n\\sec{Online (convex) optimization (OCO)}\n\n\\textit{Online convex optimization (OCO)} is a particularly useful tool to get results for online learning. Many online learning problems (and many other types of problems!) can be reduced to OCO problems, which allow them to be solved and analyzed algorithmically. Algorithm \\ref{lec15:alg:oco} describes the OCO problem, which is more general than the online learning problem. (Note: \\textit{Online optimization (OO)} refers to Algorithm \\ref{lec15:alg:oco} except that the $f_t$'s need not be convex. 
However, due to the difficulty in non-convex function optimization, most research has focused on OCO.)\n\n    \\begin{algorithm}\n    \\caption{Online (convex) optimization problem}\n    \\label{lec15:alg:oco}\n    \\For{$t = 1, ..., T$} {\n        The learner picks some action $w_t \\in \\Omega$ from the action space $\\Omega$\\;\n        The environment picks a (convex) function $f_t: \\Omega \\to [0, 1]$\\;\n        The learner suffers the loss $f_t(w_t)$ and observes the \\emph{entire} loss function $f_t(\\cdot)$.\n        }\n    \\end{algorithm}\n    \nEssentially the learner is trying to minimize the function $f_t$ at each step. As with online learning, one evaluates the performance of learner in online optimization setting using the regret:\n\\begin{align}\n\\text{Regret} = \\sum_{t=1}^T f_t(w_t) - \n\\underbrace{\\min_{w \\in \\Omega} \\sum_{t=1}^T f_t(w)}_\\text{best action in hindsight}.\n\\end{align}\n\nAt some level, OCO seems like an impossible task, since we are trying to minimize a function $f_t$ that we only get to see \\textit{after} we have made our prediction! This is certainly the case for $t = 1$. However, as time goes on, we see more and more functions and, if future functions are somewhat related to past functions, we have more information to make better predictions. (And if the future functions are completely unrelated or contradictory to past functions, then the best action in hindsight would also be bad and therefore our algorithm does not have to do much.)\n\n\\subsec{Settings and variants of OCO}\nThere are multiple settings of the OCO network, which can vary the power of the environment and observations.\n\n\\begin{itemize}\n    \\item \\underline{Stochastic setting:} $f_1,...,f_T$ are i.i.d samples from some distribution $P$. This corresponds to $(x_t, y_t)$ being i.i.d. in online learning. 
Under this setting, the environment is not adversarial.\n    \\item \\underline{Oblivious setting:} $f_1,...,f_T$ are chosen arbitrarily but before the game starts. This corresponds to $(x_t, y_t)$ being chosen before the game starts. In this setting, the environment can be adversarial but cannot be adaptive. The environment can choose these functions based on the learner's algorithm, but not the actual action if the learner's algorithm contains randomness. (This is the setting that we focus on in this course.)\n    \\item \\underline{Non-oblivious/adaptive setting:} For all $t$, $f_t$ can depend on the learner's actions $w_1,...,w_t$. Under this setting, the environment can be adversarial and adaptive. This is the most challenging setting because the environment is powerful enough to know not only the strategy of the learner, but also the exact choice the learner finally made. (Note however that if the learner is deterministic, the environment does not have more power here than in the oblivious setting. The oblivious adversary can simulate the game before the game starts, and choose the most adversarial input accordingly.)\n\\end{itemize}\n \n\\sec{Reducing online learning to online optimization}\nThere is a natural way to reduce the online learning problem to online optimization, with respect to a specific type of model $h_{w}$ parametrized by $w \\in \\Omega$. Recall that in the online learning problem, the learner predicts $y_t$ upon receiving $x_t$. 
If the learner possesses oracle to solve online optimization problem, the learner can consult the oracle to obtain $w_t$, the parameter of the model as in online optimization problem, and then predict $\\hat{y}_t = h_{w_t}(x_t)$.\n\nIn the next two subsections, we give two examples of how an online learning problem can be reduced to an OCO problem.\n    \n\\subsec{Example: Online learning regression problem}\n\nConsider the regression model $h_w(x) = w^\\top x$ parameterized by $w$ in parameter space $\\Omega$ with squared error loss $\\ell$. Here is the online learning formulation of the regression problem:\n\n\\begin{algorithm}\n\\caption{Online learning regression problem}\n\\For{$t = 1, ..., T$} {\nThe learner receives $x_t \\in \\R^d$ from the environment\\;\nThe learner predicts $\\yhat_t$\\;\nThe environment selects $y_t$ and sends it to the learner\\;\nThe learner suffers loss $\\ell(y_t, \\yhat_t) = (y_t-\\yhat_t)^2$.\n}\n\\end{algorithm}\n\nThis can be reduced to the OCO problem in the following way:\n\n\\begin{algorithm}\n\\caption{OCO formulation of regression problem}\n\\For{$t = 1, ..., T$} {\nThe learner receives $x_t \\in \\R^d$ from the environment\\;\nThe learner gives $x_t$ to the OCO solver and obtains $w_t \\in \\R^d$\\;\nThe learner predicts $\\hat{y}_t = h_{w_t}(x_t) = w_t^\\top x_t$\\;\nThe environment selects $y_t$ and sends it to the learner\\;\nThe learner suffers loss $(y_t - h_{w_t}(x_t))^2$\\;\nWith $(x_t, y_t)$ observed, the learner can reconstruct the loss function $f_t(w) = (y_t -h_{w}(x_t))^2$ and give it to the OCO solver.\n}\n\\end{algorithm}\n\nIn this example, we have the following correspondence:\n\\begin{itemize}\n\\item $f_t$ in online optimization $\\leftrightarrow$ squared error loss functions for $(x_t, y_t)$.\n\\item $w_t$ in online optimization $\\leftrightarrow$ parameters of the linear model $h_{w_t}$.\n\\end{itemize}\n    \nSince $h_w(\\cdot)$ is linear, the corresponding squared error loss function $f_t$ are convex, 
and so we have effectively reduced the online linear regression problem to an online \\emph{convex} optimization problem.\n    \nNotice that in the previous example, the loss function $f_t$ actually depends on the label $y_t$, which demonstrates that the key challenge in online optimization is that the function $f_t$ is unknown to the learner when the prediction $\\hat{y}_t$ is made.\n    \n\\subsec{Example: The expert problem}\nSuppose we wish to predict tomorrow's weather and 10 different TV channels provide different forecasts. Which one should we follow? Formally, consider a finite hypothesis class $\\mathcal{H}$, where each $h \\in \\mathcal{H}$ represents an expert, and we wish to choose a $h_t$ wisely at each time step. For simplicity, we assume the prediction is binary, i.e. $\\hat{y} \\in \\{0, 1\\}$, and suppose the loss function is 0-1 loss. (The problem can easily be generalized to more general predictions and losses.) The problem is outlined in Algorithm \\ref{lec15:alg:expert_discrete}.\n\n\\begin{algorithm}[h]\n\\caption{The expert problem}\n\\label{lec15:alg:expert_discrete}\n\\For{$t = 1, ..., T$}{\nThe learner obtains predictions from $N$ experts\\;\nThe learner chooses to follow prediction of one of the experts $i_t \\in [N]$\\;\nThe environment gives the learner the true value. 
The learner is thus able to learn the loss of each of the experts: $\\ell_t \\in \\{0, 1\\}^N$\\;\nThe learner suffers the loss of the expert which was chosen: $\\ell_t(i_t)$.\n}\n\\end{algorithm}\n\nWe want to design a method that chooses $i_t$ for each step (line 3 in Algorithm \\ref{lec15:alg:expert_discrete}) to minimize the regret:\n\\begin{equation}\n\\text{Regret} \\overset{\\Delta}{=} \\mathbb{E}\\left[\n\\sum_{t=1}^T \\ell_t(i_t)\n- \\underbrace{\\min_{i \\in [N]} \\sum_{t=1}^T \\ell_t(i)}_\\text{the best expert in hindsight}\n\\right],\n\\end{equation}\nwhere the expected value is over $i_t$, thus covering the case where the $i_t$'s could be random.\n    \nTo make the expert problem amenable to reduction to OCO, we introduce idea of a \\textit{continuous action space}. Instead of choosing $i_t$ from $\\Omega = [N]$, the learner chooses a distribution $p_t$ from the $N$-dimensional simplex $\\Delta(N) = \\left\\{p \\in \\R^N : \\norm{p}_1 = 1, p \\geq 0 \\right\\}$. The learner then samples $i_t \\sim p_t$. With this formulation, instead of selecting particular expert $i_t$ to follow, the learner adjusts the belief $p_t$, and samples from the distribution to choose which expert to follow. Algorithm \\ref{lec15:alg:expert_randomized} outlines this procedure. 
Note that the loss is the expected loss $\\mathbb{E}_{i \\sim p_t}[\\ell_t(i)]$ instead of the sampled $\\ell_t(i_t)$.\n\n\\begin{algorithm}\n\\caption{The expert problem with continuous action}\n\\label{lec15:alg:expert_randomized}\n\\For{$t = 1, ..., T$}{\nThe learner obtains predictions from $N$ experts\\;\nThe learner chooses a distribution $p_t \\in \\Delta(N)$\\;\nThe learner samples one expert $i_t \\sim p_t$\\;\nThe environment gives the learner the true value and the loss/error of all experts: $\\ell_t \\in \\{0, 1\\}^N$\\;\nThe learner suffers expected loss $\\sum_{i\\in[N]} p_t(i) \\ell_t(i) = \\langle p_t, \\ell_t \\rangle$\\;\n}\n\\end{algorithm}\n    \nWith the continuous action space, it is easy to reduce the expert problem to an OCO: see Algorithm \\ref{lec15:alg:expert_discrete_oco}. (The problem is convex since the loss function is convex and the parameter space $\\Delta(N)$ is convex.)\n\n\\begin{algorithm}[h]\n\\caption{The expert problem}\n\\label{lec15:alg:expert_discrete_oco}\n\\For{$t = 1, ..., T$}{\nThe learner obtains predictions from $N$ experts\\;\nThe learner invokes the OCO oracle to obtain $p_t \\in \\Delta(N)$\\;\nThe learner chooses to follow prediction of one of the experts $i_t \\in [N]$\\;\nThe environment gives the learner the true value. 
The learner is thus able to learn the loss of each of the experts: $\\ell_t \\in \\{0, 1\\}^N$\\;\nThe learner suffers the loss of the expert which was chosen: $\\ell_t(i_t)$.\nThe learner can reconstruct the loss function $f_t (p) = \\langle p, \\ell_t \\rangle$ and give it to the OCO oracle.\n}\n\\end{algorithm}\n\nIn this setting, one can rewrite the regret as:\n\\begin{align}\n\\text{Regret} &= \\sum_{t=1}^T \\langle p_t, \\ell_t \\rangle - \\min_{i\\in[N]}\\sum_{t=1}^T \\ell_t(i)  \\\\\n&= \\sum_{t=1}^T \\langle p_t, \\ell_t \\rangle - \\min_{p \\in \\Delta(N)}\\sum_{t=1}^T \\langle p, \\ell_t \\rangle \\label{lec15:eqn:changearg} \\\\\n&= \\sum_{t=1}^T f_t(p_t) - \\min_{p \\in \\Delta(N)}\\sum_{t=1}^T f_t(p). \\label{lec15:eqn:regret}\n\\end{align}\n\nWe obtain \\eqref{lec15:eqn:changearg} because\n\\begin{align}\n\\sum_{t=1}^T \\langle p, \\ell_t \\rangle &=  \\left\\langle p,  \\sum_{t=1}^T\\ell_t \\right\\rangle \\geq \\min_{i \\in [N]} \\left[ \\sum_{t=1}^T \\ell_t (i) \\right],\n\\end{align}\nwith equality for the probability distribution $p(i) =1$ when $i = \\text{argmin}_i \\left[ \\sum_{t=1}^T \\ell_t (i) \\right]$ and $p(i) = 0$ otherwise, and \\eqref{lec15:eqn:regret} is by definition of $f_t$.\n\n\n\\sec{Reducing online learning to batch learning}    \nIn this section, we present a reduction from online learning to standard supervised learning problem, also known as the ``batch problem'' in this literature.\n\nAs in the standard supervised learning setting, consider an i.i.d dataset $\\{(x_t, y_t)\\}_{t=1}^T$ and some parameter $w$. Let $L(w)$ and $\\hatL(w)$ be the population loss and empirical loss respectively. For simplicity, assume $|\\ell((x_i, y_i), w)| \\leq 1$. The theorem below establishes a link between the regret obtained in online learning and the excess risk obtained in the batch setting.\n    \n\\begin{theorem}[Relationship between excess risk and regret]\nAssume $\\ell((x, y), w)$ is convex. 
Suppose we run an online learning algorithm on the dataset $\\{(x_i, y_i)\\}_{i=1}^T$ and obtain a sequence of models $w_1, \\dots, w_T$, and regret $R_T$. Let $\\overline{w} = \\frac{1}{T} \\sum_{i=1}^T w_i$, then the excess risk of $\\overline{w}$ can be bounded above:\n\\begin{align}\nL(\\overline{w}) - L(w^\\star) \\leq \\frac{R_T}{T} + \\tilO\\left(\\frac{1}{\\sqrt{T}}\\right), \\label{lec15:eqn:lec15_ol_gen_bound}\n\\end{align}\nwhere $w^\\star = \\argmin_{w \\in \\Omega} L(w)$.\n\\end{theorem}\n\nHere are some intuitive interpretations of the theorem:\n\n    \\begin{itemize}\n        \\item If $R_T = O(T)$, then we have some non-trivial result. Otherwise, the bound in \\eqref{lec15:eqn:lec15_ol_gen_bound} is increasing $T$ and does not provide any useful information.\n        \\item If the batch problem has a $1 / \\sqrt{T}$ generalization bound, then the best you can hope for in online learning is $R_T = O(\\sqrt{T})$.\n        \\item If the batch problem has a $1 / T$ generalization bound, you can hope for $O(1)$ regret (or $\\tilO(1)$ regret in some cases).\n        \\item We often have $O(\\sqrt{T})$ excess risk supervised learning problems; hence it is reasonable to expect $O(\\sqrt{T})$ regret in online learning problems.\n    \\end{itemize}\n    \n\\sec{Follow-the-Leader (FTL) algorithm} \\label{lec15:sec:FTL}\nIn this section, we analyze an algorithm called ``Follow-the-Leader'' (FTL) for OCO, which is intuitive but fails to perform well in many cases.\n\nThe FTL algorithm behaves as its name suggests: it always selects the action $w_t$ such that it minimizes the historical loss the learner has seen so far, i.e.\n\\begin{equation}\nw_t = \\argmin_{w \\in \\Omega} \\sum_{i=1}^{t-1} f_i(w).\n\\end{equation}\n\nWe now demonstrate how the FTL algorithm can fail for the expert problem. 
In the expert problem, $f_t(p) = \\langle p, \\ell_t \\rangle$, so \n    \\begin{align}\n        p_t &= \\argmin_{p \\in \\Delta(N)} \\sum_{i=1}^{t-1} f_i(p) \\\\\n        &= \\argmin_{p \\in \\Delta(N)} \\sum_{i=1}^{t-1} \\langle\\ell_i, p\\rangle \\\\\n        &= \\argmin_{p \\in \\Delta(N)} \\left\\langle\\sum_{i=1}^{t-1}\\ell_i, p\\right\\rangle.\n    \\end{align}\n\nThe minimizer $p \\in \\Delta(N)$ is a point-mass probability, with the point mass at the smallest coordinate of $\\sum_{i=1}^{t-1} \\ell_i$. This gives regret\n\\begin{equation}\n\\text{Regret} = \\sum_{i=1}^{t-1} \\ell_i(i_t),\n\\quad \\text{ where } i_t = \\argmin_{j \\in [N]} \\sum_{i=1}^{t-1}\\ell_i(j).\n\\end{equation}\n    \nNow, consider the following example: suppose we have only two experts. Suppose expert 1 makes perfect predictions on even days while expert 2 makes perfect predictions on odd days. Assume also that the FTL algorithm chooses expert 1 to break ties (this is not an important point but makes the exposition simpler.) In this setting, the FTL algorithm always selects the \\textit{wrong} expert to follow. A few rounds of simulation of this example are shown in Table \\ref{lec15:tab:counter example}.\n\n    \\begin{table}[h]\n        \\caption{An example where FTL fails}\n        \\label{lec15:tab:counter example}\n        \\medbreak\n        \\centering\n        \\small\n        \\begin{tabular}{l|c c c c c c}\n        \\toprule\n        Day & 1 & 2 & 3 & 4 & $\\dots$ & $\\dots$ \\\\\n        \\midrule \n        Expert 1's loss & 1 & 0 & 1 & 0 & $\\dots$ & $\\dots$ \\\\\n        Expert 2's loss & 0 & 1 & 0 & 1 & $\\dots$ & $\\dots$ \\\\\n        \\midrule \n        \\midrule \n        FTL choice $i_t$ & 1 & 2 & 1 & 2 & 1 & $\\dots$ \\\\\n        \\bottomrule\n        \\end{tabular}\n    \\end{table}\n\nThe best expert in hindsight has a loss of $T/2$ (choosing either expert all the time incurs this loss), and so the regret of the FTL algorithm is $T - T/2 = T/2 = \\Theta(T)$. 
The main reason for FTL's failure is that it is a deterministic algorithm driven by an extreme update, with no consideration of potential domain shift (it always selects the best expert based on the past with no consideration of the potential next $f_t$). Knowing its deterministic strategy, the environment can easily play in an adversarial manner. To perform better in a problem like this, we need some randomness to hedge risk.\""
  },
  {
    "path": "tex/collection/10-02-online.tex",
    "content": "% reset section counter\n%\\setcounter{section}{0}\n\n%\\metadata{lecture ID}{Your names}{date}\n\\metadata{16}{Kevin Guo}{Mar 10th, 2021}\n\n% ===============================================\n\\sec{Be-the-leader (BTL) algorithm}\n\nA better strategy is called \\textit{``Be the Leader'' (BTL)}.  At time $t$, the BTL strategy chooses the action that would have performed best on $f_1, \\cdots, f_{t-1}$ \\textit{and} $f_t$.  In other words, the BTL action at time $t$ is $w_{t+1}$, as defined for the FTL algorithm. Note that this is an ``illegal'' choice for the action because $w_{t+1}$ depends on $f_t$: in online convex optimization, the action at time $t$ is required to be chosen \\textit{before} seeing the function $f_t$.  Nevertheless, we can still gain some useful insights by analyzing this procedure. In particular, the following lemma shows that the BTL strategy is worth emulating because it achieves very good regret.\n\n\\begin{lemma}\\label{lec16:lem:btl_regret}\nThe BTL strategy has non-positive regret. That, is, if $w_t$ is defined as in the FTL algorithm, then\n\\begin{align}\n\\text{BTL regret} = \\sum_{t = 1}^T f_t(w_{t + 1}) - \\min_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w) \\leq 0, \\label{lec16:eqn:btl_regret}\n\\end{align}\nfor any $T$ and any sequence of functions $f_1, \\cdots, f_T$.\n\\end{lemma}\n\n\\begin{proof}\nWe prove the lemma by induction on $T$. \\eqref{lec16:eqn:btl_regret} holds trivially for $T = 1$. Suppose that \\eqref{lec16:eqn:btl_regret} holds for all $t \\leq T - 1$ and any $f_1, \\cdots, f_{T-1}$.  Now we wish to extend \\eqref{lec16:eqn:btl_regret} to time $t = T$.  Let $f_T$ be any function.  
Since $w_{T+1} = \\argmin_w \\sum_{t = 1}^T f_t(w)$, we can write:\n\\begin{align}\n\\sum_{t = 1}^{T} f_t(w_{t+1}) - \\min_{w \\in \\Omega} \\sum_{t = 1}^{T} f_t(w) &= \\sum_{t = 1}^T f_t(w_{t+1}) - \\sum_{t = 1}^T f_t(w_{T+1})\\\\\n&= \\sum_{t = 1}^{T - 1} f_t(w_{t+1}) - \\sum_{t = 1}^{T - 1} f_t(w_{T+1}) &\\text{(final summands cancel)}\\\\\n&\\leq \\sum_{t = 1}^{T - 1} f_t(w_{t+1}) - \\min_{w \\in \\Omega} \\sum_{t = 1}^{T - 1} f_t(w)\\\\\n&\\leq 0. &\\text{(induction hypothesis)}\n\\end{align}\n\\end{proof}\n\nA useful consequence of this lemma is a regret bound for the FTL strategy.\n\n\\begin{lemma}\n\\label{lec16:lem:ftl_regret}\n\\textup{(FTL regret bound)} Again, let $w_t$ be as in the FTL algorithm. The FTL strategy has the regret guarantee\n\\begin{align}\n\\text{FTL regret} = \\sum_{t = 1}^T f_t(w_t) - \\min_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w) \\leq \\sum_{t = 1}^T [f_t(w_t) - f_t(w_{t+1})].\n\\end{align}\n\\end{lemma}\n\n\\begin{proof}\n\\begin{align}\n\\text{FTL regret} &= \\sum_{t = 1}^T f_t(w_t) - \\min_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w) \\\\\n&= \\sum_{t = 1}^T f_t(w_{t+1}) - \\min_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w) + \\sum_{t = 1}^T [f_t(w_t) - f_t(w_{t+1})] \\\\\n&\\leq 0 + \\sum_{t = 1}^T [f_t(w_t) - f_t(w_{t+1})],\n\\end{align}\nwhere the last inequality is due to \\eqref{lec16:eqn:btl_regret}.\n\n\\end{proof}\n\nLemma \\ref{lec16:lem:ftl_regret} tells us that if terms $f_t(w_t) - f_t(w_{t+1})$ are small (e.g. $w_t$ does not change much from round to round), then the FTL strategy can have small regret. It suggests that the player should adopt a \\textit{stable} policy, i.e. one where the terms $f_t(w_t) - f_t(w_{t+1})$ are small.  
It turns out that following this intuition will lead to a strategy that improves the regret all the way to $O(\\sqrt{T})$ in certain cases.\n\n% ===============================================\n\\sec{Follow-the-regularized-leader (FTRL) strategy}\n\nNow, we discuss an OCO strategy that aims to improve the stability of FTL by controlling the differences $f_t(w_t) - f_t(w_{t+1})$. To describe the method, we will first need a preliminary definition.\n\n\\begin{definition}\nWe say that a differentiable function $\\phi : \\Omega \\mapsto \\R$ is \\textit{$\\alpha$-strongly-convex} with respect to the norm $|| \\cdot ||$ on $\\Omega$ if we have \n\\begin{equation}\\label{lec16:eqn:strongly-convex}\n\\phi(x) \\geq \\phi(y) + \\langle \\nabla \\phi(y), x - y \\rangle + \\frac{\\alpha}{2} \\norm{x - y}^2\n\\end{equation}\nfor any $x, y \\in \\Omega$.\n\\end{definition}\n\n\\begin{remark}\nIf $\\phi$ is convex, then we know that $\\phi(x)$ has a linear lower bound $\\phi(y) + \\langle \\nabla \\phi(y), x - y \\rangle$. Being $\\alpha$-strongly-convex means that $\\phi(x)$ has a quadratic lower bound, the RHS of \\eqref{lec16:eqn:strongly-convex}. This quadratic lower bound is very useful in proving theorems in optimization.\n\\end{remark}\n\n\\begin{remark}\nIf $\\nabla^2 f(y) \\succeq \\alpha I$ for all $y$, then $f$ is $\\alpha$-strongly-convex. This follows directly from writing the second-order Taylor expansion of $f$ around $y$.\n\\end{remark}\n\nGiven a $1$-strongly-convex function $\\phi(\\cdot)$, which we call a \\textit{regularizer}, we can implement the \\textit{``Follow the Regularized Leader'' (FTRL)} strategy.  
At time $t$, this strategy chooses the action\n\\begin{align}\nw_t = \\argmin_{w \\in \\Omega} \\left[ \\sum_{i = 1}^{t -1} f_i(w) + \\frac{1}{\\eta} \\phi(w) \\right], \\label{lec16:eqn:ftrl}\n\\end{align}\nwhere $\\eta > 0$ is a tuning parameter that we will tune later.\n\n\\subsec{Regularization and stability}\n\nTo understand why we might use the FTRL policy, we first establish that it achieves the intended goal of controlling the differences $f_t(w_t) - f_t(w_{t+1})$. Actually, we will show a more general result that adding a regularizer induces stability for any convex objective.\n\n\\begin{lemma}\n\\label{lec16:lem:regularizers_stability}\n\\textup{(Regularizers induce stability)} Let $F$ and $f$ be functions taking $\\Omega$ into $\\R$, and assume that $F$ is $\\alpha$-strongly-convex with respect to the norm $\\norm{\\cdot}$ and that $f$ is convex.  Let $w = \\argmin_{z \\in \\Omega} F(z)$ and $w' = \\argmin_{z \\in \\Omega} [f(z) + F(z)]$.  Then\n\\begin{equation}\\label{lec16:eqn:regularizers_stability}\n0 \\leq f(w) - f(w') \\leq \\frac{1}{\\alpha} \\norm{\\nabla f(w)}_*^2,\n\\end{equation}\nwhere $\\norm{\\cdot}_*$ is the dual norm of $\\norm{\\cdot}$.\n\\end{lemma}\n\n\\begin{proof}\nBy strong convexity,\n\\begin{align}\nF(w') - F(w) &\\geq \\langle \\nabla F(w), w' - w \\rangle + \\frac{\\alpha}{2} \\norm{w - w'}^2 \\\\\n&\\geq \\frac{\\alpha}{2} \\norm{w - w'}^2,\n\\end{align}\nwhere in the second step we used the fact that the KKT optimality conditions for $w$ imply $\\langle \\nabla F(w), w' - w \\rangle \\geq 0$. (Informally, if $\\Omega = \\R^d$, then $\\nabla F(w) = 0$ as $w$ minimizes $F$. If $\\Omega$ is a convex subset of $\\R^d$, then the gradient $\\nabla F(w)$ must be perpendicular to the tangent to $\\Omega$ at $w$; otherwise, we could move in the direction of the negative gradient and project back to the set $\\Omega$ to lower the value of $F$.) 
Since $F + f$ is also $\\alpha$-strongly convex, exactly the same argument implies:\n\\begin{align}\n[F(w) + f(w)] - [F(w') + f(w')] \\geq \\frac{\\alpha}{2} \\norm{w - w'}^2.\n\\end{align}\nAdding these two inequalities gives\n\\begin{align}\nf(w) - f(w') \\geq \\alpha \\norm{w - w'}^2. \\label{lec16:eqn:lower_bound}\n\\end{align}\nSince this lower bound is clearly positive, this shows $0 \\leq f(w) - f(w')$.\n\nNext, we prove the upper bound on $f(w) - f(w')$. Rearranging the inequality \\eqref{lec16:eqn:lower_bound}, we obtain\n\\begin{align}\n\\norm{w - w'} \\leq \\sqrt{\\frac{1}{\\alpha} [f(w) - f(w')]}. \\label{lec16:eqn:upper_bound}\n\\end{align}\nSince $f$ is convex, we have $f(w') \\geq f(w) + \\langle \\nabla f(w), w' - w \\rangle$.  Rearranging this gives\n\\begin{align*}\nf(w) - f(w') &\\leq \\langle \\nabla f(w), w - w' \\rangle\\\\\n&\\leq \\norm{\\nabla f(w)}_* \\cdot \\norm{w - w'} &\\text{(by Cauchy-Schwarz)} \\\\\n&\\leq \\norm{\\nabla f(w)}_* \\sqrt{ \\frac{1}{\\alpha} [f(w) - f(w')]}. &\\text{(by \\eqref{lec16:eqn:upper_bound})}\n\\end{align*}\nSince $f(w) - f(w') \\geq 0$, we can square both sides of this inequality to conclude that\n\\begin{equation}\n[f(w) - f(w')]^2 \\leq || \\nabla f(w) ||_*^2 \\frac{1}{\\alpha} [f(w) - f(w')].\n\\end{equation}\nDividing both sides of this expression by $f(w) - f(w')$ gives the desired upper bound.\n\\end{proof}\n\n\\begin{remark}\nConsider the special case where $\\nabla f(w) = 0$. In this situation, $w$ is the minimizer of both $F$ and $f$, and hence is the minimizer of $F + f$. 
This implies that $w = w'$, and the inequalities in \\eqref{lec16:eqn:regularizers_stability} become equalities.\n\\end{remark}\n\n\\subsec{Regret of FTRL}\nWe are now ready to prove a regret bound for the FTRL procedure, based on the idea that strongly convex regularizers induce stability.\n\n\\begin{theorem}\\label{lec16:thm:ftrl_regret}\n\\textup{(Regret of FTRL)} Let $\\phi$ be a 1-strongly-convex regularizer with respect to the norm $\\norm{\\cdot}$ on $\\Omega$.  Then the FTRL algorithm (\\ref{lec16:eqn:ftrl}) satisfies the regret guarantee\n\\begin{align}\n\\text{FTRL regret} = \\sum_{t = 1}^T f_t(w_t) - \\min_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w)  \\leq \\frac{D}{\\eta} + \\eta \\sum_{t = 1}^T \\norm{\\nabla f_t(w_t)}_*^2,\n\\end{align}\nwhere $D = \\max_{w \\in \\Omega} \\phi(w) - \\min_{w \\in \\Omega} \\phi(w)$.\n\\end{theorem}\n\n\\begin{remark}\nSuppose that for all $t$ and $w$, we have the uniform bound $|| \\nabla f_t(w) ||_* \\leq G$.  Then Theorem \\ref{lec16:thm:ftrl_regret} implies that the regret is upper bounded by $D / \\eta + \\eta G^2 T$.  Optimizing this upper bound over $\\eta$ by taking $\\eta = \\sqrt{\\dfrac{D}{TG^2}}$ gives the guarantee\n\\begin{equation}\\label{lec17:eqn:ftrl-regret-ub}\n\\text{FTRL regret} \\leq 2 G \\sqrt{D T}.\n\\end{equation}\nIn other words, optimally-tuned FTRL can achieve $O(\\sqrt{T})$ regret in many cases.\n\\end{remark}\n\n\\begin{proof}\nFor convenience, define $f_0(w) = \\phi(w) / \\eta$.  Then the FTRL policy can be written as\n\\begin{equation}\nw_t = \\argmin_{w \\in \\Omega} \\sum_{i = 0}^{t - 1} f_i(w),\n\\end{equation}\ni.e. FTRL is just FTL with an additional ``round'' of play at time zero. 
Thus, by Lemma \\ref{lec16:lem:ftl_regret} with time starting from $t = 0$, we have\n\\begin{align}\n\\sum_{t = 0}^T f_t(w_t) - \\argmin_{w \\in \\Omega} \\sum_{t = 0}^T f_t(w) &\\leq \\sum_{t = 0}^T [f_t(w_t) - f_t(w_{t+1})].\n\\end{align}\nFor any $t \\geq 1$, applying Lemma \\ref{lec16:lem:regularizers_stability} with $F(w) = \\sum_{i = 0}^{t-1} f_i(w)$ (which is $1/\\eta$-strongly-convex) and $f(w) = f_t(w)$ gives the bound $f_t(w_t) - f_t(w_{t+1}) \\leq \\eta || \\nabla f_t(w_t) ||_*^2$.  Plugging this into the preceding display gives the upper bound:\n\\begin{align}\n\\sum_{t = 0}^T f_t(w_t) - \\argmin_{w \\in \\Omega} \\sum_{t = 0}^T f_t(w) &\\leq f_0(w_0) - f_0(w_1) + \\eta \\sum_{t = 1}^T \\norm{\\nabla f_t(w_t)}_*^2. \\label{lec16:eqn:ftrl_ub}\n\\end{align}\n\nNext, we need to relate the LHS of the above display (which starts at time $t = 0$) to the actual regret of FTRL (which starts at time $t = 1$). To do this, define $w^* = \\argmin_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w)$. Then,\n\\begin{align}\n\\sum_{t = 0}^T f_t(w_t) - \\argmin_{w \\in \\Omega} \\sum_{t = 0}^T f_t(w) &\\geq \\sum_{t = 0}^T f_t(w_t) - \\sum_{t = 0}^T f_t(w^*)\\\\\n&= f_0(w_0) - f_0(w^*) + \\underbrace{\\left( \\sum_{t = 1}^T f_t(w_t) - \\argmin_{w \\in \\Omega} \\sum_{t = 1}^T f_t(w)  \\right)}_{\\text{Regret of FTRL}}.\n\\end{align}\nCombining this inequality with (\\ref{lec16:eqn:ftrl_ub}) gives\n\\begin{align}\n\\text{Regret of FTRL} &\\leq f_0(w_0) - f_0(w_1) + f_0(w^*) - f_0(w_0) + \\eta \\sum_{t = 1}^T \\norm{\\nabla f_t(w_t)}_*^2\\\\\n&= \\frac{\\phi(w^*) - \\phi(w_1)}{\\eta} + \\eta \\sum_{t = 1}^T \\norm{\\nabla f_t(w_t)}_*^2\\\\\n&\\leq \\frac{D}{\\eta} + \\eta \\sum_{t = 1}^T \\norm{\\nabla f_t(w_t)}_*^2.\n\\end{align}\nThis concludes the proof of the theorem.\n\\end{proof}\n\n\\subsec{Applying FTRL to online linear regression}\n\nWe apply the FTRL algorithm to a concrete machine learning problem. 
Let $\\Omega = \\{ \\omega \\, : \\, \\norm{w}_2 \\leq 1 \\}$, and let $f_t(\\omega) = \\tfrac{1}{2}(y_t - \\omega^{\\top} x_t)^2$ for some observation pair $(x_t, y_t)$ satisfying $\\norm{x_t}_2 \\leq 1$ and $|y_t| \\leq 1$.  This corresponds to a problem where we are trying to make accurate predictions using a linear model, but we do not assume any structure on the observation sequence $(x_t, y_t)$ beyond boundedness.\n\nConsider using FTRL in this problem with a ridge regularizer, $\\phi(\\omega) = \\tfrac{1}{2} \\norm{w}_2^2$.  One can check that $\\phi$ is 1-strongly-convex with respect to the $\\ell_2$-norm, and also that $D = \\max_{\\omega \\in \\Omega} \\phi(\\omega) - \\min_{\\omega \\in \\Omega} \\phi(\\omega) = \\tfrac{1}{2}$.  Moreover, for all $t$ and $w$ we have \n\\begin{align}\n\\nabla f_t(w) &= - (y_t - w^\\top x_t) x_t, \\\\\n\\norm{\\nabla f_t(w)}_2 &\\leq |y_t - w^\\top x_t| \\cdot \\norm{x_t}_2 \\\\\n&\\leq 2 \\cdot 1 = 2.\n\\end{align}\nTherefore, by choosing $\\eta = \\sqrt{1/(8T)}$ and applying the FTRL regret theorem (Theorem \\ref{lec16:thm:ftrl_regret}), we can obtain the regret guarantee\n\\begin{align}\n\\sum_{t = 1}^T (y_t - w_t^{\\top} x_t)^2 - \\min_{|| w ||_2 \\leq 1} \\sum_{t = 1}^T  (y_t - w^{\\top} x_t)^2 \\leq 4 \\sqrt{T}.\n\\end{align}\n\n\\subsec{Applying FTRL to the expert problem}\n\nFor the expert problem, recall that the action space is $\\Delta (N)$ and $f_t = \\langle \\ell_t , p \\rangle$, where $\\ell_t \\in [0,1]^N$. As a first attempt at applying FTRL to this problem, we set $\\phi (p) = \\frac{1}{2}\\norm{p}_2^2$. 
With this choice,\n\\begin{align}\nD &= \\max_{p \\in \\Delta(N)} \\phi (p) - \\min_{p \\in \\Delta(N)} \\phi (p) \\\\\n&\\leq \\max_{p \\in \\Delta(N)} \\frac{1}{2}\\norm{p}_2^2 \\\\\n&\\leq \\max_{p \\in \\Delta(N)} \\frac{1}{2}\\norm{p}_1^2 \\\\\n&= \\frac{1}{2}.\n\\end{align}\n\nAlso,\n\\begin{align}\n\\norm{\\nabla f_t}_2 &= \\norm{\\ell_t}_2 \\leq \\sqrt{N}.\n\\end{align}\n\nThus, the regret bound is $O(G\\sqrt{DT}) = O(\\sqrt{NT})$. This is the optimal dependency on $T$, but not a good dependency on $N$.\n\nNext, we show that if we change our regularization, we can get a better regret guarantee which is logarithmic in $N$, i.e., the regret is $O(\\sqrt{(\\log N) \\cdot T})$. The new regularizer we choose is the \\textit{(negative) entropy regularizer}:\n\\begin{equation}\n\\phi(p) = -H(p) = \\sum_{j=1}^N p(j)\\log p(j),\n\\end{equation}\nwhere $p \\in \\Delta(N)$ is a distribution over $[N]$. We first introduce the following nice property of this regularizer:\n\\begin{lemma}\n\t$\\phi(p)$ defined above is 1-strongly convex with respect to the $\\ell_1$ norm $\\|\\cdot\\|_1$. 
\n\\end{lemma}\n\n\\begin{proof}\nBy definition of strong convexity, we need to show that for all $p, q \\in \\Delta(N)$,\n\\begin{equation}\\label{lec17:eqn:entropy-sc}\n\\phi(p) - \\phi(q) - \\langle \\nabla \\phi(q), p-q\\rangle \\geq \\frac{1}{2} \\|p-q\\|_1^2.\n\\end{equation}\n\t\nFrom direct computation, we know the gradient of $\\phi(q)$ is \n\\begin{equation}\n\\nabla\\phi(q) = \\begin{bmatrix} 1+\\log q(1)\\\\\\cdots \\\\ 1+\\log q(N) \\end{bmatrix}.\n\\end{equation}\n\t\nPlugging this into the LHS of \\eqref{lec17:eqn:entropy-sc}, we get\n\\begin{align}\n&\\phi(p) - \\phi(q) - \\langle \\nabla \\phi(q), p-q\\rangle  \\\\\n=& \\sum_{j=1}^N p(j)\\log p(j) - \\sum_{j=1}^N q(j)\\log q(j) - \\sum_{j=1}^N \\left(1 + \\log q(j)\\right)\\left(p(j) - q(j)\\right) \\\\\n=& \\sum_{j=1}^N p(j)\\log p(j) - \\sum_{j=1}^N p(j)\\log q(j) - \\sum_{j=1}^N \\left(p(j) - q(j)\\right)\\\\\n=& \\sum_{j=1}^N p(j) \\log \\frac{p(j)}{q(j)} \\label{lec17:eqn:entropy-sc-proof} \\\\\n=& KL(p||q),\n\\end{align}\nwhere $KL(p || q)$ is the KL-divergence between $p$ and $q$. (We used the fact that $\\sum_{j=1}^N p(j) = \\sum_{j=1}^N q(j) = 1$ to get \\eqref{lec17:eqn:entropy-sc-proof}.) Finally, we finish the proof by applying Pinsker's inequality: $KL(p||q) \\geq \\frac{1}{2} \\norm{p-q}_1^2$. \n\t\n\\end{proof}\n\nHence, $\\phi$ satisfies the condition on the regularizer for our FTRL regret guarantee. To obtain the regret bound \\eqref{lec17:eqn:ftrl-regret-ub}, we also need to bound $D = \\sup \\phi(p) - \\inf \\phi(p)$ and $G = \\sup \\|\\nabla f_t(w)\\|_\\infty$ (since $\\|\\cdot\\|_\\infty$ is the dual norm of $\\|\\cdot \\|_1$ ). 
Since negative entropy is always non-positive and (positive) entropy is always bounded above by $\\log N$, we bound $D$ with\n\\begin{equation} \nD = \\sup \\phi(p) - \\inf \\phi(p) \\leq -\\inf \\phi (p) = -\\inf (-H(p)) = \\sup (H(p)) \\leq \\log N,\n\\end{equation}\nand we bound $G$ with\n\\begin{equation}\nG = \\|\\nabla f_t(w)\\|_\\infty = \\|l_t\\|_\\infty \\leq 1.\n\\end{equation}\n\nPlugging these two into the regret bound \\eqref{lec17:eqn:ftrl-regret-ub} we get the bound $O(\\sqrt{(\\log N) \\cdot T})$. \n\nThus far, we have looked at FTRL and the expert problem abstractly: at each time $t$ we choose action $p_t$ based on the update\n\\begin{equation}\np_t = \\argmin_{p \\in \\Delta(N)} \\sum_{i=1}^{t-1} f_i(p) - \\frac{1}{\\eta} H(p).\n\\end{equation}\n\n\\textbf{Can we get an exact analytical solution for $p_t$?} Since we are minimizing a convex function, we can call some off-the-shelf convex optimization algorithm to solve this at each step. Another way is to write down the KKT conditions and solve that set of equations.  Instead, we will show that there exist much simpler ways to solve this update. In particular, we will be using the \\textit{Gibbs variational principle}, which is essentially the KKT conditions under the hood.\n\n\\begin{lemma}[Gibbs variational principle] \\label{lec17:lem:gibbs}\nLet $\\nu, \\mu$ be probability distributions on $[N]$. Then \n\\al{\\sup_\\nu \\left(\\Exp_\\nu[f] - KL(\\nu||\\mu)\\right) = \\log \\Exp_\\mu \\left[e^f\\right],} where $\\Exp_\\nu[f] = \\Exp_{x \\sim \\nu} [f(x)] = \\langle \\nu, f\\rangle$ and $\\Exp_\\mu \\left[e^f\\right] = \\Exp_{x \\sim \\mu}  \\left[e^{f(x)}\\right]$. Moreover, the optimal solution is attained at \\al{\\nu(x) \\propto \\mu(x) \\cdot e^{f(x)}.}\n\\end{lemma}\n\nIntuitively, Lemma~\\ref{lec17:lem:gibbs} says that taking the supremum over distributions $\\nu$ of a linear function minus the KL divergence as the regularizer will give us the same distribution as exponentiating $f$. 
\n\nIf we take $\\mu$ to be the uniform distribution on $[N]$ and replace $f$ with $-f$ in Lemma~\\ref{lec17:lem:gibbs}, we get the following corollary:\n\n\\begin{corollary}\\label{lec17:cor:gibbs}\n\tLet $\\nu$ be a probability distribution. Then, \n\t$\\Exp_\\nu[f] - H(\\nu)$ is minimized at $\\nu(x) \\propto e^{-f(x)}$.\n\\end{corollary} \n\n\\begin{proof}\nWhen $\\mu$ is the uniform distribution, we have\n\\begin{align}\nKL(\\nu||\\mu) &= \\sum_x \\nu(x) \\log \\frac{\\nu(x)}{\\mu(x)} \\\\\n&= \\log N - \\sum_x \\nu(x) \\log \\frac{1}{\\nu(x)} \\\\\n&= \\log N - H(\\nu).\n\\end{align}\n\nSo $\\sup_\\nu \\left(\\Exp_\\nu[-f] - KL(\\nu||\\mu)\\right) = -\\inf_\\nu \\left(\\Exp_\\nu[f] - H(\\nu) + \\log N\\right)$. This means that the value of $\\nu$ that attains the infimum of $\\Exp_\\nu[f] - H(\\nu)$ is the same $\\nu$ attaining the supremum of $\\Exp_\\nu[-f] - KL(\\nu||\\mu)$, which by Lemma~\\ref{lec17:lem:gibbs} is proportional to $e^{-f(x)}$.\n\\end{proof}\n\nWe now apply the Gibbs variational principle to the expert problem. Notice that our FTRL update for the expert problem at time $t$ can be written as\n\\begin{equation}\n\\argmin_{p_t \\in \\Delta(N)} \\l\\langle \\sum_{i=1}^{t-1}l_i, p_t \\r\\rangle - \\frac{1}{\\eta}H(p_t) = \\argmin_{p_t \\in \\Delta(N)} \\l\\langle\\eta \\sum_{i=1}^{t-1}l_i, p_t \\r\\rangle - H(p_t),\n\\end{equation}\nwhere $l_i$ is the vector of expert losses at time $i$. Letting $f = \\eta \\sum_{i=1}^{t-1} l_i$, we know from Corollary~\\ref{lec17:cor:gibbs} that the minimizer is attained at $p_t \\propto \\exp \\l(-\\eta \\sum_{i=1}^{t-1}l_i \\r)$, or equivalently,  \n\\begin{equation}\np_t(j) = \\frac{\\exp(-\\eta L_t(j))}{\\sum_{k=1}^N \\exp(-\\eta L_t(k))},\n\\end{equation}\nwhere $L_t = \\sum_{i=1}^{t-1}l_i$ is the cumulative loss vector. Basically, solving the expert problem is to look at the historical loss of each expert and take softmax to find the probability distribution of how much to trust each expert. 
\n\nThis algorithm is also called the ``Multiplicative Weights Update'', which has been studied before the online learning framework became popular~\\cite{arora2005fast, freund1997decision, littlestone1994weighted}. One way of doing multiplicative weights update is the following: Let $\\tilde{p}_t$ be the unnormalized distribution that we keep track of. At each time step $t$, for each expert $j$, we look at $l_{t-1}(j)$. If $l_{t-1}(j)=1$, i.e. the expert made a mistake at the previous time step, we update $\\tilde{p}_t(j) = \\tilde{p}_{t-1}(j) \\cdot \\exp(-\\eta)$; otherwise we make no change. We then get a distribution by normalizing $\\tilde{p}_t$:\n\\begin{equation}\np_t = \\frac{\\tilde{p}_t}{\\|\\tilde{p}_t\\|_1}.\n\\end{equation}\n\n\\sec{Convex to linear reduction}\n\nIn the previous section we considered the expert problem, where the loss function is a \\textit{linear} function of the parameters. At first glance we may think this is a very restrictive constraint for online convex optimization. However, as we will see in this section, we can always assume $f_t$ to be linear in online convex optimization without loss of generality. That means that for online learning, the linear case is the hardest one. \n\nMore concretely, assume we have an algorithm $\\cA$ that solves the linear case. Given any online convex optimization problem, we will build an algorithm $\\cA'$ which invokes algorithm $\\cA$ in the following fashion: for $t = 1, \\dots, T$,\n\\begin{enumerate}\n\t\\item The learner invokes $\\cA$ to get output action $w_t \\in \\Omega$. \n\t\\item The environment gives the learner the loss function $f_t(\\cdot)$. \n\t\\item The learner constructs a linear function $g_t(w) = \\langle\\nabla f_t(w_t), w \\rangle$, which is the local linear approximation of $f_t$ at $w_t$. 
(Technically the local linear approximation of $f_t$ at $w_t$ is $\\langle \\nabla f_t(w_t), w - w_t\\rangle$, but we drop the $w_t$ shift for convenience.)\n\t\\item The learner feeds $g_t(\\cdot)$ to algorithm $\\cA$ as the loss function. \n\\end{enumerate}\n\nWe have the following informal claim\\footnote{For a rigorous proof, we need additional assumptions and restrictions on $f_t, g_t$.}:\n\\begin{proposition}[Informal]\n\tIf a deterministic algorithm $\\cA$ has regret no more than $\\gamma (T)$ for linear cases for some function $\\gamma (\\cdot)$, then $\\cA'$ stated above has regret no more than $\\gamma(T)$ for convex functions. \n\\end{proposition}\n\n\\begin{proof}\n\tFor all $w \\in \\Omega$, the regret guarantee on $\\cA$ tells us that\n\t\\begin{align}\n\t\t\\sum_{t=1}^T g_t(w_t) - \\sum_{t=1}^T g_t(w) \\leq \\gamma(T).\n\t\\end{align}\n\tSince $f_t$ is convex, we also know that\n\t\\begin{align}\n\t\tg_t(w_t) - g_t(w) = \\langle \\nabla f_t(w_t), w_t- w \\rangle \\geq f_t(w_t) - f_t(w).\n\t\\end{align}\n\tTherefore, for all $w \\in \\Omega$,\n\t\\begin{align}\n\t\t\\sum_{t=1}^T f_t(w_t) - \\sum_{t=1}^Tf_t(w) &\\leq \\sum_{t=1}^T g_t(w_t) - \\sum_{t=1}^T g_t(w) \\\\\n\t\t&\\leq \\gamma(T).\n\t\\end{align}\n\t\n\tHence, the regret for $\\cA'$ is upper bounded by $\\gamma(T)$ as well.\n\\end{proof}\n\n\\subsec{Online gradient descent}\nIn this section we combine the FTRL framework with $\\ell_2$-regularization and the online-to-linear reduction. The resulting algorithm is \\textit{online gradient descent}.\n\nConcretely, given any convex online optimization problem, we first do the online-to-linear reduction, then we use FTRL with $\\ell_2$ regularization ($\\phi (w) = \\frac{1}{2}\\norm{w}_2^2$) to solve the resulting linear case. 
This gives us the following update:\n\\begin{align}\nw_t &= \\argmin_{w \\in \\Omega} \\sum_{i=1}^{t-1} g_i(w) + \\frac{1}{2\\eta} \\|w\\|_2^2 \\\\\n&= \\argmin_{w \\in \\Omega} \\sum_{i=1}^{t-1} \\langle \\nabla f_i(w_i), w \\rangle + \\frac{1}{2\\eta} \\|w\\|_2^2 \\\\\n&= \\Pi_\\Omega \\left( -\\eta \\cdot \\sum_{i=1}^{t-1} \\nabla f_i(w_i) \\right),\n\\end{align}\nwhere $\\Pi_\\Omega (\\cdot)$ is the projection operator onto the set $\\Omega$. The last equality is because for any vector $a$, we have \n\\begin{align}\n\\argmin_{w \\in \\Omega} \\langle a, w\\rangle + \\frac{1}{2\\eta} \\|w\\|_2^2 & = \\argmin_{w \\in \\Omega} \\frac{1}{2\\eta} \\|w + \\eta a\\|_2^2 - \\frac{\\eta}{2} \\|a\\|_2^2 \\\\\n& = \\argmin_{w \\in \\Omega} \\|w + \\eta a\\|_2^2 \\\\\n& = \\argmin_{w \\in \\Omega} \\|w - (-\\eta a)\\|_2^2 \\\\\n& = \\Pi_\\Omega(-\\eta a).\n\\end{align}\n\nIntuitively, we can think of this algorithm as gradient descent with ``lazy'' projection:\n\\begin{align}\nu_t &= u_{t-1} - \\eta \\nabla f_{t-1}(w_{t-1}), \\\\\nw_t &= \\Pi_\\Omega(u_t).\n\\end{align}\n\nSimilarly, we can define gradient descent with ``eager'' projection (which can get similar regret bounds):\n\\begin{align}\nu_t &= w_{t-1} - \\eta \\nabla f_{t-1}(w_{t-1}), \\\\\nw_t &= \\Pi_\\Omega(u_t).\n\\end{align}\n\nThis concludes our discussion of online learning in this course."
  },
  {
    "path": "tex/figures/ntk-1d.py",
    "content": "import matplotlib.pyplot as plt\nimport numpy as np\n\n# Data for plotting\nt = np.arange(0.0, 1.3, 0.01)\n\nfig, ax = plt.subplots()\n\nbeta = 2\n\ndef fun(alpha, i):\n\tlf = (1.0 - alpha*(t - beta*t*t))**2\n\tlg = (1.0 - alpha*t)**2\n\tif alpha == 1:\n\t\tax.plot(t, lf, linestyle = 'solid', label='$\\hat{L}(\\\\bar{f}_{\\\\theta})$', color = 'C%d'%i)\n\t\tax.plot(t, lg, linestyle = 'dashed', label='$\\hat{L}(\\\\bar{g}_{\\\\theta})$', color = 'C%d'%i)\n\telse: \n\t\tax.plot(t, lf, linestyle = 'solid', label='$\\hat{L}(\\\\alpha \\\\bar{f}_{\\\\theta})$, alpha=%.f'%alpha, color = 'C%d'%i)\n\t\tax.plot(t, lg, linestyle = 'dashed', label='$\\hat{L}(\\\\alpha \\\\bar{g}_{\\\\theta})$, alpha=%.f'%alpha, color = 'C%d'%i)\n\nfun(1, 1)\n\nfun(2, 2)\n\nfun(5, 3)\n\nfun(10, 4)\n\n\nax.set_ylim([0, 1.5])\nax.set(xlabel='theta', ylabel='loss',\n       title='NTK regime via reparameterization')\nax.legend()\n\nfig.savefig(\"ntk-1d.png\")\nplt.show()\n"
  },
  {
    "path": "tex/macros.tex",
    "content": "\\usepackage{color}\n\\usepackage{lipsum}\n\\usepackage{enumitem}\n\n% for potential improvements\n\\def\\shownotes{0}  %set 1 to show author notes\n\\ifnum\\shownotes=1\n\\newcommand{\\authnoteimp}[2]{[#1: #2]}\n\\else\n\\newcommand{\\authnoteimp}[2]{}\n\\fi\n\\newcommand{\\tnoteimp}[1]{{\\color{blue}\\authnoteimp{todo}{#1}}}\n\\newcommand{\\tnote}[1]{{\\color{blue}\\authnoteimp{TM}{#1}}}\n\\newcommand{\\todo}[1]{\\tnoteimp{#1}}\n\n\n% for course staff to edit or comment\n\\def\\shownotes{0}  %set 1 to show author notes\n\\ifnum\\shownotes=1\n\\newcommand{\\authnote}[2]{[#1: #2]}\n\\else\n\\newcommand{\\authnote}[2]{}\n\\fi\n\\newcommand{\\ttodo}[1]{{\\color{blue}\\authnote{todo for TM}{#1}}}\n\n\n% for long term commnets \n\\def\\shownotes{0}  %set 1 to show author notes\n\\ifnum\\shownotes=1\n\\newcommand{\\authnotelong}[2]{[#1: #2]}\n\\else\n\\newcommand{\\authnotelong}[2]{}\n\\fi\n\\newcommand{\\tnotelong}[1]{{\\color{blue}\\authnotelong{TM}{#1}}}\n\n\n\n\n\\ifnum\\lectureformat=1\n\\newcommand{\\metadata}[3]\n{\n\t\\newpage\n\t\n\t\\def\\lectureID{#1}\n\t\n\t\\setcounter{chapter}{\\lectureID}\n\n%\t\\draftnotice\n\t\n\t\\begin{center}\n\t\t\\bf\\large CS229M/STATS214: Machine Learning Theory\n\t\\end{center}\n\t\n\t\\noindent\n\tLecturer: Tengyu Ma   %%% FILL IN LECTURER (if not RS)\n\t\\hfill\n\tLecture \\# \\lectureID              %%% FILL IN LECTURE NUMBER HERE\n\t\\\\\n\tScribe: #2                  %%% FILL IN YOUR NAME HERE\n\t\\hfill\n\t#3           %%% FILL IN LECTURE DATE HERE\n\t\n\t\\noindent\n\t\\rule{\\textwidth}{1pt}\n\t\n\t\\medskip\n}\n\\else \n\\newcommand{\\metadata}[3]{}\n\\fi\n\n\\newcommand*\\circled[1]{\\tikz[baseline=(char.base)]{\n\t\\node[shape=circle,draw,inner sep=2pt] (char) 
{#1};}}\n\n\\DeclareMathOperator*{\\Exp}{\\mathbb{E}}\n\\DeclareMathOperator*{\\argmin}{\\textup{argmin}}\n\\DeclareMathOperator*{\\argmax}{\\textup{argmax}}\n\n\\newcommand{\\Cov}{\\operatorname{Cov}}\n\\newcommand{\\KL}{\\operatorname{KL}}\n\\newcommand{\\margin}{\\text{margin}}\n\\newcommand{\\poly}{\\operatorname{poly}}\n\\newcommand{\\sd}{\\operatorname{sd}}\n\\newcommand{\\sgn}{\\text{sgn}}\n\\newcommand{\\tr}{\\operatorname{tr}}\n\\newcommand{\\Var}{\\operatorname{Var}}\n\\newcommand{\\ind}[1]{\\mathbf{1}[#1]}\n\n\\newcommand{\\err}{\\ell_{\\textup{0-1}}}\n\\newcommand{\\Err}{L_{\\textup{0-1}}}\n\\newcommand{\\thetaerm}{\\theta_{\\textup{ERM}}}\n\\newcommand{\\hatL}{\\widehat{L}}\n\\newcommand{\\tilO}{\\widetilde{O}}\n\\newcommand{\\iid}{\\overset{\\textup{iid}}{\\sim}}\n\\newcommand\\defeq{\\stackrel{\\mathclap{\\text{\\tiny\\mbox{$\\Delta$}}}}{=}}\n\n\\newcommand{\\gammamin}{\\gamma_{\\mathrm{min}}}\n\\newcommand{\\phirelu}{\\phi_{\\textup{relu}}}\n\\newcommand{\\supunitball}{\\sup_{\\overline{u}:\\norm{\\overline{u}}_2 \\le 1}}\n\\newcommand{\\ubar}{\\overline{u}}\n\\newcommand{\\thetazero}{\\theta^{0}}\n\\newcommand{\\popL}{L(\\beta)}\n\\newcommand{\\empL}{\\hatL(\\beta)}\n\\newcommand{\\popLt}{L(\\beta^t)}\n\\newcommand{\\empLt}{\\hatL(\\beta^t)}\n\\newcommand{\\yhat}[0]{\\hat{y}}\n\n\\newcommand{\\norm}[1]{\\|#1\\|}\n\\newcommand{\\Norm}[1]{\\left\\|#1\\right\\|}\n\\renewcommand{\\l}{\\left}\n\\renewcommand{\\r}{\\right}\n\\newcommand{\\rbr}[1]{\\left(#1\\right)}\n\\newcommand{\\sbr}[1]{\\left[#1\\right]}\n\\newcommand{\\cbr}[1]{\\left\\{#1\\right\\}}\n\\newcommand{\\abs}[1]{\\left\\lvert#1\\right\\rvert}\n\\newcommand{\\inprod}[1]{\\left\\langle#1\\right\\rangle}\n\\renewcommand{\\because}{{\\textup{because}~}}\n\n\\newcommand{\\al}[1]{\n\t\\begin{align}\n\t#1\n\t\\end{align}\n}\n\n\\renewcommand{\\sp}[1]{^{(#1)}}\n\n\\newcommand{\\cA}{\\mathcal A}\t\n\\newcommand{\\cB}{\\mathcal B}\n\\newcommand{\\cC}{\\mathcal C}\n\\newcommand{\\cD}{\\mathcal 
D}\n\\newcommand{\\cE}{\\mathcal E}\n\\newcommand{\\cF}{\\mathcal F}\n\\newcommand{\\cG}{\\mathcal G}\n\\newcommand{\\cH}{\\mathcal H}\n\\newcommand{\\cI}{\\mathcal I}\n\\newcommand{\\cJ}{\\mathcal J}\n\\newcommand{\\cK}{\\mathcal K}\n\\newcommand{\\cL}{\\mathcal L}\n\\newcommand{\\cM}{\\mathcal M}\n\\newcommand{\\cN}{\\mathcal N}\n\\newcommand{\\cO}{\\mathcal O}\n\\newcommand{\\cP}{\\mathcal P}\n\\newcommand{\\cQ}{\\mathcal Q}\n\\newcommand{\\cR}{\\mathcal R}\n\\newcommand{\\cS}{\\mathcal S}\n\\newcommand{\\cT}{\\mathcal T}\n\\newcommand{\\cU}{\\mathcal U}\n\\newcommand{\\cV}{\\mathcal V}\n\\newcommand{\\cW}{\\mathcal W}\n\\newcommand{\\cX}{\\mathcal X}\n\\newcommand{\\cY}{\\mathcal Y}\n\\newcommand{\\cZ}{\\mathcal Z}\n\n\\newcommand{\\bbB}{\\mathbb B}\n\\newcommand{\\bbS}{\\mathbb S}\n\\newcommand{\\bbR}{\\mathbb R}\n\\newcommand{\\bbZ}{\\mathbb Z}\n\\newcommand{\\bbI}{\\mathbb I}\n\\newcommand{\\bbQ}{\\mathbb Q}\n\\newcommand{\\bbP}{\\mathbb P}\n\\newcommand{\\bbE}{\\mathbb E}\n\\newcommand{\\bbN}{\\mathbb N}\n\n\\newcommand{\\N}{\\mathbb{N}}\n\\newcommand{\\R}{\\bbR}\n\\newcommand{\\Z}{\\mathbb{Z}}\n\n\n"
  },
  {
    "path": "tex/master.tex",
    "content": "%% filename: amsbook-template.tex\n%% version: 1.1\n%% date: 2014/07/24\n%%\n%% American Mathematical Society\n%% Technical Support\n%% Publications Technical Group\n%% 201 Charles Street\n%% Providence, RI 02904\n%% USA\n%% tel: (401) 455-4080\n%%      (800) 321-4267 (USA and Canada only)\n%% fax: (401) 331-3842\n%% email: tech-support@ams.org\n%% \n%% Copyright 2006, 2008-2010, 2014 American Mathematical Society.\n%% \n%% This work may be distributed and/or modified under the\n%% conditions of the LaTeX Project Public License, either version 1.3c\n%% of this license or (at your option) any later version.\n%% The latest version of this license is in\n%%   http://www.latex-project.org/lppl.txt\n%% and version 1.3c or later is part of all distributions of LaTeX\n%% version 2005/12/01 or later.\n%% \n%% This work has the LPPL maintenance status `maintained'.\n%% \n%% The Current Maintainer of this work is the American Mathematical\n%% Society.\n%%\n%% ====================================================================\n\n%    AMS-LaTeX v.2 driver file template for use with amsbook\n%\n%    Remove any commented or uncommented macros you do not use.\n\n\\documentclass[oneside, openany]{book}\n\\usepackage{amsfonts,bm,amsthm,amsmath,bbm,amssymb,mathtools}\n\\usepackage{fullpage}\n\\usepackage{tikz, pgfplots} % added for Lecture 2\t\n\\usepackage{caption, subcaption}\n\\usepackage{float}  % added for Lecture 8\n\\usepackage[ruled,vlined,linesnumbered]{algorithm2e}  % added for Lecture 15\n\\usepackage{esdiff}\n\\usepackage{booktabs}  % added for Lecture 15\n\\usepackage{hyperref}\n\\hypersetup{linktocpage}\n\\usepackage{natbib}\n\\renewcommand{\\cite}[1]{\\citep{#1}}\n\n\n\n\\newtheorem{theorem}{Theorem}[chapter]\n\\newtheorem{lemma}[theorem]{Lemma}\n\n\\theoremstyle{definition}\n\\newtheorem{definition}[theorem]{Definition}\n\\newtheorem{example}[theorem]{Example}\n\\newtheorem{xca}[theorem]{Exercise}\n\\newtheorem{corollary}[theorem]{Corollary}  % 
added for Lecture 5\n\\newtheorem{proposition}{Proposition}[section]  % added for Lecture 6\n\n\\theoremstyle{remark}\n\\newtheorem{remark}[theorem]{Remark}\n\n\\numberwithin{section}{chapter}\n\\numberwithin{equation}{chapter}\n\n%    For a single index; for multiple indexes, see the manual\n%    \"Instructions for preparation of papers and monographs:\n%    AMS-LaTeX\" (instr-l.pdf in the AMS-LaTeX distribution).\n\\makeindex\n\\def\\lectureformat{0}\n\\input{macros}\n\\pgfplotsset{compat=1.17}\n\\begin{document}\n\t\n\t\\frontmatter\n\t\n\t\\title{Lecture Notes for Machine Learning Theory (CS229M/STATS214)}\n\t\n\t%    Remove any unused author tags.\n\t\n\t%    author one information\n\t\\author{Instructor: Tengyu Ma\n\t}\n\t%\\address{}\n\t%\\curraddr{}\n\t%\\email{}\n\t%\\thanks{}\n\t\n\t%    author two information\n\t%\\author{}\n\t%\\address{}\n\t%\\curraddr{}\n\t%\\email{}\n\t%\\thanks{}\n\t\n\t%\\subjclass[2010]{Primary }\n\t\n\t%\\keywords{}\n\t\n\t%\\date{}\n\t\n\t%\\begin{abstract}\n\t%\\end{abstract}\n\t\n\t\n\t\\maketitle\n\t\n\t%    Dedication.  If the dedication is longer than a line or two,\n\t%    remove the centering instructions and the line break.\n\t%\\cleardoublepage\n\t%\\thispagestyle{empty}\n\t%\\vspace*{13.5pc}\n\t%\\begin{center}\n\t%  Dedication text (use \\\\[2pt] for line break if necessary)\n\t%\\end{center}\n\t%\\cleardoublepage\n\t\n\t%    Change page number to 6 if a dedication is present.\n\t%\\setcounter{page}{1}\n\t\n\t\\tableofcontents\n\t\n\t%    Include unnumbered chapters (preface, acknowledgments, etc.) here.\n\t%\\include{}\n\t\\mainmatter\n\t\\let\\sec\\section\n\t\\let\\subsec\\subsection\n\t\\let\\subsubsec\\subsubsection\n\t\n\t\\chapter*{Acknowledgments}\n\t\\setcounter{page}{5}\n\tThis monograph is a collection of scribe notes for the course CS229M/STATS214 at Stanford University. 
The materials in Chapters \\ref{chap:supervised}--\\ref{chap:gen-bounds} are mostly based on Percy Liang's lecture notes~\\cite{percynotes}, and Chapter~\\ref{chap:OL} is largely based on Haipeng Luo's lectures~\\cite{haipengnotes}. Kenneth Tay contributed significantly to the revision of these notes as a teaching assistant for the course. The original contributors to the scribe notes are Stanford students including but not limited to Anusri Pampari, Gabriel Poesia, Alexander Ke, Trenton Chang, Brad Ross, Robbie Jones, Yizhou Qian, Will Song, Daniel Do, Spencer M. Richards, Thomas Lew, David Lin, Jinhui Wang, Rafael Rafailov, Aidan Perreault, Kevin Han, Han Wu, Andrew Wang, Rohan Taori, Jonathan Lee, Rohith Kuditipudi, Kefan Dong, Roshni Sahoo, Sarah Wu, Tianyu Du, Xin Lu, Soham Sinha, Kevin Guo, Jeff Z. HaoChen, Carrie Wu, Kaidi Cao, and Ruocheng Wang.  The notes will be updated every year with new materials. The reference list is far from complete.\n\t\n\t\\tnotelong{to add names for fall 2021 quarter}\n\t\n\t\n\t\\chapter{Supervised Learning Formulations}\\label{chap:supervised}\n\t\\input{collection/01supervised.tex}\n\t\n\t\\chapter{Asymptotic Analysis}\\label{chap:asymp}\n\t\\input{collection/02asymptotics.tex}\n\t\n\t\\chapter{Concentration Inequalities}\\label{chap:conc}\n\t\\input{collection/03concentration.tex}\n\t\n\t\\chapter{Generalization Bounds via Uniform Convergence}\\label{chap:uc}\n\t\\input{collection/04-01-uniform.tex}\n\t\\input{collection/04-02-uniform.tex}\n\t\\input{collection/04-03-uniform.tex}\n\t\n\t\\chapter{Rademacher Complexity Bounds for Concrete Models and Losses}\\label{chap:gen-bounds}\n\t\\input{collection/05-01-concrete-models.tex}\n\t\\input{collection/05-02-concrete-models}\n\t\\input{collection/05-03-deep-nets.tex}\n\t\n\t\\chapter{Theoretical Mysteries in Deep Learning}\n\t\\input{collection/06-dltheory}\n\t\n\t\\chapter{Nonconvex 
Optimization}\n\t\\input{collection/07-01-nonconvex.tex}\n\t\\input{collection/07-02-nonconvex.tex}\n\t\\input{collection/07-03-ntk.tex}\n%\t\\input{collection/07-05-ntk-limitation.tex}\n\t\n\t\\chapter{Implicit/Algorithmic Regularization Effect}\n\t\\input{collection/08-01-algorithmic.tex}\n\t\\input{collection/08-02-algorithmic.tex}\n\t\\input{collection/08-03-algorithmic-new.tex}\n\t\n\t\\chapter{Unsupervised Learning and Self-supervised Learning}\n\t\\input{collection/09-01-unsupervised.tex}\n%\t\\input{collection/09-01-data-dependent.tex}\n\t\n\t\\chapter{Online learning}\\label{chap:OL}\n\t\\input{collection/10-01-online.tex}\n\t\\input{collection/10-02-online.tex}\n\t\n\t\n\t\\appendix\n\t%    Include appendix \"chapters\" here.\n\t\n\t\n\t\\backmatter\n\t%    Bibliography styles amsplain or harvard are also acceptable.\n\t\\bibliographystyle{plainnat}\n\t\\bibliography{all, bibliography}\n\t%    See note above about multiple indexes.\n\t%\\printindex\n\t\n\\end{document}\n\n%-----------------------------------------------------------------------\n% End of amsbook-template.tex\n%-----------------------------------------------------------------------\n"
  }
]