[
  {
    "path": ".github/workflows/manual.yml",
    "content": "# Workflow to ensure that whenever a GitHub PR is opened or reopened, \n# a JIRA ticket gets created automatically. \nname: Manual Workflow\n\n# Controls when the action will run. \non:\n  # Triggers the workflow when a pull request is opened or reopened (against any branch)\n  pull_request_target:\n    types: [opened, reopened]\n\n  # Allows you to run this workflow manually from the Actions tab\n  workflow_dispatch:\n\njobs:\n  test-transition-issue:\n    name: Convert Github Issue to Jira Issue\n    runs-on: ubuntu-latest\n    steps:\n    - name: Checkout\n      uses: actions/checkout@master\n\n    - name: Login\n      uses: atlassian/gajira-login@master\n      env:\n        JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}\n        JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}\n        JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}\n        \n    - name: Create NEW JIRA ticket\n      id: create\n      uses: atlassian/gajira-create@master\n      with:\n        project: CONUPDATE\n        issuetype: Task\n        summary: |\n          Github PR [Assign the ND component] | Repo: ${{ github.repository }}  | PR# ${{github.event.number}}\n        description: |\n           Repo link: https://github.com/${{ github.repository }}   \n           PR no. ${{ github.event.pull_request.number }} \n           PR title: ${{ github.event.pull_request.title }}  \n           PR description: ${{ github.event.pull_request.body }}  \n           In addition, please resolve other issues, if any. \n        fields: '{\"components\": [{\"name\":\"nd013 - Self Driving Car Engineer ND\"}], \"customfield_16449\":\"https://classroom.udacity.com/\", \"customfield_16450\":\"Resolve the PR\", \"labels\": [\"github\"], \"priority\":{\"id\": \"4\"}}'\n\n    - name: Log created issue\n      run: echo \"Issue ${{ steps.create.outputs.issue }} was created\"\n"
  },
  {
    "path": "CODEOWNERS",
    "content": "*           @udacity/active-public-content"
  },
  {
    "path": "LICENSE.txt",
    "content": "Copyright (c) 2017 Udacity, Inc.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Reinforcement Learning (RL) Cheatsheet\n\nYou are encouraged to use the [PDF file](https://github.com/udacity/rl-cheatsheet/blob/master/cheatsheet.pdf) in the repository to guide your study of RL.\n\nIf you would like to learn how to implement these algorithms, please check out Udacity's [Machine Learning Engineer Nanodegree Program](http://www.udacity.com/course/machine-learning-engineer-nanodegree--nd009).\n"
  },
  {
    "path": "cheatsheet.tex",
    "content": "\\documentclass[10pt]{amsart}\n\\usepackage[top=1in, bottom=1in, left=1in, right=1in]{geometry}\n\\geometry{letterpaper}                  \n\\geometry{landscape}               \n%\\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent\n\\usepackage{graphicx}\n\\usepackage{amssymb}\n\\usepackage{epstopdf}\n\\usepackage{tabto}\n\\usepackage{empheq, comment}\n\\usepackage[ruled]{algorithm2e}\n\\usepackage{fancyhdr}\n\\renewcommand{\\headrulewidth}{0pt}\n\\fancyhead[L]{}\n\\fancyhead[R]{\n\\includegraphics[width=4cm]{udacity-logo.png}\n}\n\\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}\n\\pagestyle{fancy}\n\n\\title{Reinforcement Learning}\n\n\\begin{document}\n\\maketitle\n\\thispagestyle{fancy}\n\n\\section{The Problem}\n\n\\begin{itemize}\n\\item[] $S_t$ \\tabto{2cm} state at time $t$\n\\item[] $A_t$ \\tabto{2cm} action at time $t$\n\\item[] $R_t$ \\tabto{2cm} reward at time $t$\n\\item[] $\\gamma$ \\tabto{2cm} discount rate (where $0 \\leq \\gamma \\leq 1$)\n\\item[] $G_t$ \\tabto{2cm} discounted return at time $t$ ($\\sum_{k=0}^\\infty \\gamma^k R_{t+k+1}$)\n\\item[] $\\mathcal{S}$ \\tabto{2cm} set of all nonterminal states\n\\item[] $\\mathcal{S}^+$ \\tabto{2cm} set of all states (including terminal states)\n\\item[] $\\mathcal{A}$ \\tabto{2cm} set of all actions \n\\item[] $\\mathcal{A}(s)$ \\tabto{2cm} set of all actions available in state $s$\n\\item[] $\\mathcal{R}$ \\tabto{2cm} set of all rewards\n\\item[] $p(s',r|s,a)$ \\tabto{2cm} probability of next state $s'$ and reward $r$, given current state $s$ and current action $a$ ($\\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t = s, A_t = a)$)\n\\end{itemize}\n\n\\section{The Solution}\n\\begin{itemize}\n\\item[] $\\pi$ \\tabto{2cm} policy \n\\item[] \\tabto{2.5cm} \\textit{if deterministic}: $\\pi(s) \\in \\mathcal{A}(s)$ for all $s \\in \\mathcal{S}$ \n\\item[] \\tabto{2.5cm} \\textit{if stochastic}: $\\pi(a|s) = 
\\mathbb{P}(A_t=a|S_t=s)$ for all $s \\in \\mathcal{S}$ and $a \\in \\mathcal{A}(s)$\n\\item[] $v_\\pi$ \\tabto{2cm} state-value function for policy $\\pi$ ($v_\\pi(s) \\doteq \\mathbb{E}[G_t|S_t=s]$ for all $s\\in\\mathcal{S}$)\n\\item[] $q_\\pi$ \\tabto{2cm} action-value function for policy $\\pi$ ($q_\\pi(s,a) \\doteq \\mathbb{E}[G_t|S_t=s, A_t=a]$ for all $s \\in \\mathcal{S}$ and $a \\in \\mathcal{A}(s)$)\n\\item[] $v_*$ \\tabto{2cm} optimal state-value function ($v_*(s) \\doteq \\max_\\pi v_\\pi(s)$ for all $s \\in \\mathcal{S}$)\n\\item[] $q_*$ \\tabto{2cm} optimal action-value function ($q_*(s,a) \\doteq \\max_\\pi q_\\pi(s,a)$ for all $s \\in \\mathcal{S}$ and $a \\in \\mathcal{A}(s)$)\n\\end{itemize}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\\newpage\n\n\\section{Bellman Equations}\n\n\\subsection{Bellman Expectation Equations}\n\n\\begin{empheq}[box=\\fbox]{align}\nv_\\pi(s) = \\sum_{a \\in \\mathcal{A}(s)}\\pi(a|s)\\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma v_\\pi(s'))\\nonumber\n\\end{empheq}\n\n\\begin{empheq}[box=\\fbox]{align}\nq_\\pi(s,a) = \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma\\sum_{a' \\in \\mathcal{A}(s')} \\pi(a'|s') q_\\pi(s',a'))\\nonumber\n\\end{empheq}\n\n\\subsection{Bellman Optimality Equations}\n\\begin{empheq}[box=\\fbox]{align}\nv_*(s) = \\max_{a \\in \\mathcal{A}(s)}\\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma v_*(s')) \\nonumber\n\\end{empheq}\n\n\\begin{empheq}[box=\\fbox]{align}\nq_*(s,a) = \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma \\max_{a'\\in\\mathcal{A}(s')}q_*(s',a')) \\nonumber\n\\end{empheq}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n\n\\subsection{Useful Formulas for Deriving the Bellman Equations}\n\n\\begin{equation*}\nv_\\pi(s) = \\sum_{a \\in \\mathcal{A}(s)}\\pi(a|s)q_\\pi(s,a) \n\\end{equation*}\n\n\\begin{equation*}\nv_*(s) = \\max_{a \\in 
\\mathcal{A}(s)}q_*(s,a) \n\\end{equation*}\n\n\\begin{equation*}\nq_\\pi(s,a) = \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma v_\\pi(s'))\n\\end{equation*}\n\n\\begin{equation*}\nq_*(s,a) = \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma v_*(s'))\n\\end{equation*}\n\n\\begin{align*}\nq_\\pi(s,a) &\\doteq \\mathbb{E}_{\\pi}[ G_t | S_t = s, A_t = a ] & (1)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}\\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)\\mathbb{E}_{\\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (2)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)\\mathbb{E}_{\\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (3)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)\\mathbb{E}_{\\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r] & (4)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)\\mathbb{E}_{\\pi}[ R_{t+1} + \\gamma G_{t+1} | S_{t+1}=s', R_{t+1}=r] & (5)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r  + \\gamma \\mathbb{E}_\\pi[G_{t+1} | S_{t+1}=s'] ) & (6)\\\\\n&= \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r  + \\gamma v_\\pi(s') ) & (7)\n\\end{align*}\n\n\\vspace{.5in}\n\nThe reasoning for the above is as follows:\n\\vspace{.2in}\n\\begin{itemize}\n\\item (1) by definition ($q_\\pi(s,a) \\doteq \\mathbb{E}_{\\pi}[ G_t | S_t = s, A_t = a ]$) \\\\\n\\item (2) Law of Total Expectation\\\\\n\\item (3) by definition ($p(s',r|s,a)\\doteq\\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)$)\\\\\n\\item (4) $\\mathbb{E}_{\\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] = \\mathbb{E}_{\\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r]$\\\\\n\\item (5) $G_t = R_{t+1} + \\gamma G_{t+1}$\\\\\n\\item (6) Linearity of Expectation\\\\\n\\item (7) $v_\\pi(s') = \\mathbb{E}_\\pi[G_{t+1} | S_{t+1}=s']$\n\\end{itemize}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\\newpage\n\n\\section{Dynamic 
Programming}\n\n%%%%%%%%%%%% 1 POLICY EVALUATION\n\\begin{algorithm}\n\t\\KwIn{MDP, policy $\\pi$, small positive number $\\theta$}\n    \t\\KwOut{$V \\approx v_\\pi$}\n    \tInitialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \\in \\mathcal{S}^+$)\\\\\n    \t\\Repeat{$\\Delta < \\theta$}{\n    \t\t$\\Delta \\leftarrow 0$\\\\\n\t\t\\For{$s \\in \\mathcal{S}$}{\n\t\t\t$v \\leftarrow V(s)$\\\\\n\t\t\t$V(s) \\leftarrow \\sum_{a\\in\\mathcal{A}(s)} \\pi(a|s) \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma V(s'))$\\\\\n\t\t\t$\\Delta \\leftarrow \\max(\\Delta, |v-V(s)|)$\n\t\t}\n    \t}\n\t\\KwRet{$V$}\n\t\\caption{Policy Evaluation}\n\\end{algorithm}\n\n%%%%%%%%%%%% 2 ESTIMATION OF ACTION VALUES\n\\begin{algorithm}\n\t\\KwIn{MDP, state-value function $V$}\n    \t\\KwOut{action-value function $Q$}\n\t\t\n    \t\\For{$s \\in \\mathcal{S}$}{\n\t\t\\For{$a \\in \\mathcal{A}(s)$}{\n\t\t$Q(s,a) \\leftarrow  \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r+\\gamma V(s'))$\n\t\t}\n\t}\n\t\\KwRet{$Q$}\n\t\\caption{Estimation of Action Values}\n\\end{algorithm}\n\n\n%%%%%%%%%%%% 3 POLICY IMPROVEMENT\n\\begin{algorithm}\n\t\\KwIn{MDP, value function $V$}\n    \t\\KwOut{policy $\\pi'$}\n\t\t\n    \t\\For{$s \\in \\mathcal{S}$}{\n\t\t\\For{$a \\in \\mathcal{A}(s)$}{\n\t\t$Q(s,a) \\leftarrow  \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r+\\gamma V(s'))$\n\t\t}\n\t\t$\\pi'(s) \\leftarrow \\arg\\max_{a\\in\\mathcal{A}(s)}Q(s,a)$\n\t\t\n\t}\n\t\\KwRet{$\\pi'$}\n\t\\caption{Policy Improvement}\n\\end{algorithm}\n\n%%%%%%%%%%%% 4 POLICY ITERATION \n\\begin{algorithm}\n\t\\KwIn{MDP, small positive number $\\theta$}\n    \t\\KwOut{policy $\\pi \\approx \\pi_*$}\n\tInitialize $\\pi$ arbitrarily (e.g., $\\pi(a|s)=\\frac{1}{|\\mathcal{A}(s)|}$ for all $s \\in \\mathcal{S}$ and $a \\in \\mathcal{A}(s)$)\\\\\n\t$policy\\text{-}stable \\leftarrow false$\\\\\n\t\\Repeat{$policy\\text{-}stable = true$}{\n\t$V \\leftarrow 
\\textbf{Policy\\_Evaluation}(\\text{MDP}, \\pi, \\theta)$\\\\\n\t$\\pi' \\leftarrow \\textbf{Policy\\_Improvement}(\\text{MDP}, V)$\\\\\n\t\\If{$\\pi= \\pi'$}{\n\t\t\t$policy\\text{-}stable \\leftarrow true$\\\\\n\t}\n\t$\\pi \\leftarrow \\pi'$\n\t}\n\t\\KwRet{$\\pi$}\n\t\\caption{Policy Iteration}\n\\end{algorithm}\n\n%%%%%%%%%%%% 5 TRUNCATED POLICY EVALUATION\n\\begin{algorithm}\n\t\\KwIn{MDP, policy $\\pi$, value function $V$, positive integer $max\\_iterations$}\n    \t\\KwOut{$V \\approx v_\\pi$ (if $max\\_iterations$ is large enough)}\n\t$counter \\leftarrow 0$\\\\\n    \t\\While{$counter < max\\_iterations$}{\n\t\t\\For{$s \\in \\mathcal{S}$}{\n\t\t\t$V(s) \\leftarrow \\sum_{a\\in\\mathcal{A}(s)} \\pi(a|s) \\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma V(s'))$\\\\\n\t\t}\n\t\t$counter \\leftarrow counter + 1$ \n    \t}\n\t\\KwRet{$V$}\n\t\\caption{Truncated Policy Evaluation}\n\\end{algorithm}\n\n%%%%%%%%%%%% 6 TRUNCATED POLICY ITERATION\n\\begin{algorithm}\n\t\\KwIn{MDP, positive integer $max\\_iterations$, small positive number $\\theta$}\n    \t\\KwOut{policy $\\pi \\approx \\pi_*$}\n\tInitialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \\in \\mathcal{S}^+$)\\\\\n\tInitialize $\\pi$ arbitrarily (e.g., $\\pi(a|s)=\\frac{1}{|\\mathcal{A}(s)|}$ for all $s \\in \\mathcal{S}$ and $a \\in \\mathcal{A}(s)$)\\\\\n\t\\Repeat{$\\max_{s\\in\\mathcal{S}}|V(s) - V_{old}(s)| < \\theta$}{\n\t$\\pi \\leftarrow \\textbf{Policy\\_Improvement}(\\text{MDP}, V)$\\\\\n\t$V_{old} \\leftarrow V$\\\\\n\t$V \\leftarrow \\textbf{Truncated\\_Policy\\_Evaluation}(\\text{MDP}, \\pi, V, max\\_iterations)$\n\t}\n\t\\KwRet{$\\pi$}\n\t\\caption{Truncated Policy Iteration}\n\\end{algorithm}\n\n%%%%%%%%%%%% 7 VALUE ITERATION\n\\begin{algorithm}\n\t\\KwIn{MDP, small positive number $\\theta$}\n    \t\\KwOut{policy $\\pi \\approx \\pi_*$}\n    \tInitialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \\in \\mathcal{S}^+$)\\\\\n    \t\\Repeat{$\\Delta < \\theta$}{\n    
\t\t$\\Delta \\leftarrow 0$\\\\\n\t\t\\For{$s \\in \\mathcal{S}$}{\n\t\t\t$v \\leftarrow V(s)$\\\\\n\t\t\t$V(s) \\leftarrow \\max_{a\\in\\mathcal{A}(s)}\\sum_{s' \\in \\mathcal{S}, r\\in\\mathcal{R}}p(s',r|s,a)(r + \\gamma V(s'))$\\\\\n\t\t\t$\\Delta \\leftarrow \\max(\\Delta, |v-V(s)|)$\n\t\t}\n    \t}\n\t$\\pi \\leftarrow \\textbf{Policy\\_Improvement}(\\text{MDP}, V)$ \\\\\n\t\\KwRet{$\\pi$}\n\t\\caption{Value Iteration}\n\\end{algorithm}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\\clearpage\n\n\\section{Monte Carlo Methods}\n\n%%%%%%%%%%%% 8 FIRST-VISIT MC PREDICTION (STATE VALUES)\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$}\n    \t\\KwOut{value function $V$ ($\\approx v_\\pi$ if $num\\_episodes$ is large enough)}\n\tInitialize $N(s) = 0$ for all $s\\in\\mathcal{S}$ \\\\\n\tInitialize $returns\\_sum(s) = 0$ for all $s\\in\\mathcal{S}$ \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n    \t\tGenerate an episode $S_0, A_0, R_1, \\ldots, S_T$ using $\\pi$\\\\\n\t\t\\For{$t \\leftarrow 0 \\textbf{ to }T-1$}{\n\t\t\t\\uIf{$S_t$ is a first visit (with return $G_t$)}{\n\t\t\t\t$N(S_t) \\leftarrow N(S_t) + 1$\\\\\n\t\t\t\t$returns\\_sum(S_t) \\leftarrow returns\\_sum(S_t) + G_t$\n\t\t\t}\n\t\t}\n\t}\n\t$V(s) \\leftarrow returns\\_sum(s)/N(s)$ for all $s\\in\\mathcal{S}$\\\\\n\t\\KwRet{$V$}\n\t\\caption{First-Visit MC Prediction (\\textit{for state values})}\n\\end{algorithm}\n\n%%%%%%%%%%%% 9 FIRST-VISIT MC PREDICTION (ACTION VALUES)\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$}\n    \t\\KwOut{value function $Q$ ($\\approx q_\\pi$ if $num\\_episodes$ is large enough)}\n\tInitialize $N(s,a) = 0$ for all $s\\in\\mathcal{S}, a\\in\\mathcal{A}(s)$ \\\\\n\tInitialize $returns\\_sum(s,a) = 0$ for all $s\\in\\mathcal{S}, a\\in\\mathcal{A}(s)$ \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n    \t\tGenerate an episode $S_0, A_0, R_1, \\ldots, 
S_T$ using $\\pi$\\\\\n\t\t\\For{$t \\leftarrow 0 \\textbf{ to }T-1$}{\n\t\t\t\\uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{\n\t\t\t\t$N(S_t, A_t) \\leftarrow N(S_t, A_t) + 1$\\\\\n\t\t\t\t$returns\\_sum(S_t, A_t) \\leftarrow returns\\_sum(S_t, A_t) + G_t$\n\t\t\t}\n\t\t}\n\t}\n\t$Q(s,a) \\leftarrow returns\\_sum(s,a)/N(s,a)$ for all $s\\in\\mathcal{S}$, $a\\in\\mathcal{A}(s)$\\\\\n\t\\KwRet{$Q$}\n\t\\caption{First-Visit MC Prediction (\\textit{for action values})}\n\\end{algorithm}\n\n%%%%%%%%%%%% 10 GLIE MC CONTROL\n\\begin{algorithm}\n\t\\KwIn{positive integer $num\\_episodes$, GLIE $\\{\\epsilon_i\\}$}\n    \t\\KwOut{policy $\\pi$ ($\\approx \\pi_*$ if $num\\_episodes$ is large enough)}\n\tInitialize $Q(s,a) = 0$ for all $s\\in\\mathcal{S}$ and $a\\in\\mathcal{A}(s)$ \\\\\n\tInitialize $N(s,a) = 0$ for all $s\\in\\mathcal{S}, a\\in\\mathcal{A}(s)$ \\\\\n\t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n\t$\\epsilon \\leftarrow \\epsilon_i$\\\\\n\t$\\pi \\leftarrow \\epsilon\\text{-greedy}(Q)$\\\\\n    \tGenerate an episode $S_0, A_0, R_1, \\ldots, S_T$ using $\\pi$\\\\\n\t\\For{$t \\leftarrow 0 \\textbf{ to }T-1$}{\n\t\t\t\\uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{\n\t\t\t\t$N(S_t,A_t) \\leftarrow N(S_t,A_t) + 1$\\\\\n\t\t\t\t$Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\frac{1}{N(S_t,A_t)}(G_t - Q(S_t, A_t))$\n\t\t\t}\n\t\t}\n\t}\n\t\\KwRet{$\\pi$}\n\t\\caption{First-Visit GLIE MC Control}\n\\end{algorithm}\n\n\n%%%%%%%%%%%% 11 CONSTANT-ALPHA MC CONTROL\n\\begin{algorithm}\n\t\\KwIn{positive integer $num\\_episodes$, small positive fraction $\\alpha$, GLIE $\\{\\epsilon_i\\}$}\n    \t\\KwOut{policy $\\pi$ ($\\approx \\pi_*$ if $num\\_episodes$ is large enough)}\n\tInitialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\\in\\mathcal{S}$ and $a\\in\\mathcal{A}(s)$) \\\\\n\t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n\t$\\epsilon \\leftarrow \\epsilon_i$\\\\\n\t$\\pi \\leftarrow \\epsilon\\text{-greedy}(Q)$\\\\\n    
\tGenerate an episode $S_0, A_0, R_1, \\ldots, S_T$ using $\\pi$\\\\\n\t\\For{$t \\leftarrow 0 \\textbf{ to }T-1$}{\n\t\t\t\\uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{\n\t\t\t\t$Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha(G_t - Q(S_t, A_t))$\n\t\t\t}\n\t\t}\n\t}\n\t\\KwRet{$\\pi$}\n\t\\caption{First-Visit Constant-$\\alpha$ (GLIE) MC Control}\n\\end{algorithm}\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\\clearpage\n\n\\section{Temporal-Difference Methods}\n\n%%%%%%%%%%%% 12 TD(0)\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$, small positive fraction $\\alpha$}\n    \t\\KwOut{value function $V$ ($\\approx v_\\pi$ if $num\\_episodes$ is large enough)}\n\tInitialize $V$ arbitrarily (e.g., $V(s) = 0$ for all $s\\in\\mathcal{S}^+$) \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n    \t\tObserve $S_0$\\\\\n\t\t$t\\leftarrow 0$\\\\\n\t\t\\Repeat{$S_t$ is terminal}{\n\t\tChoose action $A_t$ using policy $\\pi$\\\\\n\t\tTake action $A_t$ and observe $R_{t+1}, S_{t+1}$\\\\\n\t\t$V(S_t) \\leftarrow V(S_t) + \\alpha (R_{t+1} + \\gamma V(S_{t+1}) - V(S_t))$\\\\\n\t\t$t \\leftarrow t+1$\n\t\t}\n\t}\n\t\\KwRet{$V$}\n\t\\caption{TD(0)}\n\\end{algorithm}\n\n%%%%%%%%%%%% 13 Sarsa\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$, small positive fraction $\\alpha$, GLIE $\\{\\epsilon_i\\}$}\n    \t\\KwOut{value function $Q$ ($\\approx q_\\pi$ if $num\\_episodes$ is large enough)}\n\tInitialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\\in\\mathcal{S}$ and $a\\in\\mathcal{A}(s)$, and $Q(terminal\\text{-}state, \\cdot)=0$) \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n\t\t$\\epsilon \\leftarrow \\epsilon_i$\\\\\n\t\tObserve $S_0$\\\\\n\t\tChoose action $A_0$ using policy derived from $Q$ (e.g., $\\epsilon$-greedy)\\\\\n\t\t$t\\leftarrow 0$\\\\\n\t\t\\Repeat{$S_t$ is terminal}{\n\t\tTake action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\\\\n\t\tChoose action $A_{t+1}$ using 
policy derived from $Q$ (e.g., $\\epsilon$-greedy)\\\\\n\t\t$Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha (R_{t+1} + \\gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t))$\\\\\n\t\t$t \\leftarrow t+1$\n\t\t}\n\t}\n\t\\KwRet{$Q$}\n\t\\caption{Sarsa}\n\\end{algorithm}\n\n%%%%%%%%%%%% 14 Q-Learning\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$, small positive fraction $\\alpha$, GLIE $\\{\\epsilon_i\\}$}\n    \t\\KwOut{value function $Q$ ($\\approx q_*$ if $num\\_episodes$ is large enough)}\n\tInitialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\\in\\mathcal{S}$ and $a\\in\\mathcal{A}(s)$, and $Q(terminal\\text{-}state, \\cdot)=0$) \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n\t\t$\\epsilon \\leftarrow \\epsilon_i$\\\\\n\t\tObserve $S_0$\\\\\n\t\t$t\\leftarrow 0$\\\\\n\t\t\\Repeat{$S_t$ is terminal}{\n\t\tChoose action $A_t$ using policy derived from $Q$ (e.g., $\\epsilon$-greedy)\\\\\n\t\tTake action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\\\\n\t\t$Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha (R_{t+1} + \\gamma \\max_{a}Q(S_{t+1}, a) - Q(S_t, A_t))$\\\\\n\t\t$t \\leftarrow t+1$\n\t\t}\n\t}\n\t\\KwRet{$Q$}\n\t\\caption{Sarsamax (Q-Learning)}\n\\end{algorithm}\n\n\n%%%%%%%%%%%% 15 Expected Sarsa\n\\begin{algorithm}\n\t\\KwIn{policy $\\pi$, positive integer $num\\_episodes$, small positive fraction $\\alpha$, GLIE $\\{\\epsilon_i\\}$}\n    \t\\KwOut{value function $Q$ ($\\approx q_\\pi$ if $num\\_episodes$ is large enough)}\n\tInitialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\\in\\mathcal{S}$ and $a\\in\\mathcal{A}(s)$, and $Q(terminal\\text{-}state, \\cdot)=0$) \\\\\n    \t\\For{$i \\leftarrow 1 \\textbf{ to } num\\_episodes$}{\n\t\t$\\epsilon \\leftarrow \\epsilon_i$\\\\\n\t\tObserve $S_0$\\\\\n\t\t$t\\leftarrow 0$\\\\\n\t\t\\Repeat{$S_t$ is terminal}{\n\t\tChoose action $A_t$ using policy derived from $Q$ (e.g., $\\epsilon$-greedy)\\\\\n\t\tTake action $A_{t}$ and observe $R_{t+1}, 
S_{t+1}$\\\\\n\t\t$Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha (R_{t+1} + \\gamma \\sum_{a}\\pi(a|S_{t+1})Q(S_{t+1}, a) - Q(S_t, A_t))$\\\\\n\t\t$t \\leftarrow t+1$\n\t\t}\n\t}\n\t\\KwRet{$Q$}\n\t\\caption{Expected Sarsa}\n\\end{algorithm}\n\n\\end{document}  "
  }
]