Repository: udacity/rl-cheatsheet
Branch: master
Commit: 837b00b64943
Files: 5
Total size: 19.2 KB

Directory structure:
gitextract_nv914m6c/
├── .github/
│   └── workflows/
│       └── manual.yml
├── CODEOWNERS
├── LICENSE.txt
├── README.md
└── cheatsheet.tex

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/manual.yml
================================================
# Workflow to ensure whenever a Github PR is submitted,
# a JIRA ticket gets created automatically.
name: Manual Workflow

# Controls when the action will run.
on:
  # Triggers the workflow on pull request events
  pull_request_target:
    types: [opened, reopened]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  test-transition-issue:
    name: Convert Github Issue to Jira Issue
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@master

      - name: Login
        uses: atlassian/gajira-login@master
        env:
          JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
          JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
          JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}

      - name: Create NEW JIRA ticket
        id: create
        uses: atlassian/gajira-create@master
        with:
          project: CONUPDATE
          issuetype: Task
          summary: |
            Github PR [Assign the ND component] | Repo: ${{ github.repository }} | PR# ${{github.event.number}}
          description: |
            Repo link: https://github.com/${{ github.repository }}
            PR no. ${{ github.event.pull_request.number }}
            PR title: ${{ github.event.pull_request.title }}
            PR description: ${{ github.event.pull_request.body }}
            In addition, please resolve other issues, if any.
          fields: '{"components": [{"name":"nd013 - Self Driving Car Engineer ND"}], "customfield_16449":"https://classroom.udacity.com/", "customfield_16450":"Resolve the PR", "labels": ["github"], "priority":{"id": "4"}}'

      - name: Log created issue
        run: echo "Issue ${{ steps.create.outputs.issue }} was created"

================================================
FILE: CODEOWNERS
================================================
* @udacity/active-public-content

================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2017 Udacity, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# Reinforcement Learning (RL) Cheatsheet

You are encouraged to use the [PDF file](https://github.com/udacity/rl-cheatsheet/blob/master/cheatsheet.pdf) in the repository to guide your study of RL. If you would like to learn how to implement these algorithms, please check out Udacity's [Machine Learning Engineer Nanodegree Program](http://www.udacity.com/course/machine-learning-engineer-nanodegree--nd009).

================================================
FILE: cheatsheet.tex
================================================
\documentclass[10pt]{amsart}
\usepackage[top=1in, bottom=1in, left=1in, right=1in]{geometry}
\geometry{letterpaper}
\geometry{landscape}
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage{tabto}
\usepackage{empheq, comment}
\usepackage[ruled]{algorithm2e}
\usepackage{fancyhdr}
\renewcommand{\headrulewidth}{0pt}
\fancyhead[L]{}
\fancyhead[R]{ \includegraphics[width=4cm]{udacity-logo.png} }
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}

\pagestyle{fancy}

\title{Reinforcement Learning}

\begin{document}
\maketitle
\thispagestyle{fancy}

\section{The Problem}
\begin{itemize}
\item[] $S_t$ \tabto{2cm} state at time $t$
\item[] $A_t$ \tabto{2cm} action at time $t$
\item[] $R_t$ \tabto{2cm} reward at time $t$
\item[] $\gamma$ \tabto{2cm} discount rate (where $0 \leq \gamma \leq 1$)
\item[] $G_t$ \tabto{2cm} discounted return at time $t$ ($\sum_{k=0}^\infty \gamma^k R_{t+k+1}$)
\item[] $\mathcal{S}$ \tabto{2cm} set of all nonterminal states
\item[] $\mathcal{S}^+$ \tabto{2cm} set of all states (including terminal states)
\item[] $\mathcal{A}$ \tabto{2cm} set of all actions
\item[] $\mathcal{A}(s)$ \tabto{2cm} set of all actions available in state $s$
\item[] $\mathcal{R}$ \tabto{2cm} set of all rewards
\item[] $p(s',r|s,a)$ \tabto{2cm} probability of next state $s'$ and reward $r$, given current state $s$ and current action $a$ ($\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t = s, A_t = a)$)
\end{itemize}

\section{The Solution}
\begin{itemize}
\item[] $\pi$ \tabto{2cm} policy
\item[] \tabto{2.5cm} \textit{if deterministic}: $\pi(s) \in \mathcal{A}(s)$ for all $s \in \mathcal{S}$
\item[] \tabto{2.5cm} \textit{if stochastic}: $\pi(a|s) = \mathbb{P}(A_t=a|S_t=s)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$
\item[] $v_\pi$ \tabto{2cm} state-value function for policy $\pi$ ($v_\pi(s) \doteq \mathbb{E}[G_t|S_t=s]$ for all $s\in\mathcal{S}$)
\item[] $q_\pi$ \tabto{2cm} action-value function for policy $\pi$ ($q_\pi(s,a) \doteq \mathbb{E}[G_t|S_t=s, A_t=a]$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\item[] $v_*$ \tabto{2cm} optimal state-value function ($v_*(s) \doteq \max_\pi v_\pi(s)$ for all $s \in \mathcal{S}$)
\item[] $q_*$ \tabto{2cm} optimal action-value function ($q_*(s,a) \doteq \max_\pi q_\pi(s,a)$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Bellman Equations}

\subsection{Bellman Expectation Equations}
\begin{empheq}[box=\fbox]{align}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))\nonumber
\end{empheq}
\begin{empheq}[box=\fbox]{align}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma\sum_{a' \in \mathcal{A}(s')} \pi(a'|s') q_\pi(s',a'))\nonumber
\end{empheq}

\subsection{Bellman Optimality Equations}
\begin{empheq}[box=\fbox]{align}
v_*(s) = \max_{a \in \mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s')) \nonumber
\end{empheq}
\begin{empheq}[box=\fbox]{align}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \max_{a'\in\mathcal{A}(s')}q_*(s',a')) \nonumber
\end{empheq}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Useful Formulas for Deriving the Bellman Equations}
\begin{equation*}
v_\pi(s) = \sum_{a \in \mathcal{A}(s)}\pi(a|s)q_\pi(s,a)
\end{equation*}
\begin{equation*}
v_*(s) = \max_{a \in \mathcal{A}(s)}q_*(s,a)
\end{equation*}
\begin{equation*}
q_\pi(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s'))
\end{equation*}
\begin{equation*}
q_*(s,a) = \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_*(s'))
\end{equation*}

\begin{align*}
q_\pi(s,a) &\doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ] & (1)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (2)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] & (3)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r] & (4)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)\mathbb{E}_{\pi}[ R_{t+1} + \gamma G_{t+1} | S_{t+1}=s', R_{t+1}=r] & (5)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s'] ) & (6)\\
&= \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma v_\pi(s') ) & (7)
\end{align*}

\vspace{.5in}

The reasoning for the above is as follows:
\vspace{.2in}
\begin{itemize}
\item (1) by definition ($q_\pi(s,a) \doteq \mathbb{E}_{\pi}[ G_t | S_t = s, A_t = a ]$) \\
\item (2) Law of Total Expectation\\
\item (3) by definition ($p(s',r|s,a)\doteq\mathbb{P}(S_{t+1}=s', R_{t+1}=r|S_t=s, A_t=a)$)\\
\item (4) $\mathbb{E}_{\pi}[ G_{t} | S_t = s, A_t = a, S_{t+1}=s', R_{t+1}=r] = \mathbb{E}_{\pi}[ G_{t} | S_{t+1}=s', R_{t+1}=r]$, by the Markov property\\
\item (5) $G_t = R_{t+1} + \gamma G_{t+1}$\\
\item (6) Linearity of Expectation\\
\item (7) $v_\pi(s') = \mathbb{E}_\pi[G_{t+1} | S_{t+1}=s']$
\end{itemize}
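\subsection{Worked Example in Python}
For a fixed policy, the Bellman expectation equation is a system of linear equations in $v_\pi$, one equation per state, so on a small MDP it can be solved exactly rather than iteratively. The sketch below illustrates this on a made-up two-state MDP; the data layout (\texttt{p[s][a]} as a list of \texttt{(prob, next\_state, reward)} triples) and all variable names are illustrative choices, not something prescribed by the cheatsheet.
\begin{verbatim}
# Minimal check of the Bellman expectation equation on a toy MDP.
# The MDP, the policy, and every name below are illustrative only.
import numpy as np

gamma = 0.9

# A two-state continuing MDP with deterministic transitions:
# p[s][a] = list of (prob, next_state, reward) triples.
p = {
    0: {"stay": [(1.0, 0, 1.0)], "go": [(1.0, 1, 0.0)]},
    1: {"stay": [(1.0, 1, 2.0)], "go": [(1.0, 0, 0.0)]},
}
# Equiprobable (stochastic) policy pi(a|s).
pi = {s: {a: 0.5 for a in p[s]} for s in p}

states = sorted(p)
n = len(states)

# For a fixed policy, the Bellman expectation equation
#   v_pi(s) = sum_a pi(a|s) sum_{s',r} p(s',r|s,a) (r + gamma v_pi(s'))
# is linear in v_pi and can be solved exactly: v = (I - gamma P_pi)^(-1) r_pi.
P_pi = np.zeros((n, n))   # state-to-state transition matrix under pi
r_pi = np.zeros(n)        # expected one-step reward under pi
for s in states:
    for a, prob_a in pi[s].items():
        for prob, s_next, r in p[s][a]:
            P_pi[s, s_next] += prob_a * prob
            r_pi[s] += prob_a * prob * r

v = np.linalg.solve(np.eye(n) - gamma * P_pi, r_pi)
print("v_pi =", v)

# Sanity check: the solution satisfies the right-hand side of the equation.
assert np.allclose(v, r_pi + gamma * P_pi @ v)
\end{verbatim}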
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{Dynamic Programming}

%%%%%%%%%%%% 1 POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, small positive number $\theta$}
\KwOut{$V \approx v_\pi$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
\KwRet{$V$}
\caption{Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 2 ESTIMATION OF ACTION VALUES
\begin{algorithm}
\KwIn{MDP, state-value function $V$}
\KwOut{action-value function $Q$}
\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
}
\KwRet{$Q$}
\caption{Estimation of Action Values}
\end{algorithm}

%%%%%%%%%%%% 3 POLICY IMPROVEMENT
\begin{algorithm}
\KwIn{MDP, value function $V$}
\KwOut{policy $\pi'$}
\For{$s \in \mathcal{S}$}{
  \For{$a \in \mathcal{A}(s)$}{
    $Q(s,a) \leftarrow \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r+\gamma V(s'))$
  }
  $\pi'(s) \leftarrow \arg\max_{a\in\mathcal{A}(s)}Q(s,a)$
}
\KwRet{$\pi'$}
\caption{Policy Improvement}
\end{algorithm}

%%%%%%%%%%%% 4 POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
$policy\text{-}stable \leftarrow false$\\
\Repeat{$policy\text{-}stable = true$}{
  $V \leftarrow \textbf{Policy\_Evaluation}(\text{MDP}, \pi, \theta)$\\
  $\pi' \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  \If{$\pi = \pi'$}{
    $policy\text{-}stable \leftarrow true$\\
  }
  $\pi \leftarrow \pi'$
}
\KwRet{$\pi$}
\caption{Policy Iteration}
\end{algorithm}

%%%%%%%%%%%% 5 TRUNCATED POLICY EVALUATION
\begin{algorithm}
\KwIn{MDP, policy $\pi$, value function $V$, positive integer $max\_iterations$}
\KwOut{$V \approx v_\pi$ (if $max\_iterations$ is large enough)}
$counter \leftarrow 0$\\
\While{$counter < max\_iterations$}{
  \For{$s \in \mathcal{S}$}{
    $V(s) \leftarrow \sum_{a\in\mathcal{A}(s)} \pi(a|s) \sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
  }
  $counter \leftarrow counter + 1$
}
\KwRet{$V$}
\caption{Truncated Policy Evaluation}
\end{algorithm}

%%%%%%%%%%%% 6 TRUNCATED POLICY ITERATION
\begin{algorithm}
\KwIn{MDP, positive integer $max\_iterations$, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
Initialize $\pi$ arbitrarily (e.g., $\pi(a|s)=\frac{1}{|\mathcal{A}(s)|}$ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$)\\
\Repeat{$\max_{s\in\mathcal{S}}|V(s) - V_{old}(s)| < \theta$}{
  $\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$\\
  $V_{old} \leftarrow V$\\
  $V \leftarrow \textbf{Truncated\_Policy\_Evaluation}(\text{MDP}, \pi, V, max\_iterations)$
}
\KwRet{$\pi$}
\caption{Truncated Policy Iteration}
\end{algorithm}

%%%%%%%%%%%% 7 VALUE ITERATION
\begin{algorithm}
\KwIn{MDP, small positive number $\theta$}
\KwOut{policy $\pi \approx \pi_*$}
Initialize $V$ arbitrarily (e.g., $V(s)=0$ for all $s \in \mathcal{S}^+$)\\
\Repeat{$\Delta < \theta$}{
  $\Delta \leftarrow 0$\\
  \For{$s \in \mathcal{S}$}{
    $v \leftarrow V(s)$\\
    $V(s) \leftarrow \max_{a\in\mathcal{A}(s)}\sum_{s' \in \mathcal{S}, r\in\mathcal{R}}p(s',r|s,a)(r + \gamma V(s'))$\\
    $\Delta \leftarrow \max(\Delta, |v-V(s)|)$
  }
}
$\pi \leftarrow \textbf{Policy\_Improvement}(\text{MDP}, V)$ \\
\KwRet{$\pi$}
\caption{Value Iteration}
\end{algorithm}
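\subsection{Python Sketch: Value Iteration}
The Value Iteration algorithm above translates almost line for line into code. The sketch below is one minimal version, assuming the same toy MDP representation as in the worked example (\texttt{p[s][a]} holding \texttt{(prob, next\_state, reward)} triples, terminal states simply absent from \texttt{p}); the names \texttt{one\_step\_lookahead} and \texttt{value\_iteration} are illustrative helpers, not part of the cheatsheet.
\begin{verbatim}
# Value iteration followed by one greedy policy-improvement step.
# The MDP layout and all names are illustrative assumptions.

def one_step_lookahead(p, V, s, a, gamma):
    # sum_{s', r} p(s',r|s,a) * (r + gamma * V(s'))
    return sum(prob * (r + gamma * V.get(s_next, 0.0))
               for prob, s_next, r in p[s][a])

def value_iteration(p, gamma, theta):
    V = {s: 0.0 for s in p}                    # V(s) = 0 for all s
    while True:
        delta = 0.0
        for s in p:
            v = V[s]
            V[s] = max(one_step_lookahead(p, V, s, a, gamma) for a in p[s])
            delta = max(delta, abs(v - V[s]))
        if delta < theta:                      # stop once updates are tiny
            break
    # Policy improvement: act greedily with respect to the final V.
    policy = {s: max(p[s], key=lambda a: one_step_lookahead(p, V, s, a, gamma))
              for s in p}
    return policy, V

# Example usage with the two-state MDP from the worked example:
# policy, V = value_iteration(p, gamma=0.9, theta=1e-8)
\end{verbatim}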
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Monte Carlo Methods}

%%%%%%%%%%%% 8 FIRST-VISIT MC PREDICTION (STATE VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s) = 0$ for all $s\in\mathcal{S}$ \\
Initialize $returns\_sum(s) = 0$ for all $s\in\mathcal{S}$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$S_t$ is a first visit (with return $G_t$)}{
      $N(S_t) \leftarrow N(S_t) + 1$\\
      $returns\_sum(S_t) \leftarrow returns\_sum(S_t) + G_t$
    }
  }
}
$V(s) \leftarrow returns\_sum(s)/N(s)$ for all $s\in\mathcal{S}$ with $N(s) > 0$\\
\KwRet{$V$}
\caption{First-Visit MC Prediction (\textit{for state values})}
\end{algorithm}

%%%%%%%%%%%% 9 FIRST-VISIT MC PREDICTION (ACTION VALUES)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
Initialize $returns\_sum(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t, A_t) \leftarrow N(S_t, A_t) + 1$\\
      $returns\_sum(S_t, A_t) \leftarrow returns\_sum(S_t, A_t) + G_t$
    }
  }
}
$Q(s,a) \leftarrow returns\_sum(s,a)/N(s,a)$ for all $s\in\mathcal{S}$, $a\in\mathcal{A}(s)$ with $N(s,a) > 0$\\
\KwRet{$Q$}
\caption{First-Visit MC Prediction (\textit{for action values})}
\end{algorithm}

%%%%%%%%%%%% 10 GLIE MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$ \\
Initialize $N(s,a) = 0$ for all $s\in\mathcal{S}, a\in\mathcal{A}(s)$ \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $N(S_t,A_t) \leftarrow N(S_t,A_t) + 1$\\
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \frac{1}{N(S_t,A_t)}(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit GLIE MC Control}
\end{algorithm}

%%%%%%%%%%%% 11 CONSTANT-ALPHA MC CONTROL
\begin{algorithm}
\KwIn{positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{policy $\pi$ ($\approx \pi_*$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  $\pi \leftarrow \epsilon\text{-greedy}(Q)$\\
  Generate an episode $S_0, A_0, R_1, \ldots, S_T$ using $\pi$\\
  \For{$t \leftarrow 0 \textbf{ to }T-1$}{
    \uIf{$(S_t,A_t)$ is a first visit (with return $G_t$)}{
      $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha(G_t - Q(S_t, A_t))$
    }
  }
}
\KwRet{$\pi$}
\caption{First-Visit Constant-$\alpha$ (GLIE) MC Control}
\end{algorithm}
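\subsection{Python Sketch: Constant-$\alpha$ MC Control}
Monte Carlo control needs an environment to sample episodes from, so the sketch below assumes a hypothetical episodic environment in which \texttt{env.reset()} returns a state, \texttt{env.step(action)} returns \texttt{(next\_state, reward, done)}, and \texttt{env.actions(state)} returns the available actions. That interface, the $\epsilon_i = 1/i$ schedule, and all function names are illustrative assumptions; any GLIE schedule works in principle.
\begin{verbatim}
# First-Visit Constant-alpha MC Control against a hypothetical
# episodic environment (interface described in the text above).
import random
from collections import defaultdict

def epsilon_greedy(Q, state, actions, epsilon):
    # With probability epsilon explore, otherwise act greedily w.r.t. Q.
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q[(state, a)])

def mc_control(env, num_episodes, alpha, gamma=1.0):
    Q = defaultdict(float)                    # Q(s,a) = 0 initially
    for i in range(1, num_episodes + 1):
        epsilon = 1.0 / i                     # one possible GLIE schedule
        # Generate an episode S_0, A_0, R_1, ..., S_T with eps-greedy policy.
        episode, state, done = [], env.reset(), False
        while not done:
            action = epsilon_greedy(Q, state, env.actions(state), epsilon)
            state_next, reward, done = env.step(action)
            episode.append((state, action, reward))
            state = state_next
        # Record the index of the first visit to each (state, action) pair.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)
        # Walk the episode backwards, accumulating the returns G_t.
        G = 0.0
        for t in reversed(range(len(episode))):
            s, a, r = episode[t]
            G = r + gamma * G
            if first_visit[(s, a)] == t:      # first-visit update only
                Q[(s, a)] += alpha * (G - Q[(s, a)])
    return Q
\end{verbatim}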
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage

\section{Temporal-Difference Methods}

%%%%%%%%%%%% 12 TD(0)
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$}
\KwOut{value function $V$ ($\approx v_\pi$ if $num\_episodes$ is large enough)}
Initialize $V$ arbitrarily (e.g., $V(s) = 0$ for all $s\in\mathcal{S}^+$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  Observe $S_0$\\
  $t \leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy $\pi$\\
    Take action $A_t$ and observe $R_{t+1}, S_{t+1}$\\
    $V(S_t) \leftarrow V(S_t) + \alpha (R_{t+1} + \gamma V(S_{t+1}) - V(S_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$V$}
\caption{TD(0)}
\end{algorithm}

%%%%%%%%%%%% 13 Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  Choose action $A_0$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
  $t \leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    Choose action $A_{t+1}$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsa}
\end{algorithm}

%%%%%%%%%%%% 14 Q-Learning
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t \leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \max_{a}Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Sarsamax (Q-Learning)}
\end{algorithm}
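\subsection{Python Sketch: Sarsamax (Q-Learning)}
Under the same hypothetical environment interface as the MC control sketch, Sarsamax needs only a small change relative to Sarsa: the bootstrap target uses $\max_a Q(S_{t+1}, a)$ instead of the value of the action actually chosen next. The sketch reuses the \texttt{epsilon\_greedy} helper from the MC control sketch and the $\epsilon_i = 1/i$ schedule; all names are illustrative.
\begin{verbatim}
# Sarsamax (Q-Learning) against the same hypothetical environment
# interface as the MC control sketch; all names are illustrative.
from collections import defaultdict

def q_learning(env, num_episodes, alpha, gamma=1.0):
    Q = defaultdict(float)                    # Q(s,a) = 0 initially
    for i in range(1, num_episodes + 1):
        epsilon = 1.0 / i                     # one possible GLIE schedule
        state, done = env.reset(), False
        while not done:
            action = epsilon_greedy(Q, state, env.actions(state), epsilon)
            state_next, reward, done = env.step(action)
            # Off-policy target: R_{t+1} + gamma * max_a Q(S_{t+1}, a),
            # with Q(terminal-state, .) treated as 0.
            target = reward
            if not done:
                target += gamma * max(Q[(state_next, a)]
                                      for a in env.actions(state_next))
            Q[(state, action)] += alpha * (target - Q[(state, action)])
            state = state_next
    return Q
\end{verbatim}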
%%%%%%%%%%%% 15 Expected Sarsa
\begin{algorithm}
\KwIn{policy $\pi$, positive integer $num\_episodes$, small positive fraction $\alpha$, GLIE $\{\epsilon_i\}$}
\KwOut{value function $Q$ ($\approx q_\pi$ if $num\_episodes$ is large enough)}
Initialize $Q$ arbitrarily (e.g., $Q(s,a) = 0$ for all $s\in\mathcal{S}$ and $a\in\mathcal{A}(s)$, and $Q(terminal\text{-}state, \cdot)=0$) \\
\For{$i \leftarrow 1 \textbf{ to } num\_episodes$}{
  $\epsilon \leftarrow \epsilon_i$\\
  Observe $S_0$\\
  $t \leftarrow 0$\\
  \Repeat{$S_t$ is terminal}{
    Choose action $A_t$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)\\
    Take action $A_{t}$ and observe $R_{t+1}, S_{t+1}$\\
    $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha (R_{t+1} + \gamma \sum_{a}\pi(a|S_{t+1})Q(S_{t+1}, a) - Q(S_t, A_t))$\\
    $t \leftarrow t+1$
  }
}
\KwRet{$Q$}
\caption{Expected Sarsa}
\end{algorithm}

\end{document}