Repository: PacktPublishing/Reinforcement-Learning-Algorithms-with-Python
Branch: master
Commit: d144d314b3b5
Files: 26
Total size: 205.3 KB
Directory structure:
gitextract_o5sg2_86/
├── Chapter02/
│ └── Code.ipynb
├── Chapter03/
│ ├── frozenlake8x8_policyiteration.py
│ └── frozenlake8x8_valueiteration.py
├── Chapter04/
│ └── SARSA Q_learning Taxi-v2.py
├── Chapter05/
│ ├── .ipynb_checkpoints/
│ │ └── Untitled-checkpoint.ipynb
│ ├── DQN_Atari.py
│ ├── DQN_variations_Atari.py
│ ├── Untitled.ipynb
│ ├── atari_wrappers.py
│ └── untitled
├── Chapter06/
│ ├── AC.py
│ ├── REINFORCE.py
│ └── REINFORCE_baseline.py
├── Chapter07/
│ ├── PPO.py
│ └── TRPO.py
├── Chapter08/
│ ├── DDPG.py
│ └── TD3.py
├── Chapter09/
│ └── ME-TRPO.py
├── Chapter10/
│ ├── DAgger.py
│ └── expert/
│ ├── checkpoint
│ ├── model.ckpt.data-00000-of-00001
│ ├── model.ckpt.index
│ └── model.ckpt.meta
├── Chapter11/
│ └── ES.py
├── Chapter12/
│ └── ESBAS.py
└── README.md
================================================
FILE CONTENTS
================================================
================================================
FILE: Chapter02/Code.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### TensorFlow installation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`pip3 install tensorflow`\n",
"\n",
"or\n",
"\n",
"`pip3 install tensorflow-gpu`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### OpenAI Gym installation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On OSX: \n",
"\n",
"`brew install cmake boost boost-python sdl2 swig wget`\n",
" \n",
"On Ubuntu 16.04:\n",
"\n",
"`apt-get install -y python-pyglet python3-opengl zlib1g-dev libjpeg-dev patchelf cmake swig libboost-all-dev libsdl2-dev libosmesa6-dev xvfb ffmpeg`\n",
"\n",
"On Ubuntu 18.04\n",
"\n",
"`sudo apt install -y python3-dev zlib1g-dev libjpeg-dev cmake swig python-pyglet python3-opengl libboost-all-dev libsdl2-dev libosmesa6-dev patchelf ffmpeg xvfb `\n",
"\n",
"Then:\n",
"\n",
"```\n",
"git clone https://github.com/openai/gym.git \n",
"\n",
"cd gym\n",
"\n",
"pip install -e '.[all]'\n",
"```\n",
"\n",
"PyBox2D:\n",
"\n",
"```\n",
"git clone https://github.com/pybox2d/pybox2d\n",
"cd pybox2d\n",
"pip3 install -e .\n",
"```\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Duckietown installation\n",
"\n",
"```\n",
"git clone https://github.com/duckietown/gym-duckietown.git\n",
"cd gym-duckietown\n",
"pip3 install -e .\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Roboschool installation\n",
"\n",
"```\n",
"git clone https://github.com/openai/roboschool\n",
"cd roboschool\n",
"ROBOSCHOOL_PATH=`pwd`\n",
"git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision\n",
"mkdir bullet3/build\n",
"cd bullet3/build\n",
"cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..\n",
"\n",
"make -j4\n",
"make install\n",
"cd ../..\n",
"pip3 install -e $ROBOSCHOOL_PATH\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RL cycle"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.\u001b[0m\n"
]
}
],
"source": [
"import gym\n",
"\n",
"# create the environment \n",
"env = gym.make(\"CartPole-v1\")\n",
"# reset the environment before starting\n",
"env.reset()\n",
"\n",
"# loop 10 times\n",
"for i in range(10):\n",
" # take a random action\n",
" env.step(env.action_space.sample())\n",
" # render the game\n",
" env.render()\n",
"\n",
"# close the environment\n",
"env.close()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.\u001b[0m\n",
"Episode 0 finished, reward:15\n",
"Episode 1 finished, reward:13\n",
"Episode 2 finished, reward:20\n",
"Episode 3 finished, reward:22\n",
"Episode 4 finished, reward:13\n",
"Episode 5 finished, reward:18\n",
"Episode 6 finished, reward:15\n",
"Episode 7 finished, reward:12\n",
"Episode 8 finished, reward:58\n",
"Episode 9 finished, reward:15\n"
]
}
],
"source": [
"import gym\n",
"\n",
"# create and initialize the environment\n",
"env = gym.make(\"CartPole-v1\")\n",
"env.reset()\n",
"\n",
"# play 10 games\n",
"for i in range(10):\n",
" # initialize the variables\n",
" done = False\n",
" game_rew = 0\n",
"\n",
" while not done:\n",
" # choose a random action\n",
" action = env.action_space.sample()\n",
" # take a step in the environment\n",
" new_obs, rew, done, info = env.step(action)\n",
" game_rew += rew\n",
" \n",
" # when is done, print the cumulative reward of the game and reset the environment\n",
" if done:\n",
" print('Episode %d finished, reward:%d' % (i, game_rew))\n",
" env.reset()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.\u001b[0m\n",
"Box(4,)\n"
]
}
],
"source": [
"import gym\n",
"\n",
"env = gym.make('CartPole-v1')\n",
"print(env.observation_space)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Discrete(2)\n"
]
}
],
"source": [
"print(env.action_space)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"0\n",
"0\n"
]
}
],
"source": [
"print(env.action_space.sample())\n",
"print(env.action_space.sample())\n",
"print(env.action_space.sample())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]\n"
]
}
],
"source": [
"print(env.observation_space.low)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]\n"
]
}
],
"source": [
"print(env.observation_space.high)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TensorFlow"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\andrea\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\h5py\\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
" from ._conv import register_converters as _register_converters\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(\"add:0\", shape=(), dtype=int32)\n",
"7\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"\n",
"# create two constants: a and b\n",
"a = tf.constant(4)\n",
"b = tf.constant(3)\n",
"\n",
"# perform a computation\n",
"c = a + b\n",
"print(c) # print the shape of c\n",
"\n",
"# create a session\n",
"session = tf.Session()\n",
"# run the session. It compute the sum\n",
"res = session.run(c)\n",
"print(res) # print the actual result"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# reset the graph\n",
"tf.reset_default_graph()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tensor"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"()\n"
]
}
],
"source": [
"a = tf.constant(1)\n",
"print(a.shape)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5,)\n"
]
}
],
"source": [
"# array of five elements\n",
"b = tf.constant([1,2,3,4,5])\n",
"print(b.shape)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 2 3]\n"
]
}
],
"source": [
"#NB: a can be of any type of tensor\n",
"a = tf.constant([1,2,3,4,5])\n",
"first_three_elem = a[:3]\n",
"fourth_elem = a[3]\n",
"\n",
"sess = tf.Session()\n",
"print(sess.run(first_three_elem))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4\n"
]
}
],
"source": [
"print(sess.run(fourth_elem))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Constant"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(\"a_const:0\", shape=(4,), dtype=float32)\n"
]
}
],
"source": [
"a = tf.constant([1.0, 1.1, 2.1, 3.1], dtype=tf.float32, name='a_const')\n",
"print(a)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Placeholder"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[10.1 10.2 10.3]]\n"
]
}
],
"source": [
"a = tf.placeholder(shape=(1,3), dtype=tf.float32)\n",
"b = tf.constant([[10,10,10]], dtype=tf.float32)\n",
"\n",
"c = a + b\n",
"\n",
"sess = tf.Session()\n",
"res = sess.run(c, feed_dict={a:[[0.1,0.2,0.3]]})\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(\"Placeholder:0\", shape=(?, 3), dtype=float32)\n",
"[[10.1 10.2 10.3]]\n",
"[[7. 7. 7.]\n",
" [7. 7. 7.]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# NB: the fist dimension is 'None', meaning that it can be of any lenght\n",
"a = tf.placeholder(shape=(None,3), dtype=tf.float32)\n",
"b = tf.placeholder(shape=(None,3), dtype=tf.float32)\n",
"\n",
"c = a + b\n",
"\n",
"print(a)\n",
"\n",
"sess = tf.Session()\n",
"print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))\n",
"\n",
"v_a = np.array([[1,2,3],[4,5,6]])\n",
"v_b = np.array([[6,5,4],[3,2,1]])\n",
"print(sess.run(c, feed_dict={a:v_a, b:v_b}))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[10.1 10.2 10.3]]\n"
]
}
],
"source": [
"sess = tf.Session()\n",
"print(sess.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Variable"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.4478302 0.7014905 0.36300516]]\n",
"[[4 5]]\n"
]
}
],
"source": [
"tf.reset_default_graph()\n",
"\n",
"# variable initialized using the glorot uniform initializer\n",
"var = tf.get_variable(\"first_variable\", shape=[1,3], dtype=tf.float32, initializer=tf.glorot_uniform_initializer)\n",
"\n",
"# variable initialized with constant values\n",
"init_val = np.array([4,5])\n",
"var2 = tf.get_variable(\"second_variable\", shape=[1,2], dtype=tf.int32, initializer=tf.constant_initializer(init_val))\n",
"\n",
"# create the session\n",
"sess = tf.Session()\n",
"# initialize all the variables\n",
"sess.run(tf.global_variables_initializer())\n",
"\n",
"print(sess.run(var))\n",
"\n",
"print(sess.run(var2))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# not trainable variable\n",
"var2 = tf.get_variable(\"variable\", shape=[1,2], trainable=False, dtype=tf.int32)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[<tf.Variable 'first_variable:0' shape=(1, 3) dtype=float32_ref>, <tf.Variable 'second_variable:0' shape=(1, 2) dtype=int32_ref>, <tf.Variable 'variable:0' shape=(1, 2) dtype=int32_ref>]\n"
]
}
],
"source": [
"print(tf.global_variables())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Graph"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.015899599"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tf.reset_default_graph()\n",
"\n",
"const1 = tf.constant(3.0, name='constant1')\n",
"\n",
"var = tf.get_variable(\"variable1\", shape=[1,2], dtype=tf.float32)\n",
"var2 = tf.get_variable(\"variable2\", shape=[1,2], trainable=False, dtype=tf.float32)\n",
"\n",
"op1 = const1 * var\n",
"op2 = op1 + var2\n",
"op3 = tf.reduce_mean(op2)\n",
"\n",
"sess = tf.Session()\n",
"sess.run(tf.global_variables_initializer())\n",
"sess.run(op3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Simple Linear Regression Example\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n",
"Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n",
"Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n",
"Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n",
"Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n",
"Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n",
"Final weight: 0.500, bias: 1.473\n"
]
}
],
"source": [
"tf.reset_default_graph()\n",
"\n",
"np.random.seed(10)\n",
"tf.set_random_seed(10)\n",
"\n",
"W, b = 0.5, 1.4\n",
"# create a dataset of 100 examples\n",
"X = np.linspace(0,100, num=100)\n",
"# add random noise to the y labels\n",
"y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n",
"\n",
"# create the placeholders\n",
"x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n",
"y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n",
"\n",
"# create the variables.\n",
"v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n",
"v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n",
"\n",
"# linear computation\n",
"out = v_weight * x_ph + v_bias\n",
"\n",
"# compute the Mean Squared Error\n",
"loss = tf.reduce_mean((out - y_ph)**2)\n",
"\n",
"# optimizer\n",
"opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n",
"\n",
"# create the session\n",
"session = tf.Session()\n",
"session.run(tf.global_variables_initializer())\n",
"\n",
"# loop to train the parameters\n",
"for ep in range(210):\n",
" # run the optimizer and get the loss\n",
" train_loss, _ = session.run([loss, opt], feed_dict={x_ph:X, y_ph:y})\n",
" \n",
" # print epoch number and loss\n",
" if ep % 40 == 0:\n",
" print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n",
" \n",
"print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### .. with TensorBoard"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch: 0, MSE: 4617.4390, W: 1.295, b: -0.407\n",
"Epoch: 40, MSE: 5.3334, W: 0.496, b: -0.727\n",
"Epoch: 80, MSE: 4.5894, W: 0.529, b: -0.012\n",
"Epoch: 120, MSE: 4.1029, W: 0.512, b: 0.608\n",
"Epoch: 160, MSE: 3.8552, W: 0.506, b: 1.092\n",
"Epoch: 200, MSE: 3.7597, W: 0.501, b: 1.418\n",
"Final weight: 0.500, bias: 1.473\n"
]
}
],
"source": [
"from datetime import datetime\n",
"\n",
"tf.reset_default_graph()\n",
"\n",
"np.random.seed(10)\n",
"tf.set_random_seed(10)\n",
"\n",
"W, b = 0.5, 1.4\n",
"# create a dataset of 100 examples\n",
"X = np.linspace(0,100, num=100)\n",
"# add random noise to the y labels\n",
"y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))\n",
"\n",
"# create the placeholders\n",
"x_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n",
"y_ph = tf.placeholder(shape=[None,], dtype=tf.float32)\n",
"\n",
"# create the variables.\n",
"v_weight = tf.get_variable(\"weight\", shape=[1], dtype=tf.float32)\n",
"v_bias = tf.get_variable(\"bias\", shape=[1], dtype=tf.float32)\n",
"\n",
"# linear computation\n",
"out = v_weight * x_ph + v_bias\n",
"\n",
"# compute the Mean Squared Error\n",
"loss = tf.reduce_mean((out - y_ph)**2)\n",
"\n",
"# optimizer\n",
"opt = tf.train.AdamOptimizer(0.4).minimize(loss)\n",
"\n",
"\n",
"tf.summary.scalar('MSEloss', loss)\n",
"tf.summary.histogram('model_weight', v_weight)\n",
"tf.summary.histogram('model_bias', v_bias)\n",
"all_summary = tf.summary.merge_all()\n",
"\n",
"now = datetime.now()\n",
"clock_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)\n",
"file_writer = tf.summary.FileWriter('log_dir/'+clock_time, tf.get_default_graph())\n",
"\n",
"\n",
"# create the session\n",
"session = tf.Session()\n",
"session.run(tf.global_variables_initializer())\n",
"\n",
"# loop to train the parameters\n",
"for ep in range(210):\n",
" # run the optimizer and get the loss\n",
" train_loss, _, train_summary = session.run([loss, opt, all_summary], feed_dict={x_ph:X, y_ph:y})\n",
" file_writer.add_summary(train_summary, ep)\n",
" \n",
" # print epoch number and loss\n",
" if ep % 40 == 0:\n",
" print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, session.run(v_weight), session.run(v_bias)))\n",
" \n",
"print('Final weight: %.3f, bias: %.3f' % (session.run(v_weight), session.run(v_bias)))\n",
"file_writer.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
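Note: the last two notebook cells write TensorBoard event files under log_dir/. Assuming TensorBoard was installed together with TensorFlow 1.x, those summaries can be inspected by running `tensorboard --logdir log_dir` from the notebook's working directory and opening the URL the command prints.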
================================================
FILE: Chapter03/frozenlake8x8_policyiteration.py
================================================
import numpy as np
import gym
def eval_state_action(V, s, a, gamma=0.99):
return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]])
def policy_evaluation(V, policy, eps=0.0001):
'''
Policy evaluation. Update the value function until it reaches a steady state
'''
while True:
delta = 0
# loop over all states
for s in range(nS):
old_v = V[s]
# update V[s] using the Bellman equation
V[s] = eval_state_action(V, s, policy[s])
delta = max(delta, np.abs(old_v - V[s]))
if delta < eps:
break
def policy_improvement(V, policy):
'''
Policy improvement. Update the policy based on the value function
'''
policy_stable = True
for s in range(nS):
old_a = policy[s]
# update the policy with the action that leads to the highest state value
policy[s] = np.argmax([eval_state_action(V, s, a) for a in range(nA)])
if old_a != policy[s]:
policy_stable = False
return policy_stable
def run_episodes(env, policy, num_games=100):
'''
Run some games to test a policy
'''
tot_rew = 0
state = env.reset()
for _ in range(num_games):
done = False
while not done:
# select the action according to the policy
next_state, reward, done, _ = env.step(policy[state])
state = next_state
tot_rew += reward
if done:
state = env.reset()
print('Won %i of %i games!'%(tot_rew, num_games))
if __name__ == '__main__':
# create the environment
env = gym.make('FrozenLake-v0')
# unwrap it to access additional information (e.g. the transition probabilities)
env = env.unwrapped
# spaces dimension
nA = env.action_space.n
nS = env.observation_space.n
# initializing value function and policy
V = np.zeros(nS)
policy = np.zeros(nS)
# some useful variables
policy_stable = False
it = 0
while not policy_stable:
policy_evaluation(V, policy)
policy_stable = policy_improvement(V, policy)
it += 1
print('Converged after %i policy iterations'%(it))
run_episodes(env, policy)
print(V.reshape((4,4)))
print(policy.reshape((4,4)))
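# A minimal numeric sketch (not part of the original script) of the Bellman expectation backup
# that eval_state_action() above computes, using a hypothetical transition list in the same
# (prob, next_state, reward, done) format returned by env.P[s][a]:
example_V = [0.0, 0.5, 1.0]                    # hypothetical state values
example_transitions = [(0.8, 1, 0.0, False),   # 80%: land in state 1, no reward
                       (0.2, 2, 1.0, True)]    # 20%: land in state 2, reward 1
example_gamma = 0.99
example_q = sum(p * (rew + example_gamma * example_V[next_s])
                for p, next_s, rew, _ in example_transitions)
print(example_q)  # 0.8*(0.0 + 0.99*0.5) + 0.2*(1.0 + 0.99*1.0) = 0.794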
================================================
FILE: Chapter03/frozenlake8x8_valueiteration.py
================================================
import numpy as np
import gym
def eval_state_action(V, s, a, gamma=0.99):
return np.sum([p * (rew + gamma*V[next_s]) for p, next_s, rew, _ in env.P[s][a]])
def value_iteration(eps=0.0001):
'''
Value iteration algorithm
'''
V = np.zeros(nS)
it = 0
while True:
delta = 0
# update the value of each state, using the max operator as the "policy"
for s in range(nS):
old_v = V[s]
V[s] = np.max([eval_state_action(V, s, a) for a in range(nA)])
delta = max(delta, np.abs(old_v - V[s]))
if delta < eps:
break
else:
print('Iter:', it, ' delta:', np.round(delta, 5))
it += 1
return V
def run_episodes(env, V, num_games=100):
'''
Run some test games
'''
tot_rew = 0
state = env.reset()
for _ in range(num_games):
done = False
while not done:
action = np.argmax([eval_state_action(V, state, a) for a in range(nA)])
next_state, reward, done, _ = env.step(action)
state = next_state
tot_rew += reward
if done:
state = env.reset()
print('Won %i of %i games!'%(tot_rew, num_games))
if __name__ == '__main__':
# create the environment
env = gym.make('FrozenLake-v0')
# unwrap it to access additional information (e.g. the transition probabilities)
env = env.unwrapped
# spaces dimension
nA = env.action_space.n
nS = env.observation_space.n
# Value iteration
V = value_iteration(eps=0.0001)
# test the value function on 100 games
run_episodes(env, V, 100)
# print the state values
print(V.reshape((4,4)))
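# A short note (not part of the original script): value_iteration() above differs from the
# policy iteration script only in the backup it applies. Policy evaluation backs up the action
# chosen by the current policy,
#     V(s) <- sum_p p * (rew + gamma * V(next_s))    with a = policy[s],
# while value iteration applies the Bellman optimality backup directly,
#     V(s) <- max_a sum_p p * (rew + gamma * V(next_s)),
# so no explicit policy improvement step is needed; the greedy action is recovered at test
# time by run_episodes() through np.argmax over eval_state_action.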
================================================
FILE: Chapter04/SARSA Q_learning Taxi-v2.py
================================================
import numpy as np
import gym
def eps_greedy(Q, s, eps=0.1):
'''
Epsilon greedy policy
'''
if np.random.uniform(0,1) < eps:
# Choose a random action
return np.random.randint(Q.shape[1])
else:
# Choose the action of a greedy policy
return greedy(Q, s)
def greedy(Q, s):
'''
Greedy policy
return the index corresponding to the maximum state-action value
'''
return np.argmax(Q[s])
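# A minimal numeric sketch (not part of the original script) of the behaviour of eps_greedy()
# above: with eps=0.1 and 4 actions, a uniformly random action is taken with probability 0.1,
# so the greedy action is selected with probability (1 - 0.1) + 0.1/4 = 0.925 and each of the
# other three actions with probability 0.1/4 = 0.025.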
def run_episodes(env, Q, num_episodes=100, to_print=False):
'''
Run some episodes to test the policy
'''
tot_rew = []
state = env.reset()
for _ in range(num_episodes):
done = False
game_rew = 0
while not done:
# select a greedy action
next_state, rew, done, _ = env.step(greedy(Q, state))
state = next_state
game_rew += rew
if done:
state = env.reset()
tot_rew.append(game_rew)
if to_print:
print('Mean score: %.3f of %i games!'%(np.mean(tot_rew), num_episodes))
return np.mean(tot_rew)
def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005):
nA = env.action_space.n
nS = env.observation_space.n
# Initialize the Q matrix
# Q: matrix nS*nA where each row represents a state and each column represents a different action
Q = np.zeros((nS, nA))
games_reward = []
test_rewards = []
for ep in range(num_episodes):
state = env.reset()
done = False
tot_rew = 0
# decay the epsilon value until it reaches the threshold of 0.01
if eps > 0.01:
eps -= eps_decay
# loop the main body until the environment stops
while not done:
# select an action following the eps-greedy policy
action = eps_greedy(Q, state, eps)
next_state, rew, done, _ = env.step(action) # Take one step in the environment
# Q-learning update of the state-action value (using the max Q-value of the next state)
Q[state][action] = Q[state][action] + lr*(rew + gamma*np.max(Q[next_state]) - Q[state][action])
state = next_state
tot_rew += rew
if done:
games_reward.append(tot_rew)
# Test the policy every 300 episodes and print the results
if (ep % 300) == 0:
test_rew = run_episodes(env, Q, 1000)
print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew))
test_rewards.append(test_rew)
return Q
def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005):
nA = env.action_space.n
nS = env.observation_space.n
# Initialize the Q matrix
# Q: matrix nS*nA where each row represents a state and each column represents a different action
Q = np.zeros((nS, nA))
games_reward = []
test_rewards = []
for ep in range(num_episodes):
state = env.reset()
done = False
tot_rew = 0
# decay the epsilon value until it reaches the threshold of 0.01
if eps > 0.01:
eps -= eps_decay
action = eps_greedy(Q, state, eps)
# loop the main body until the environment stops
while not done:
next_state, rew, done, _ = env.step(action) # Take one step in the environment
# choose the next action (needed for the SARSA update)
next_action = eps_greedy(Q, next_state, eps)
# SARSA update
Q[state][action] = Q[state][action] + lr*(rew + gamma*Q[next_state][next_action] - Q[state][action])
state = next_state
action = next_action
tot_rew += rew
if done:
games_reward.append(tot_rew)
# Test the policy every 300 episodes and print the results
if (ep % 300) == 0:
test_rew = run_episodes(env, Q, 1000)
print("Episode:{:5d} Eps:{:2.4f} Rew:{:2.4f}".format(ep, eps, test_rew))
test_rewards.append(test_rew)
return Q
if __name__ == '__main__':
env = gym.make('Taxi-v2')
Q_qlearning = Q_learning(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)
Q_sarsa = SARSA(env, lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)
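# A minimal numeric sketch (not part of the original script) contrasting the two update targets
# above, with gamma=0.95, reward rew=1 and hypothetical next-state values Q[next_state]=[0.2, 0.8]:
#   Q-learning target: rew + gamma*np.max(Q[next_state])       = 1 + 0.95*0.8 = 1.76
#   SARSA target:      rew + gamma*Q[next_state][next_action]  = 1 + 0.95*0.2 = 1.19  (if next_action=0)
# Both targets are then blended into Q[state][action] with the learning rate lr.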
================================================
FILE: Chapter05/.ipynb_checkpoints/Untitled-checkpoint.ipynb
================================================
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: Chapter05/DQN_Atari.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
from collections import deque
import time
import sys
from atari_wrappers import make_env
gym.logger.set_level(40)
current_milli_time = lambda: int(round(time.time() * 1000))
def cnn(x):
'''
Convolutional neural network
'''
x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu')
x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu')
return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu')
def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None):
'''
Feed-forward neural network
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None):
'''
Deep Q network: CNN followed by FNN
'''
x = cnn(x)
x = tf.layers.flatten(x)
return fnn(x, hidden_layers, output_size, fnn_activation, last_activation)
class ExperienceBuffer():
'''
Experience Replay Buffer
'''
def __init__(self, buffer_size):
self.obs_buf = deque(maxlen=buffer_size)
self.rew_buf = deque(maxlen=buffer_size)
self.act_buf = deque(maxlen=buffer_size)
self.obs2_buf = deque(maxlen=buffer_size)
self.done_buf = deque(maxlen=buffer_size)
def add(self, obs, rew, act, obs2, done):
# Add a new transition to the buffers
self.obs_buf.append(obs)
self.rew_buf.append(rew)
self.act_buf.append(act)
self.obs2_buf.append(obs2)
self.done_buf.append(done)
def sample_minibatch(self, batch_size):
# Sample a minibatch of size batch_size
mb_indices = np.random.randint(len(self.obs_buf), size=batch_size)
mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices])
mb_rew = [self.rew_buf[i] for i in mb_indices]
mb_act = [self.act_buf[i] for i in mb_indices]
mb_obs2 = scale_frames([self.obs2_buf[i] for i in mb_indices])
mb_done = [self.done_buf[i] for i in mb_indices]
return mb_obs, mb_rew, mb_act, mb_obs2, mb_done
def __len__(self):
return len(self.obs_buf)
def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
'''
Calculate the target value y for each transition
'''
max_av = np.max(av, axis=1)
# if the episode terminates, y takes the value r
# otherwise, q-learning step
ys = []
for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av):
if d:
ys.append(r)
else:
q_step = r + discounted_value * av
ys.append(q_step)
assert len(ys) == len(mini_batch_rw)
return ys
def greedy(action_values):
'''
Greedy policy
'''
return np.argmax(action_values)
def eps_greedy(action_values, eps=0.1):
'''
Eps-greedy policy
'''
if np.random.uniform(0,1) < eps:
# Choose a uniform random action
return np.random.randint(len(action_values))
else:
# Choose the greedy action
return np.argmax(action_values)
def test_agent(env_test, agent_op, num_games=20):
'''
Test an agent
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
# Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy)
# Needed because Atari envs are deterministic
# If you used a greedy policy, the results would always be the same
a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05)
o, r, d, _ = env_test.step(a)
game_r += r
games_r.append(game_r)
return games_r
def scale_frames(frames):
'''
Scale the frames to numbers between 0 and 1
'''
return np.array(frames, dtype=np.float32) / 255.0
def DQN(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000,
batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000):
# Create the environment both for train and test
env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
env_test = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
# Add a monitor to the test env to store the videos
env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0)
tf.reset_default_graph()
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
# Create all the placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
# Create the target network
with tf.variable_scope('target_network'):
target_qv = qnet(obs_ph, hidden_sizes, act_dim)
target_vars = tf.trainable_variables()
# Create the online network (i.e. the behavior policy)
with tf.variable_scope('online_network'):
online_qv = qnet(obs_ph, hidden_sizes, act_dim)
train_vars = tf.trainable_variables()
# Update the target network by assigning to it the variables of the online network
# Note that the target network and the online network have the same exact architecture
update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))]
update_target_op = tf.group(*update_target)
# One hot encoding of the action
act_onehot = tf.one_hot(act_ph, depth=act_dim)
# We are interested only in the Q-values of those actions
q_values = tf.reduce_sum(act_onehot * online_qv, axis=1)
# MSE loss function
v_loss = tf.reduce_mean((y_ph - q_values)**2)
# Adam optimizer that minimizes the loss v_loss
v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss)
def agent_op(o):
'''
Forward pass through the online network to obtain the Q-values of a single observation
'''
# Scale the frames
o = scale_frames(o)
return sess.run(online_qv, feed_dict={obs_ph:[o]})
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
print('Time:', clock_time)
mr_v = tf.Variable(0.0)
ml_v = tf.Variable(0.0)
# TensorBoard summaries
tf.summary.scalar('v_loss', v_loss)
tf.summary.scalar('Q-value', tf.reduce_mean(q_values))
tf.summary.histogram('Q-values', q_values)
scalar_summary = tf.summary.merge_all()
reward_summary = tf.summary.scalar('test_rew', mr_v)
mean_loss_summary = tf.summary.scalar('mean_loss', ml_v)
LOG_DIR = 'log_dir/'+env_name
hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}" .format(lr, update_target_net, update_freq, frames_num)
# initialize the File Writer for writing TensorBoard summaries
file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# open a session
sess = tf.Session()
# and initialize all the variables
sess.run(tf.global_variables_initializer())
render_the_game = False
step_count = 0
last_update_loss = []
ep_time = current_milli_time()
batch_rew = []
old_step_count = 0
obs = env.reset()
# Initialize the experience buffer
buffer = ExperienceBuffer(buffer_size)
# Copy the online network in the target network
sess.run(update_target_op)
########## EXPLORATION INITIALIZATION ######
eps = start_explor
eps_decay = (start_explor - end_explor) / explor_steps
for ep in range(num_epochs):
g_rew = 0
done = False
# Loop until the episode ends
while not done:
# Epsilon decay
if eps > end_explor:
eps -= eps_decay
# Choose an eps-greedy action
act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps)
# execute the action in the environment
obs2, rew, done, _ = env.step(act)
# Render the game if you want to
if render_the_game:
env.render()
# Add the transition to the replay buffer
buffer.add(obs, rew, act, obs2, done)
obs = obs2
g_rew += rew
step_count += 1
################ TRAINING ###############
# If it's time to train the network:
if len(buffer) > min_buffer_size and (step_count % update_freq == 0):
# sample a minibatch from the buffer
mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2})
y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount)
# TRAINING STEP
# optimize, compute the loss and return the TB summary
train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act})
# Add the train summary to the file_writer
file_writer.add_summary(train_summary, step_count)
last_update_loss.append(train_loss)
# Every update_target_net steps, update the target network
if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0):
# run the session to update the target network and get the mean loss summary
_, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)})
file_writer.add_summary(train_summary, step_count)
last_update_loss = []
# If the environment is ended, reset it and initialize the variables
if done:
obs = env.reset()
batch_rew.append(g_rew)
g_rew, render_the_game = 0, False
# every test_frequency episodes, test the agent and write some stats in TensorBoard
if ep % test_frequency == 0:
# Test the agent on 10 games
test_rw = test_agent(env_test, agent_op, num_games=10)
# Run the test stats and add them to the file_writer
test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)})
file_writer.add_summary(test_summary, step_count)
# Print some useful stats
ep_sec_time = int((current_milli_time()-ep_time) / 1000)
print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' %
(ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency))
ep_time = current_milli_time()
batch_rew = []
old_step_count = step_count
if ep % render_cycle == 0:
render_the_game = True
file_writer.close()
env.close()
if __name__ == '__main__':
DQN('PongNoFrameskip-v4', hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32,
update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000)
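# A minimal numeric sketch (not part of the original script) of the target that
# q_target_values() above produces for a hypothetical minibatch of two transitions,
# with discount=0.99:
#   transition 1: rew=1.0,  done=False, max_a' Q_target(s',a')=2.0  ->  y = 1.0 + 0.99*2.0 = 2.98
#   transition 2: rew=-1.0, done=True                               ->  y = -1.0 (no bootstrap on terminal states)
# The online network is then regressed towards these targets through the MSE loss v_loss.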
================================================
FILE: Chapter05/DQN_variations_Atari.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
from collections import deque
import time
import sys
from atari_wrappers import make_env
gym.logger.set_level(40)
current_milli_time = lambda: int(round(time.time() * 1000))
def cnn(x):
'''
Convolutional neural network
'''
x = tf.layers.conv2d(x, filters=16, kernel_size=8, strides=4, padding='valid', activation='relu')
x = tf.layers.conv2d(x, filters=32, kernel_size=4, strides=2, padding='valid', activation='relu')
return tf.layers.conv2d(x, filters=32, kernel_size=3, strides=1, padding='valid', activation='relu')
def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None):
'''
Feed-forward neural network
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None):
'''
Deep Q network: CNN followed by FNN
'''
x = cnn(x)
x = tf.layers.flatten(x)
return fnn(x, hidden_layers, output_size, fnn_activation, last_activation)
def greedy(action_values):
'''
Greedy policy
'''
return np.argmax(action_values)
def eps_greedy(action_values, eps=0.1):
'''
Eps-greedy policy
'''
if np.random.uniform(0,1) < eps:
# Choose a uniform random action
return np.random.randint(len(action_values))
else:
# Choose the greedy action
return np.argmax(action_values)
def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
'''
Calculate the target value y for each transition
'''
max_av = np.max(av, axis=1)
# if the episode terminates, y takes the value r
# otherwise, q-learning step
ys = []
for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av):
if d:
ys.append(r)
else:
q_step = r + discounted_value * av
ys.append(q_step)
assert len(ys) == len(mini_batch_rw)
return ys
def test_agent(env_test, agent_op, num_games=20):
'''
Test an agent
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
# Use an eps-greedy policy with eps=0.05 (to add stochasticity to the policy)
# Needed because Atari envs are deterministic
# If you used a greedy policy, the results would always be the same
a = eps_greedy(np.squeeze(agent_op(o)), eps=0.05)
o, r, d, _ = env_test.step(a)
game_r += r
games_r.append(game_r)
return games_r
def scale_frames(frames):
'''
Scale the frames to numbers between 0 and 1
'''
return np.array(frames, dtype=np.float32) / 255.0
def dueling_qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_activation=None):
'''
Dueling neural network
'''
x = cnn(x)
x = tf.layers.flatten(x)
qf = fnn(x, hidden_layers, 1, fnn_activation, last_activation)
aaqf = fnn(x, hidden_layers, output_size, fnn_activation, last_activation)
return qf + aaqf - tf.reduce_mean(aaqf)
def double_q_target_values(mini_batch_rw, mini_batch_done, target_qv, online_qv, discounted_value):
'''
Calculate the target value y following the double Q-learning update
'''
argmax_online_qv = np.argmax(online_qv, axis=1)
# if the episode terminates, y takes the value r
# otherwise, q-learning step
ys = []
assert len(mini_batch_rw) == len(mini_batch_done) == len(target_qv) == len(argmax_online_qv)
for r, d, t_av, arg_a in zip(mini_batch_rw, mini_batch_done, target_qv, argmax_online_qv):
if d:
ys.append(r)
else:
q_value = r + discounted_value * t_av[arg_a]
ys.append(q_value)
assert len(ys) == len(mini_batch_rw)
return ys
class MultiStepExperienceBuffer():
'''
Experience Replay Buffer for multi-step learning
'''
def __init__(self, buffer_size, n_step, gamma):
self.obs_buf = deque(maxlen=buffer_size)
self.act_buf = deque(maxlen=buffer_size)
self.n_obs_buf = deque(maxlen=buffer_size)
self.n_done_buf = deque(maxlen=buffer_size)
self.n_rew_buf = deque(maxlen=buffer_size)
self.n_step = n_step
self.last_rews = deque(maxlen=self.n_step+1)
self.gamma = gamma
def add(self, obs, rew, act, obs2, done):
self.obs_buf.append(obs)
self.act_buf.append(act)
# the following buffers will be updated in the next n_step steps
# their values are not known, yet
self.n_obs_buf.append(None)
self.n_rew_buf.append(None)
self.n_done_buf.append(None)
self.last_rews.append(rew)
ln = len(self.obs_buf)
len_rews = len(self.last_rews)
# Update the indices of the buffer that are n_steps old
if done:
# In case it's the last step, update up to the last n_step indices of the buffer
# it cannot update more than len(last_rews) entries, otherwise it would overwrite the previous trajectory
for i in range(len_rews):
self.n_obs_buf[ln-(len_rews-i-1)-1] = obs2
self.n_done_buf[ln-(len_rews-i-1)-1] = done
rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[i:len_rews])])
self.n_rew_buf[ln-(len_rews-i-1)-1] = rgt
# reset the reward deque
self.last_rews = deque(maxlen=self.n_step+1)
else:
# Update the element of the buffer that was added n_step steps ago
# Only update once enough rewards have accumulated to compute the multi-step return
if len(self.last_rews) >= (self.n_step+1):
self.n_obs_buf[ln-self.n_step-1] = obs2
self.n_done_buf[ln-self.n_step-1] = done
rgt = np.sum([(self.gamma**k)*r for k,r in enumerate(np.array(self.last_rews)[:len_rews])])
self.n_rew_buf[ln-self.n_step-1] = rgt
def sample_minibatch(self, batch_size):
# Sample a minibatch of size batch_size
# Note: the sampled transitions must be at least n_step steps old
mb_indices = np.random.randint(len(self.obs_buf)-self.n_step, size=batch_size)
mb_obs = scale_frames([self.obs_buf[i] for i in mb_indices])
mb_rew = [self.n_rew_buf[i] for i in mb_indices]
mb_act = [self.act_buf[i] for i in mb_indices]
mb_obs2 = scale_frames([self.n_obs_buf[i] for i in mb_indices])
mb_done = [self.n_done_buf[i] for i in mb_indices]
return mb_obs, mb_rew, mb_act, mb_obs2, mb_done
def __len__(self):
return len(self.obs_buf)
def DQN_with_variations(env_name, extensions_hyp, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_size=100000, discount=0.99, render_cycle=100, update_target_net=1000,
batch_size=64, update_freq=4, frames_num=2, min_buffer_size=5000, test_frequency=20, start_explor=1, end_explor=0.1, explor_steps=100000):
# Create the environment both for train and test
env = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
env_test = make_env(env_name, frames_num=frames_num, skip_frames=True, noop_num=20)
# Add a monitor to the test env to store the videos
env_test = gym.wrappers.Monitor(env_test, "VIDEOS/TEST_VIDEOS"+env_name+str(current_milli_time()),force=True, video_callable=lambda x: x%20==0)
tf.reset_default_graph()
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
# Create all the placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
# Create the target network
with tf.variable_scope('target_network'):
if extensions_hyp['dueling']:
target_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
else:
target_qv = qnet(obs_ph, hidden_sizes, act_dim)
target_vars = tf.trainable_variables()
# Create the online network (i.e. the behavior policy)
with tf.variable_scope('online_network'):
if extensions_hyp['dueling']:
online_qv = dueling_qnet(obs_ph, hidden_sizes, act_dim)
else:
online_qv = qnet(obs_ph, hidden_sizes, act_dim)
train_vars = tf.trainable_variables()
# Update the target network by assigning to it the variables of the online network
# Note that the target network and the online network have the same exact architecture
update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))]
update_target_op = tf.group(*update_target)
# One hot encoding of the action
act_onehot = tf.one_hot(act_ph, depth=act_dim)
# We are interested only in the Q-values of those actions
q_values = tf.reduce_sum(act_onehot * online_qv, axis=1)
# MSE loss function
v_loss = tf.reduce_mean((y_ph - q_values)**2)
# Adam optimizer that minimizes the loss v_loss
v_opt = tf.train.AdamOptimizer(lr).minimize(v_loss)
def agent_op(o):
'''
Forward pass through the online network to obtain the Q-values of a single observation
'''
# Scale the frames
o = scale_frames(o)
return sess.run(online_qv, feed_dict={obs_ph:[o]})
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
print('Time:', clock_time)
mr_v = tf.Variable(0.0)
ml_v = tf.Variable(0.0)
# TensorBoard summaries
tf.summary.scalar('v_loss', v_loss)
tf.summary.scalar('Q-value', tf.reduce_mean(q_values))
tf.summary.histogram('Q-values', q_values)
scalar_summary = tf.summary.merge_all()
reward_summary = tf.summary.scalar('test_rew', mr_v)
mean_loss_summary = tf.summary.scalar('mean_loss', ml_v)
LOG_DIR = 'log_dir/'+env_name
hyp_str = "-lr_{}-upTN_{}-upF_{}-frms_{}-ddqn_{}-duel_{}-nstep_{}" \
.format(lr, update_target_net, update_freq, frames_num, extensions_hyp['DDQN'], extensions_hyp['dueling'], extensions_hyp['multi_step'])
# initialize the File Writer for writing TensorBoard summaries
file_writer = tf.summary.FileWriter(LOG_DIR+'/DQN_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# open a session
sess = tf.Session()
# and initialize all the variables
sess.run(tf.global_variables_initializer())
render_the_game = False
step_count = 0
last_update_loss = []
ep_time = current_milli_time()
batch_rew = []
old_step_count = 0
obs = env.reset()
# Initialize the experience buffer
#buffer = ExperienceBuffer(buffer_size)
buffer = MultiStepExperienceBuffer(buffer_size, extensions_hyp['multi_step'], discount)
# Copy the online network in the target network
sess.run(update_target_op)
########## EXPLORATION INITIALIZATION ######
eps = start_explor
eps_decay = (start_explor - end_explor) / explor_steps
for ep in range(num_epochs):
g_rew = 0
done = False
# Loop until the episode ends
while not done:
# Epsilon decay
if eps > end_explor:
eps -= eps_decay
# Choose an eps-greedy action
act = eps_greedy(np.squeeze(agent_op(obs)), eps=eps)
# execute the action in the environment
obs2, rew, done, _ = env.step(act)
# Render the game if you want to
if render_the_game:
env.render()
# Add the transition to the replay buffer
buffer.add(obs, rew, act, obs2, done)
obs = obs2
g_rew += rew
step_count += 1
################ TRAINING ###############
# If it's time to train the network:
if len(buffer) > min_buffer_size and (step_count % update_freq == 0):
# sample a minibatch from the buffer
mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
if extensions_hyp['DDQN']:
mb_onl_qv, mb_trg_qv = sess.run([online_qv,target_qv], feed_dict={obs_ph:mb_obs2})
y_r = double_q_target_values(mb_rew, mb_done, mb_trg_qv, mb_onl_qv, discount)
else:
mb_trg_qv = sess.run(target_qv, feed_dict={obs_ph:mb_obs2})
y_r = q_target_values(mb_rew, mb_done, mb_trg_qv, discount)
# optimize, compute the loss and return the TB summary
train_summary, train_loss, _ = sess.run([scalar_summary, v_loss, v_opt], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act})
# Add the train summary to the file_writer
file_writer.add_summary(train_summary, step_count)
last_update_loss.append(train_loss)
# Every update_target_net steps, update the target network
if (len(buffer) > min_buffer_size) and (step_count % update_target_net == 0):
# run the session to update the target network and get the mean loss summary
_, train_summary = sess.run([update_target_op, mean_loss_summary], feed_dict={ml_v:np.mean(last_update_loss)})
file_writer.add_summary(train_summary, step_count)
last_update_loss = []
# If the environment is ended, reset it and initialize the variables
if done:
obs = env.reset()
batch_rew.append(g_rew)
g_rew, render_the_game = 0, False
# every test_frequency episodes, test the agent and write some stats in TensorBoard
if ep % test_frequency == 0:
# Test the agent on 10 games
test_rw = test_agent(env_test, agent_op, num_games=10)
# Run the test stats and add them to the file_writer
test_summary = sess.run(reward_summary, feed_dict={mr_v: np.mean(test_rw)})
file_writer.add_summary(test_summary, step_count)
# Print some useful stats
ep_sec_time = int((current_milli_time()-ep_time) / 1000)
print('Ep:%4d Rew:%4.2f, Eps:%2.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d -- Ep_Steps:%d' %
(ep,np.mean(batch_rew), eps, step_count, np.mean(test_rw), np.std(test_rw), ep_sec_time, (step_count-old_step_count)/test_frequency))
ep_time = current_milli_time()
batch_rew = []
old_step_count = step_count
if ep % render_cycle == 0:
render_the_game = True
file_writer.close()
env.close()
if __name__ == '__main__':
extensions_hyp={
'DDQN':False,
'dueling':False,
'multi_step':1
}
DQN_with_variations('PongNoFrameskip-v4', extensions_hyp, hidden_sizes=[128], lr=2e-4, buffer_size=100000, update_target_net=1000, batch_size=32,
update_freq=2, frames_num=2, min_buffer_size=10000, render_cycle=10000)
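# A minimal sketch (not part of the original script) of the two extensions selectable through
# extensions_hyp above, on hypothetical values with discount=0.99:
#   Double DQN (double_q_target_values): the action is chosen by the online network and evaluated
#   by the target network, e.g. online_qv=[0.1, 0.9] -> argmax=1, target_qv=[0.5, 0.4]
#   -> y = rew + 0.99*0.4, whereas vanilla DQN would bootstrap from max(target_qv)=0.5.
#   Dueling (dueling_qnet): Q(s,a) = V(s) + A(s,a) - mean_a A(s,a), e.g. V=1.0 and A=[0.2, -0.2]
#   (mean 0) give Q=[1.2, 0.8].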
================================================
FILE: Chapter05/Untitled.ipynb
================================================
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: Chapter05/atari_wrappers.py
================================================
import numpy as np
import os
from collections import deque
import gym
from gym import spaces
import cv2
'''
Atari Wrapper copied from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
'''
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=2)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env):
"""Warp frames to 84x84 as done in the Nature paper and later work."""
gym.ObservationWrapper.__init__(self, env)
self.width = 84
self.height = 84
self.observation_space = spaces.Box(low=0, high=255,
shape=(self.height, self.width, 1), dtype=np.uint8)
def observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames=True):
env = gym.make(env_name)
if skip_frames:
env = MaxAndSkipEnv(env) ## Return only every `skip`-th frame
if fire:
env = FireResetEnv(env) ## Fire at the beginning
env = NoopResetEnv(env, noop_max=noop_num)
env = WarpFrame(env) ## Reshape image
env = FrameStack(env, frames_num) ## Stack the last frames_num frames
#env = ScaledFloatFrame(env) ## Scale frames
return env
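# A minimal usage sketch (not part of the original module), assuming gym with the Atari extras
# and opencv-python are installed; 'PongNoFrameskip-v4' and frames_num=2 mirror the values used
# in Chapter05/DQN_Atari.py.
if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4', frames_num=2)
    obs = env.reset()
    # WarpFrame resizes each frame to 84x84 grayscale and FrameStack concatenates the last
    # frames_num frames along the channel axis, so the observation is an (84, 84, 2) array.
    print(np.array(obs).shape)
    env.close()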
================================================
FILE: Chapter05/untitled
================================================
================================================
FILE: Chapter06/AC.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import time
def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_size, activation=last_activation)
def softmax_entropy(logits):
'''
Softmax entropy (note: returns the sum of p*log(p), i.e. the negative entropy)
'''
return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
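# A minimal numeric sketch (not part of the original script): for logits [0., 0.] the softmax is
# [0.5, 0.5], so softmax_entropy() returns 0.5*log(0.5) + 0.5*log(0.5) = log(0.5) ~= -0.693,
# and the `entropy` tensor defined below (with the leading minus sign) is ~= +0.693.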
def discounted_rewards(rews, last_sv, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
last_sv: value of the last state
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1] + gamma*last_sv
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
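# A minimal numeric sketch (not part of the original script) of what discounted_rewards() above
# returns for a hypothetical 3-step trajectory with rews=[1, 0, 1], last_sv=0.5, gamma=0.99:
#   rtg[2] = 1 + 0.99*0.5     = 1.495
#   rtg[1] = 0 + 0.99*1.495   = 1.48005
#   rtg[0] = 1 + 0.99*1.48005 = 2.4652495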
class Buffer():
'''
Buffer class to store the experience from a unique policy
'''
def __init__(self, gamma=0.99):
self.gamma = gamma
self.obs = []
self.act = []
self.ret = []
self.rtg = []
def store(self, temp_traj, last_sv):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
temp_traj: list where each element is a list that contains: observation, reward, action, state-value
last_sv: value of the last state (Used to Bootstrap)
'''
# store only if the temp_traj list is not empty
if len(temp_traj) > 0:
self.obs.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma)
self.ret.extend(rtg - temp_traj[:,3])
self.rtg.extend(rtg)
self.act.extend(temp_traj[:,2])
def get_batch(self):
return self.obs, self.act, self.ret, self.rtg
def __len__(self):
assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg))
return len(self.obs)
def AC(env_name, hidden_sizes=[32], ac_lr=5e-3, cr_lr=8e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100, steps_to_print=100):
'''
Actor-Critic Algorithm
Parameters:
-----------
env_name: Name of the environment
hidden_sizes: list of the number of hidden units for each layer
ac_lr: actor learning rate
cr_lr: critic learning rate
num_epochs: number of training epochs
gamma: discount factor
steps_per_epoch: number of steps per epoch
'''
tf.reset_default_graph()
env = gym.make(env_name)
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
# Placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg')
#####################################################
########### COMPUTE THE PG LOSS FUNCTIONS ###########
#####################################################
# policy
p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh)
act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1))
actions_mask = tf.one_hot(act_ph, depth=act_dim)
p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1)
# entropy useful to study the algorithms
entropy = -tf.reduce_mean(softmax_entropy(p_logits))
p_loss = -tf.reduce_mean(p_log*ret_ph)
# policy optimization
p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss)
#######################################
########### VALUE FUNCTION ###########
#######################################
# value function
s_values = tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh))
# MSE loss function
v_loss = tf.reduce_mean((rtg_ph - s_values)**2)
# value function optimization
v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
    # Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('v_loss', v_loss, collections=['train'])
tf.summary.scalar('entropy', entropy, collections=['train'])
tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train'])
tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('act_multn', act_multn, collections=['train'])
tf.summary.histogram('p_logits', p_logits, collections=['train'])
tf.summary.histogram('ret_ph', ret_ph, collections=['train'])
tf.summary.histogram('rtg_ph', rtg_ph, collections=['train'])
tf.summary.histogram('s_values', s_values, collections=['train'])
train_summary = tf.summary.merge_all('train')
tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train'])
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-steps_{}-aclr_{}-crlr_{}'.format(steps_per_epoch, ac_lr, cr_lr)
file_writer = tf.summary.FileWriter('log_dir/{}/AC_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
# few variables
step_count = 0
train_rewards = []
train_ep_len = []
timer = time.time()
last_print_step = 0
#Reset the environment at the beginning of the cycle
obs = env.reset()
ep_rews = []
# main cycle
for ep in range(num_epochs):
        # initialize the buffer and other variables for the new epoch
buffer = Buffer(gamma)
env_buf = []
        # iterate over a fixed number of steps
for _ in range(steps_per_epoch):
# run the policy
act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]})
# take a step in the environment
obs2, rew, done, _ = env.step(np.squeeze(act))
# add the new transition
env_buf.append([obs.copy(), rew, act, np.squeeze(val)])
obs = obs2.copy()
step_count += 1
last_print_step += 1
ep_rews.append(rew)
if done:
# store the trajectory just completed
                # Changed from REINFORCE! The second argument is the estimated value of the next state.
                # Because the environment is done, we pass a value of 0.
buffer.store(np.array(env_buf), 0)
env_buf = []
                # store additional information about the episode
train_rewards.append(np.sum(ep_rews))
train_ep_len.append(len(ep_rews))
# reset the environment
obs = env.reset()
ep_rews = []
# Bootstrap with the estimated state value of the next state!
if len(env_buf) > 0:
last_sv = sess.run(s_values, feed_dict={obs_ph:[obs]})
buffer.store(np.array(env_buf), last_sv)
# collect the episodes' information
obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch()
# run pre_scalar_summary before the optimization phase
old_p_loss, old_v_loss, epochs_summary = sess.run([p_loss, v_loss, pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
file_writer.add_summary(epochs_summary, step_count)
# Optimize the actor and the critic
sess.run([p_opt, v_opt], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
# run train_summary to save the summary after the optimization
new_p_loss, new_v_loss, train_summary_run = sess.run([p_loss, v_loss, train_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
file_writer.add_summary(train_summary_run, step_count)
summary = tf.Summary()
summary.value.add(tag='diff/p_loss', simple_value=(old_p_loss - new_p_loss))
summary.value.add(tag='diff/v_loss', simple_value=(old_v_loss - new_v_loss))
file_writer.add_summary(summary, step_count)
file_writer.flush()
# it's time to print some useful information
if last_print_step > steps_to_print:
print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer))
summary = tf.Summary()
summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len))
summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards))
file_writer.add_summary(summary, step_count)
file_writer.flush()
timer = time.time()
train_rewards = []
train_ep_len = []
last_print_step = 0
env.close()
file_writer.close()
if __name__ == '__main__':
AC('LunarLander-v2', hidden_sizes=[64], ac_lr=4e-3, cr_lr=1.5e-2, gamma=0.99, steps_per_epoch=100, steps_to_print=5000, num_epochs=8000)
================================================
FILE: Chapter06/REINFORCE.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import time
def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_size, activation=last_activation)
def softmax_entropy(logits):
'''
    Softmax Entropy (NB: as written this returns the negative entropy; the sign is flipped where it is used)
'''
return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
def discounted_rewards(rews, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1]
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
class Buffer():
'''
    Buffer class to store the experience from a single policy
'''
def __init__(self, gamma=0.99):
self.gamma = gamma
self.obs = []
self.act = []
self.ret = []
def store(self, temp_traj):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
        temp_traj: list where each element is a list that contains: observation, reward, action
'''
# store only if the temp_traj list is not empty
if len(temp_traj) > 0:
self.obs.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], self.gamma)
self.ret.extend(rtg)
self.act.extend(temp_traj[:,2])
def get_batch(self):
b_ret = self.ret
return self.obs, self.act, b_ret
def __len__(self):
assert(len(self.obs) == len(self.act) == len(self.ret))
return len(self.obs)
def REINFORCE(env_name, hidden_sizes=[32], lr=5e-3, num_epochs=50, gamma=0.99, steps_per_epoch=100):
'''
REINFORCE Algorithm
Parameters:
-----------
env_name: Name of the environment
hidden_size: list of the number of hidden units for each layer
lr: policy learning rate
gamma: discount factor
steps_per_epoch: number of steps per epoch
    num_epochs: number of training epochs (note: these aren't epochs in the strict sense)
'''
tf.reset_default_graph()
env = gym.make(env_name)
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
# Placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
##################################################
########### COMPUTE THE LOSS FUNCTIONS ###########
##################################################
# policy
p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh)
act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1))
actions_mask = tf.one_hot(act_ph, depth=act_dim)
p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1)
# entropy useful to study the algorithms
entropy = -tf.reduce_mean(softmax_entropy(p_logits))
p_loss = -tf.reduce_mean(p_log*ret_ph)
# policy optimization
p_opt = tf.train.AdamOptimizer(lr).minimize(p_loss)
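    # NB: minimizing p_loss = -E[log pi(a|s) * G] makes Adam follow the REINFORCE gradient
    # estimate E[grad log pi(a|s) * G], i.e. gradient ascent on the expected return,
    # where G is the discounted reward-to-go fed through ret_ph.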
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
    # Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('entropy', entropy, collections=['train'])
tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('act_multn', act_multn, collections=['train'])
tf.summary.histogram('p_logits', p_logits, collections=['train'])
tf.summary.histogram('ret_ph', ret_ph, collections=['train'])
train_summary = tf.summary.merge_all('train')
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-steps_{}-aclr_{}'.format(steps_per_epoch, lr)
file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
# few variables
step_count = 0
train_rewards = []
train_ep_len = []
timer = time.time()
# main cycle
for ep in range(num_epochs):
# initialize environment for the new epochs
obs = env.reset()
        # initialize the buffer and other variables for the new epoch
buffer = Buffer(gamma)
env_buf = []
ep_rews = []
while len(buffer) < steps_per_epoch:
# run the policy
act = sess.run(act_multn, feed_dict={obs_ph:[obs]})
# take a step in the environment
obs2, rew, done, _ = env.step(np.squeeze(act))
# add the new transition
env_buf.append([obs.copy(), rew, act])
obs = obs2.copy()
step_count += 1
ep_rews.append(rew)
if done:
# store the trajectory just completed
buffer.store(np.array(env_buf))
env_buf = []
                # store additional information about the episode
train_rewards.append(np.sum(ep_rews))
train_ep_len.append(len(ep_rews))
# reset the environment
obs = env.reset()
ep_rews = []
# collect the episodes' information
obs_batch, act_batch, ret_batch = buffer.get_batch()
# run pre_scalar_summary before the optimization phase
epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch})
file_writer.add_summary(epochs_summary, step_count)
# Optimize the policy
sess.run(p_opt, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch})
# run train_summary to save the summary after the optimization
train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch})
file_writer.add_summary(train_summary_run, step_count)
# it's time to print some useful information
if ep % 10 == 0:
print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer))
summary = tf.Summary()
summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len))
summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards))
file_writer.add_summary(summary, step_count)
file_writer.flush()
timer = time.time()
train_rewards = []
train_ep_len = []
env.close()
file_writer.close()
if __name__ == '__main__':
REINFORCE('LunarLander-v2', hidden_sizes=[64], lr=8e-3, gamma=0.99, num_epochs=1000, steps_per_epoch=1000)
================================================
FILE: Chapter06/REINFORCE_baseline.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import time
def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_size, activation=last_activation)
def softmax_entropy(logits):
'''
    Softmax Entropy (NB: as written this returns the negative entropy; the sign is flipped where it is used)
'''
return tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
def discounted_rewards(rews, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1]
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
class Buffer():
'''
    Buffer class to store the experience from a single policy
'''
def __init__(self, gamma=0.99):
self.gamma = gamma
self.obs = []
self.act = []
self.ret = []
self.rtg = []
def store(self, temp_traj):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
temp_traj: list where each element is a list that contains: observation, reward, action, state-value
'''
# store only if the temp_traj list is not empty
if len(temp_traj) > 0:
self.obs.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], self.gamma)
# NEW
self.ret.extend(rtg - temp_traj[:,3])
self.rtg.extend(rtg)
self.act.extend(temp_traj[:,2])
def get_batch(self):
# MODIFIED
return self.obs, self.act, self.ret, self.rtg
def __len__(self):
assert(len(self.obs) == len(self.act) == len(self.ret) == len(self.rtg))
return len(self.obs)
def REINFORCE_baseline(env_name, hidden_sizes=[32], p_lr=5e-3, vf_lr=8e-3, gamma=0.99, steps_per_epoch=100, num_epochs=1000):
'''
REINFORCE with baseline Algorithm
Parameters:
-----------
env_name: Name of the environment
hidden_size: list of the number of hidden units for each layer
p_lr: policy learning rate
vf_lr: value function learning rate
gamma: discount factor
steps_per_epoch: number of steps per epoch
    num_epochs: number of training epochs (note: these aren't epochs in the strict sense)
'''
tf.reset_default_graph()
env = gym.make(env_name)
obs_dim = env.observation_space.shape
act_dim = env.action_space.n
# Placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
rtg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='rtg')
#####################################################
########### COMPUTE THE PG LOSS FUNCTIONS ###########
#####################################################
# policy
p_logits = mlp(obs_ph, hidden_sizes, act_dim, activation=tf.tanh)
act_multn = tf.squeeze(tf.random.multinomial(p_logits, 1))
actions_mask = tf.one_hot(act_ph, depth=act_dim)
p_log = tf.reduce_sum(actions_mask * tf.nn.log_softmax(p_logits), axis=1)
# entropy useful to study the algorithms
entropy = -tf.reduce_mean(softmax_entropy(p_logits))
p_loss = -tf.reduce_mean(p_log*ret_ph)
# policy optimization
p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss)
#######################################
########### VALUE FUNCTION ###########
#######################################
########### NEW ###########
# value function
s_values = tf.squeeze(mlp(obs_ph, hidden_sizes, 1, activation=tf.tanh))
# MSE loss function
v_loss = tf.reduce_mean((rtg_ph - s_values)**2)
# value function optimization
v_opt = tf.train.AdamOptimizer(vf_lr).minimize(v_loss)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
    # Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('v_loss', v_loss, collections=['train'])
tf.summary.scalar('entropy', entropy, collections=['train'])
tf.summary.scalar('s_values', tf.reduce_mean(s_values), collections=['train'])
tf.summary.histogram('p_soft', tf.nn.softmax(p_logits), collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('act_multn', act_multn, collections=['train'])
tf.summary.histogram('p_logits', p_logits, collections=['train'])
tf.summary.histogram('ret_ph', ret_ph, collections=['train'])
tf.summary.histogram('rtg_ph', rtg_ph, collections=['train'])
tf.summary.histogram('s_values', s_values, collections=['train'])
train_summary = tf.summary.merge_all('train')
tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train'])
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-steps_{}-plr_{}-vflr_{}'.format(steps_per_epoch, p_lr, vf_lr)
file_writer = tf.summary.FileWriter('log_dir/{}/REINFORCE_basel_{}_{}'.format(env_name, clock_time, hyp_str), tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
# few variables
step_count = 0
train_rewards = []
train_ep_len = []
timer = time.time()
# main cycle
for ep in range(num_epochs):
# initialize environment for the new epochs
obs = env.reset()
        # initialize the buffer and other variables for the new epoch
buffer = Buffer(gamma)
env_buf = []
ep_rews = []
while len(buffer) < steps_per_epoch:
# run the policy
act, val = sess.run([act_multn, s_values], feed_dict={obs_ph:[obs]})
# take a step in the environment
obs2, rew, done, _ = env.step(np.squeeze(act))
# add the new transition
env_buf.append([obs.copy(), rew, act, np.squeeze(val)])
obs = obs2.copy()
step_count += 1
ep_rews.append(rew)
if done:
# store the trajectory just completed
buffer.store(np.array(env_buf))
env_buf = []
                # store additional information about the episode
train_rewards.append(np.sum(ep_rews))
train_ep_len.append(len(ep_rews))
# reset the environment
obs = env.reset()
ep_rews = []
# collect the episodes' information
obs_batch, act_batch, ret_batch, rtg_batch = buffer.get_batch()
# run pre_scalar_summary before the optimization phase
epochs_summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
file_writer.add_summary(epochs_summary, step_count)
# Optimize the NN policy and the NN value function
sess.run([p_opt, v_opt], feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
# run train_summary to save the summary after the optimization
train_summary_run = sess.run(train_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, ret_ph:ret_batch, rtg_ph:rtg_batch})
file_writer.add_summary(train_summary_run, step_count)
# it's time to print some useful information
if ep % 10 == 0:
print('Ep:%d MnRew:%.2f MxRew:%.1f EpLen:%.1f Buffer:%d -- Step:%d -- Time:%d' % (ep, np.mean(train_rewards), np.max(train_rewards), np.mean(train_ep_len), len(buffer), step_count,time.time()-timer))
summary = tf.Summary()
summary.value.add(tag='supplementary/len', simple_value=np.mean(train_ep_len))
summary.value.add(tag='supplementary/train_rew', simple_value=np.mean(train_rewards))
file_writer.add_summary(summary, step_count)
file_writer.flush()
timer = time.time()
train_rewards = []
train_ep_len = []
env.close()
file_writer.close()
if __name__ == '__main__':
REINFORCE_baseline('LunarLander-v2', hidden_sizes=[64], p_lr=8e-3, vf_lr=7e-3, gamma=0.99, steps_per_epoch=1000, num_epochs=1000)
================================================
FILE: Chapter07/PPO.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import time
import roboschool
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def softmax_entropy(logits):
'''
Softmax Entropy
'''
return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
def clipped_surrogate_obj(new_p, old_p, adv, eps):
'''
Clipped surrogate objective function
'''
rt = tf.exp(new_p - old_p) # i.e. pi / old_pi
return -tf.reduce_mean(tf.minimum(rt*adv, tf.clip_by_value(rt, 1-eps, 1+eps)*adv))
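# A worked example with assumed numbers: with eps=0.2, adv=+1 and probability ratio rt=1.5,
# min(rt*adv, clip(rt, 0.8, 1.2)*adv) = min(1.5, 1.2) = 1.2, so the incentive to push the ratio
# further from 1 is capped; with adv=-1 the minimum picks the unclipped term min(-1.5, -1.2) = -1.5,
# so harmful deviations from the old policy are fully penalized.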
def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
'''
Generalized Advantage Estimation
'''
assert len(rews) == len(v)
vs = np.append(v, v_last)
delta = np.array(rews) + gamma*vs[1:] - vs[:-1]
gae_advantage = discounted_rewards(delta, 0, gamma*lam)
return gae_advantage
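# NB (sketch): GAE interpolates between the one-step TD advantage
# (lam=0: adv_t = r_t + gamma*V(s_{t+1}) - V(s_t)) and the Monte Carlo advantage
# (lam=1: adv_t = rtg_t - V(s_t)); intermediate lambda values trade bias for variance.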
def discounted_rewards(rews, last_sv, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
last_sv: value of the last state
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1] + gamma*last_sv
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
class StructEnv(gym.Wrapper):
'''
    Gym Wrapper to store information like the number of steps and the total reward of the last episode.
'''
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.n_obs = self.env.reset()
self.rew_episode = 0
self.len_episode = 0
def reset(self, **kwargs):
self.n_obs = self.env.reset(**kwargs)
self.rew_episode = 0
self.len_episode = 0
return self.n_obs.copy()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.rew_episode += reward
self.len_episode += 1
return ob, reward, done, info
def get_episode_reward(self):
return self.rew_episode
def get_episode_length(self):
return self.len_episode
class Buffer():
'''
    Class to store the experience from a single policy
'''
def __init__(self, gamma=0.99, lam=0.95):
self.gamma = gamma
self.lam = lam
self.adv = []
self.ob = []
self.ac = []
self.rtg = []
def store(self, temp_traj, last_sv):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
temp_traj: list where each element is a list that contains: observation, reward, action, state-value
last_sv: value of the last state (Used to Bootstrap)
'''
# store only if there are temporary trajectories
if len(temp_traj) > 0:
self.ob.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma)
self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam))
self.rtg.extend(rtg)
self.ac.extend(temp_traj[:,2])
def get_batch(self):
# standardize the advantage values
norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10)
return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg)
def __len__(self):
assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg))
return len(self.ob)
def gaussian_log_likelihood(x, mean, log_std):
'''
Gaussian Log Likelihood
'''
log_p = -0.5 *((x-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std + np.log(2*np.pi))
return tf.reduce_sum(log_p, axis=-1)
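# NB: this is the standard diagonal-Gaussian log density,
#   log N(x; mu, sigma) = -0.5 * sum_i [ (x_i - mu_i)^2 / sigma_i^2 + 2*log(sigma_i) + log(2*pi) ],
# with sigma_i = exp(log_std_i); the 1e-9 term only guards against division by zero.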
def PPO(env_name, hidden_sizes=[32], cr_lr=5e-3, ac_lr=5e-3, num_epochs=50, minibatch_size=5000, gamma=0.99, lam=0.95, number_envs=1, eps=0.1,
actor_iter=5, critic_iter=10, steps_per_env=100, action_type='Discrete'):
'''
Proximal Policy Optimization
Parameters:
-----------
env_name: Name of the environment
hidden_size: list of the number of hidden units for each layer
ac_lr: actor learning rate
cr_lr: critic learning rate
num_epochs: number of training epochs
minibatch_size: Batch size used to train the critic and actor
gamma: discount factor
lam: lambda parameter for computing the GAE
number_envs: number of parallel synchronous environments
# NB: it isn't distributed across multiple CPUs
eps: Clip threshold. Max deviation from previous policy.
actor_iter: Number of SGD iterations on the actor per epoch
    critic_iter: Number of SGD iterations on the critic per epoch
steps_per_env: number of steps per environment
# NB: the total number of steps per epoch will be: steps_per_env*number_envs
    action_type: class name of the action space, either 'Discrete' or 'Box'
'''
tf.reset_default_graph()
# Create some environments to collect the trajectories
envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)]
obs_dim = envs[0].observation_space.shape
# Placeholders
if action_type == 'Discrete':
act_dim = envs[0].action_space.n
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
elif action_type == 'Box':
low_action_space = envs[0].action_space.low
high_action_space = envs[0].action_space.high
act_dim = envs[0].action_space.shape[0]
act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act')
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv')
old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log')
    # Computational graph for the policy in case of a discrete action space
if action_type == 'Discrete':
with tf.variable_scope('actor_nn'):
p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=tf.tanh)
act_smp = tf.squeeze(tf.random.multinomial(p_logits, 1))
act_onehot = tf.one_hot(act_ph, depth=act_dim)
p_log = tf.reduce_sum(act_onehot * tf.nn.log_softmax(p_logits), axis=-1)
# Computational graph for the policy in case of a continuous action space
else:
with tf.variable_scope('actor_nn'):
p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh)
log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32)-0.5)
# Add noise to the mean values predicted
# The noise is proportional to the standard deviation
p_noisy = p_logits + tf.random_normal(tf.shape(p_logits), 0, 1) * tf.exp(log_std)
# Clip the noisy actions
act_smp = tf.clip_by_value(p_noisy, low_action_space, high_action_space)
# Compute the gaussian log likelihood
p_log = gaussian_log_likelihood(act_ph, p_logits, log_std)
    # Neural network value function approximator
with tf.variable_scope('critic_nn'):
s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None)
s_values = tf.squeeze(s_values)
# PPO loss function
p_loss = clipped_surrogate_obj(p_log, old_p_log_ph, adv_ph, eps)
# MSE loss function
v_loss = tf.reduce_mean((ret_ph - s_values)**2)
# policy optimizer
p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss)
# value function optimizer
v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
    # Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('v_loss', v_loss, collections=['train'])
tf.summary.scalar('s_values_m', tf.reduce_mean(s_values), collections=['train'])
if action_type == 'Box':
tf.summary.scalar('p_std', tf.reduce_mean(tf.exp(log_std)), collections=['train'])
tf.summary.histogram('log_std',log_std, collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('p_logits', p_logits, collections=['train'])
tf.summary.histogram('s_values', s_values, collections=['train'])
tf.summary.histogram('adv_ph',adv_ph, collections=['train'])
scalar_summary = tf.summary.merge_all('train')
# .. summary to run before the optimization steps
tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train'])
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-bs_'+str(minibatch_size)+'-envs_'+str(number_envs)+'-ac_lr_'+str(ac_lr)+'-cr_lr'+str(cr_lr)+'-act_it_'+str(actor_iter)+'-crit_it_'+str(critic_iter)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/PPO_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
# variable to store the total number of steps
step_count = 0
print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs)
for ep in range(num_epochs):
# Create the buffer that will contain the trajectories (full or partial)
# run with the last policy
buffer = Buffer(gamma, lam)
# lists to store rewards and length of the trajectories completed
batch_rew = []
batch_len = []
# Execute in serial the environments, storing temporarily the trajectories.
for env in envs:
temp_buf = []
#iterate over a fixed number of steps
for _ in range(steps_per_env):
# run the policy
act, val = sess.run([act_smp, s_values], feed_dict={obs_ph:[env.n_obs]})
act = np.squeeze(act)
# take a step in the environment
obs2, rew, done, _ = env.step(act)
# add the new transition to the temporary buffer
temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)])
env.n_obs = obs2.copy()
step_count += 1
if done:
# Store the full trajectory in the buffer
# (the value of the last state is 0 as the trajectory is completed)
buffer.store(np.array(temp_buf), 0)
# Empty temporary buffer
temp_buf = []
batch_rew.append(env.get_episode_reward())
batch_len.append(env.get_episode_length())
# reset the environment
env.reset()
# Bootstrap with the estimated state value of the next state!
last_v = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]})
buffer.store(np.array(temp_buf), np.squeeze(last_v))
# Gather the entire batch from the buffer
        # NB: the whole batch is used and then discarded after the optimization, because PPO is on-policy
obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch()
old_p_log = sess.run(p_log, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch})
old_p_batch = np.array(old_p_log)
summary = sess.run(pre_scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_batch})
file_writer.add_summary(summary, step_count)
lb = len(buffer)
shuffled_batch = np.arange(lb)
# Policy optimization steps
for _ in range(actor_iter):
# shuffle the batch on every iteration
np.random.shuffle(shuffled_batch)
for idx in range(0,lb, minibatch_size):
minib = shuffled_batch[idx:min(idx+minibatch_size,lb)]
sess.run(p_opt, feed_dict={obs_ph:obs_batch[minib], act_ph:act_batch[minib], adv_ph:adv_batch[minib], old_p_log_ph:old_p_batch[minib]})
# Value function optimization steps
for _ in range(critic_iter):
# shuffle the batch on every iteration
np.random.shuffle(shuffled_batch)
for idx in range(0,lb, minibatch_size):
minib = shuffled_batch[idx:min(idx+minibatch_size,lb)]
sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]})
# print some statistics and run the summary for visualizing it on TB
if len(batch_rew) > 0:
train_summary = sess.run(scalar_summary, feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch,
old_p_log_ph:old_p_batch, ret_ph:rtg_batch})
file_writer.add_summary(train_summary, step_count)
summary = tf.Summary()
summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew))
summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len))
file_writer.add_summary(summary, step_count)
file_writer.flush()
print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count))
# closing environments..
for env in envs:
env.close()
# Close the writer
file_writer.close()
if __name__ == '__main__':
PPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=5e-4, ac_lr=2e-4, gamma=0.99, lam=0.95, steps_per_env=5000,
number_envs=1, eps=0.15, actor_iter=6, critic_iter=10, action_type='Box', num_epochs=5000, minibatch_size=256)
================================================
FILE: Chapter07/TRPO.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import roboschool
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def softmax_entropy(logits):
'''
Softmax Entropy
'''
return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
def gaussian_log_likelihood(ac, mean, log_std):
'''
Gaussian Log Likelihood
'''
log_p = ((ac-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std) + np.log(2*np.pi)
return -0.5 * tf.reduce_sum(log_p, axis=-1)
def conjugate_gradient(A, b, x=None, iters=10):
'''
Conjugate gradient method: approximate the solution of Ax=b
    It solves Ax=b without forming the full matrix; it only needs the matrix-vector product (the Fisher-vector product)
    NB: 'A' is not the full matrix, but a function that returns the product between the averaged Fisher information matrix and an arbitrary vector
    Described in Appendix C.1 of the TRPO paper
'''
if x is None:
x = np.zeros_like(b)
r = A(x) - b
p = -r
for _ in range(iters):
a = np.dot(r, r) / (np.dot(p, A(p))+1e-8)
x += a*p
r_n = r + a*A(p)
b = np.dot(r_n, r_n) / (np.dot(r, r)+1e-8)
p = -r_n + b*p
r = r_n
return x
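# A toy sanity check with assumed values: the routine only needs a matrix-vector product, so with
#   A = lambda v: np.array([[4., 1.], [1., 3.]]) @ v
#   b = np.array([1., 2.])
# conjugate_gradient(A, b, iters=10) converges to roughly [0.0909, 0.6364],
# the exact solution of Ax = b.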
def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p):
'''
Gaussian KL divergence in case of a diagonal covariance matrix
'''
return tf.reduce_mean(tf.reduce_sum(0.5 * (log_std_p - log_std_q + tf.exp(log_std_q - log_std_p) + (mu_q - mu_p)**2 / tf.exp(log_std_p) - 1), axis=1))
def backtracking_line_search(Dkl, delta, old_loss, p=0.8):
'''
    Backtracking line search. It looks for a coefficient such that the constraint on the DKL is satisfied
    The accepted step has both to
- improve the non-linear objective
- satisfy the constraint
'''
## Explained in Appendix C of the TRPO paper
a = 1
it = 0
new_dkl, new_loss = Dkl(a)
while (new_dkl > delta) or (new_loss > old_loss):
a *= p
it += 1
new_dkl, new_loss = Dkl(a)
return a
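# NB: starting from a full step (a=1), the step size is shrunk geometrically
# (a -> 0.8*a -> 0.64*a -> ...) until the candidate update both keeps the KL divergence
# below delta and does not increase the surrogate loss.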
def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
'''
Generalized Advantage Estimation
'''
assert len(rews) == len(v)
vs = np.append(v, v_last)
d = np.array(rews) + gamma*vs[1:] - vs[:-1]
gae_advantage = discounted_rewards(d, 0, gamma*lam)
return gae_advantage
def discounted_rewards(rews, last_sv, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
last_sv: value of the last state
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1] + gamma*last_sv
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
class Buffer():
'''
    Class to store the experience from a single policy
'''
def __init__(self, gamma=0.99, lam=0.95):
self.gamma = gamma
self.lam = lam
self.adv = []
self.ob = []
self.ac = []
self.rtg = []
def store(self, temp_traj, last_sv):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
temp_traj: list where each element is a list that contains: observation, reward, action, state-value
last_sv: value of the last state (Used to Bootstrap)
'''
# store only if there are temporary trajectories
if len(temp_traj) > 0:
self.ob.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma)
self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam))
self.rtg.extend(rtg)
self.ac.extend(temp_traj[:,2])
def get_batch(self):
# standardize the advantage values
norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10)
return np.array(self.ob), np.array(self.ac), np.array(norm_adv), np.array(self.rtg)
def __len__(self):
assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg))
return len(self.ob)
def flatten_list(tensor_list):
'''
Flatten a list of tensors
'''
return tf.concat([flatten(t) for t in tensor_list], axis=0)
def flatten(tensor):
'''
Flatten a tensor
'''
return tf.reshape(tensor, shape=(-1,))
class StructEnv(gym.Wrapper):
'''
    Gym Wrapper to store information like the number of steps and the total reward of the last episode.
'''
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.n_obs = self.env.reset()
self.total_rew = 0
self.len_episode = 0
def reset(self, **kwargs):
self.n_obs = self.env.reset(**kwargs)
self.total_rew = 0
self.len_episode = 0
return self.n_obs.copy()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.total_rew += reward
self.len_episode += 1
return ob, reward, done, info
def get_episode_reward(self):
return self.total_rew
def get_episode_length(self):
return self.len_episode
def TRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma=0.99, lam=0.95, number_envs=1,
critic_iter=10, steps_per_env=100, delta=0.002, algorithm='TRPO', conj_iters=10, minibatch_size=1000):
'''
Trust Region Policy Optimization
Parameters:
-----------
env_name: Name of the environment
hidden_sizes: list of the number of hidden units for each layer
cr_lr: critic learning rate
num_epochs: number of training epochs
gamma: discount factor
lam: lambda parameter for computing the GAE
number_envs: number of "parallel" synchronous environments
# NB: it isn't distributed across multiple CPUs
    critic_iter: Number of SGD iterations on the critic per epoch
steps_per_env: number of steps per environment
# NB: the total number of steps per epoch will be: steps_per_env*number_envs
delta: Maximum KL divergence between two policies. Scalar value
    algorithm: type of algorithm. Either 'TRPO' or 'NPG'
conj_iters: number of conjugate gradient iterations
minibatch_size: Batch size used to train the critic
'''
tf.reset_default_graph()
# Create a few environments to collect the trajectories
envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)]
low_action_space = envs[0].action_space.low
high_action_space = envs[0].action_space.high
obs_dim = envs[0].observation_space.shape
act_dim = envs[0].action_space.shape[0]
# Placeholders
act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act')
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv')
old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log')
old_mu_ph = tf.placeholder(shape=(None, act_dim), dtype=tf.float32, name='old_mu')
old_log_std_ph = tf.placeholder(shape=(act_dim), dtype=tf.float32, name='old_log_std')
p_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_ph')
# result of the conjugate gradient algorithm
cg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='cg')
# Neural network that represent the policy
with tf.variable_scope('actor_nn'):
p_means = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh)
log_std = tf.get_variable(name='log_std', initializer=np.zeros(act_dim, dtype=np.float32) - 0.5)
# Neural network that represent the value function
with tf.variable_scope('critic_nn'):
s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None)
s_values = tf.squeeze(s_values)
# Add "noise" to the predicted mean following the Guassian distribution with standard deviation e^(log_std)
p_noisy = p_means + tf.random_normal(tf.shape(p_means), 0, 1) * tf.exp(log_std)
# Clip the noisy actions
a_sampl = tf.clip_by_value(p_noisy, low_action_space, high_action_space)
# Compute the gaussian log likelihood
p_log = gaussian_log_likelihood(act_ph, p_means, log_std)
# Measure the divergence
diverg = tf.reduce_mean(tf.exp(old_p_log_ph - p_log))
# ratio
ratio_new_old = tf.exp(p_log - old_p_log_ph)
# TRPO surrogate loss function
p_loss = - tf.reduce_mean(ratio_new_old * adv_ph)
# MSE loss function
v_loss = tf.reduce_mean((ret_ph - s_values)**2)
# Critic optimization
v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss)
def variables_in_scope(scope):
# get all trainable variables in 'scope'
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
# Gather and flatten the actor parameters
p_variables = variables_in_scope('actor_nn')
p_var_flatten = flatten_list(p_variables)
# Gradient of the policy loss with respect to the actor parameters
p_grads = tf.gradients(p_loss, p_variables)
p_grads_flatten = flatten_list(p_grads)
########### RESTORE ACTOR PARAMETERS ###########
p_old_variables = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_old_variables')
# variable used as index for restoring the actor's parameters
it_v1 = tf.Variable(0, trainable=False)
restore_params = []
for p_v in p_variables:
upd_rsh = tf.reshape(p_old_variables[it_v1 : it_v1+tf.reduce_prod(p_v.shape)], shape=p_v.shape)
restore_params.append(p_v.assign(upd_rsh))
it_v1 += tf.reduce_prod(p_v.shape)
restore_params = tf.group(*restore_params)
# gaussian KL divergence of the two policies
dkl_diverg = gaussian_DKL(old_mu_ph, old_log_std_ph, p_means, log_std)
# Jacobian of the KL divergence (Needed for the Fisher matrix-vector product)
dkl_diverg_grad = tf.gradients(dkl_diverg, p_variables)
dkl_matrix_product = tf.reduce_sum(flatten_list(dkl_diverg_grad) * p_ph)
print('dkl_matrix_product', dkl_matrix_product.shape)
# Fisher vector product
# The Fisher-vector product is a way to compute the A matrix without the need of the full A
Fx = flatten_list(tf.gradients(dkl_matrix_product, p_variables))
## Step length
beta_ph = tf.placeholder(shape=(), dtype=tf.float32, name='beta')
# NPG update
npg_update = beta_ph * cg_ph
## alpha is found through line search
alpha = tf.Variable(1., trainable=False)
# TRPO update
trpo_update = alpha * npg_update
#################### POLICY UPDATE ###################
# variable used as an index
it_v = tf.Variable(0, trainable=False)
p_opt = []
# Apply the updates to the policy
for p_v in p_variables:
upd_rsh = tf.reshape(trpo_update[it_v : it_v+tf.reduce_prod(p_v.shape)], shape=p_v.shape)
p_opt.append(p_v.assign_sub(upd_rsh))
it_v += tf.reduce_prod(p_v.shape)
p_opt = tf.group(*p_opt)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
    # Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('v_loss', v_loss, collections=['train'])
tf.summary.scalar('p_divergence', diverg, collections=['train'])
tf.summary.scalar('ratio_new_old',tf.reduce_mean(ratio_new_old), collections=['train'])
tf.summary.scalar('dkl_diverg', dkl_diverg, collections=['train'])
tf.summary.scalar('alpha', alpha, collections=['train'])
tf.summary.scalar('beta', beta_ph, collections=['train'])
tf.summary.scalar('p_std_mn', tf.reduce_mean(tf.exp(log_std)), collections=['train'])
tf.summary.scalar('s_values_mn', tf.reduce_mean(s_values), collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('p_means', p_means, collections=['train'])
tf.summary.histogram('s_values', s_values, collections=['train'])
tf.summary.histogram('adv_ph',adv_ph, collections=['train'])
tf.summary.histogram('log_std',log_std, collections=['train'])
scalar_summary = tf.summary.merge_all('train')
tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train'])
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-spe_'+str(steps_per_env)+'-envs_'+str(number_envs)+'-cr_lr'+str(cr_lr)+'-crit_it_'+str(critic_iter)+'-delta_'+str(delta)+'-conj_iters_'+str(conj_iters)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+algorithm+'_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
# variable to store the total number of steps
step_count = 0
print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs)
for ep in range(num_epochs):
# Create the buffer that will contain the trajectories (full or partial)
# run with the last policy
buffer = Buffer(gamma, lam)
# lists to store rewards and length of the trajectories completed
batch_rew = []
batch_len = []
# Execute in serial the environment, storing temporarily the trajectories.
for env in envs:
temp_buf = []
# iterate over a fixed number of steps
for _ in range(steps_per_env):
# run the policy
act, val = sess.run([a_sampl, s_values], feed_dict={obs_ph:[env.n_obs]})
act = np.squeeze(act)
# take a step in the environment
obs2, rew, done, _ = env.step(act)
# add the new transition to the temporary buffer
temp_buf.append([env.n_obs.copy(), rew, act, np.squeeze(val)])
env.n_obs = obs2.copy()
step_count += 1
if done:
# Store the full trajectory in the buffer
# (the value of the last state is 0 as the trajectory is completed)
buffer.store(np.array(temp_buf), 0)
# Empty temporary buffer
temp_buf = []
batch_rew.append(env.get_episode_reward())
batch_len.append(env.get_episode_length())
env.reset()
# Bootstrap with the estimated state value of the next state!
lsv = sess.run(s_values, feed_dict={obs_ph:[env.n_obs]})
buffer.store(np.array(temp_buf), np.squeeze(lsv))
# Get the entire batch from the buffer
        # NB: the whole batch is used and then discarded after the optimization, because TRPO is on-policy
obs_batch, act_batch, adv_batch, rtg_batch = buffer.get_batch()
        # log probabilities, means and log std of the "old" policy
        # the "old" policy refers to the policy being optimized, which was used to sample from the environment
old_p_log, old_p_means, old_log_std = sess.run([p_log, p_means, log_std], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch})
# get also the "old" parameters
old_actor_params = sess.run(p_var_flatten)
# old_p_loss is later used in the line search
# run pre_scalar_summary for a summary before the optimization
old_p_loss, summary = sess.run([p_loss,pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
file_writer.add_summary(summary, step_count)
def H_f(p):
'''
Run the Fisher-Vector product on 'p' to approximate the Hessian of the DKL
'''
return sess.run(Fx, feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, p_ph:p, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch})
g_f = sess.run(p_grads_flatten, feed_dict={old_mu_ph:old_p_means,obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
        ## Compute the conjugate gradient so as to obtain an approximation of H^(-1)*g
        # where H isn't the true Hessian of the KL divergence, but an approximation of it computed via the Fisher-vector product (F)
conj_grad = conjugate_gradient(H_f, g_f, iters=conj_iters)
# Compute the step length
beta_np = np.sqrt(2*delta / np.sum(conj_grad * H_f(conj_grad)))
def DKL(alpha_v):
'''
            Compute the KL divergence and the surrogate loss for a candidate step of size alpha_v.
            It applies the update, evaluates the DKL and the loss, and then restores the old parameters.
'''
sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:alpha_v, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log})
a_res = sess.run([dkl_diverg, p_loss], feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
sess.run(restore_params, feed_dict={p_old_variables: old_actor_params})
return a_res
# Actor optimization step
# Different for TRPO or NPG
if algorithm=='TRPO':
            # Backtracking line search to find the maximum alpha coefficient such that the constraint is satisfied
best_alpha = backtracking_line_search(DKL, delta, old_p_loss, p=0.8)
sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:best_alpha, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log})
elif algorithm=='NPG':
# In case of NPG, no line search
sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:1, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log})
lb = len(buffer)
shuffled_batch = np.arange(lb)
np.random.shuffle(shuffled_batch)
# Value function optimization steps
for _ in range(critic_iter):
# shuffle the batch on every iteration
np.random.shuffle(shuffled_batch)
for idx in range(0,lb, minibatch_size):
minib = shuffled_batch[idx:min(idx+minibatch_size,lb)]
sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]})
# print some statistics and run the summary for visualizing it on TB
if len(batch_rew) > 0:
train_summary = sess.run(scalar_summary, feed_dict={beta_ph:beta_np, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, cg_ph:conj_grad,
old_p_log_ph:old_p_log, ret_ph:rtg_batch, old_mu_ph:old_p_means, old_log_std_ph:old_log_std})
file_writer.add_summary(train_summary, step_count)
summary = tf.Summary()
summary.value.add(tag='supplementary/performance', simple_value=np.mean(batch_rew))
summary.value.add(tag='supplementary/len', simple_value=np.mean(batch_len))
file_writer.add_summary(summary, step_count)
file_writer.flush()
print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count))
# closing environments..
for env in envs:
env.close()
file_writer.close()
if __name__ == '__main__':
TRPO('RoboschoolWalker2d-v1', hidden_sizes=[64,64], cr_lr=2e-3, gamma=0.99, lam=0.95, num_epochs=1000, steps_per_env=6000,
number_envs=1, critic_iter=10, delta=0.01, algorithm='TRPO', conj_iters=10, minibatch_size=1000)
================================================
FILE: Chapter08/DDPG.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
from collections import deque
import time
current_milli_time = lambda: int(round(time.time() * 1000))
def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def deterministic_actor_critic(x, a, hidden_sizes, act_dim, max_act):
'''
Deterministic Actor-Critic
'''
# Actor
with tf.variable_scope('p_mlp'):
p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh)
    # Critic that takes as input the deterministic action of the actor
with tf.variable_scope('q_mlp'):
q_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None)
    # Critic that takes as input an arbitrary action
with tf.variable_scope('q_mlp', reuse=True): # Use the weights of the mlp just defined
q_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None)
return p_means, tf.squeeze(q_d), tf.squeeze(q_a)
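# NB: q_d evaluates the critic on the actor's own action and is used for the policy gradient
# (-mean(Q(s, pi(s)))), while q_a reuses the same weights (reuse=True) to score arbitrary
# replay-buffer actions for the TD-error critic loss.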
class ExperiencedBuffer():
'''
    Experience replay buffer
'''
def __init__(self, buffer_size):
# Contains up to 'buffer_size' experience
self.obs_buf = deque(maxlen=buffer_size)
self.rew_buf = deque(maxlen=buffer_size)
self.act_buf = deque(maxlen=buffer_size)
self.obs2_buf = deque(maxlen=buffer_size)
self.done_buf = deque(maxlen=buffer_size)
def add(self, obs, rew, act, obs2, done):
'''
Add a new transition to the buffers
'''
self.obs_buf.append(obs)
self.rew_buf.append(rew)
self.act_buf.append(act)
self.obs2_buf.append(obs2)
self.done_buf.append(done)
def sample_minibatch(self, batch_size):
'''
Sample a mini-batch of size 'batch_size'
'''
mb_indices = np.random.randint(len(self.obs_buf), size=batch_size)
mb_obs = [self.obs_buf[i] for i in mb_indices]
mb_rew = [self.rew_buf[i] for i in mb_indices]
mb_act = [self.act_buf[i] for i in mb_indices]
mb_obs2 = [self.obs2_buf[i] for i in mb_indices]
mb_done = [self.done_buf[i] for i in mb_indices]
return mb_obs, mb_rew, mb_act, mb_obs2, mb_done
def __len__(self):
return len(self.obs_buf)
def test_agent(env_test, agent_op, num_games=10):
'''
    Test the agent 'agent_op' for 'num_games' games
    Return the mean and the standard deviation of the episode rewards
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
a_s = agent_op(o)
o, r, d, _ = env_test.step(a_s)
game_r += r
games_r.append(game_r)
return np.mean(games_r), np.std(games_r)
def DDPG(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=100, mean_summaries_steps=1000,
batch_size=128, min_buffer_size=5000, tau=0.005):
# Create an environment for training
env = gym.make(env_name)
# Create an environment for testing the actor
env_test = gym.make(env_name)
tf.reset_default_graph()
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--')
# Create some placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act')
y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
# Create an online deterministic actor-critic
with tf.variable_scope('online'):
p_onl, qd_onl, qa_onl = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high))
# and a target one
with tf.variable_scope('target'):
_, qd_tar, _ = deterministic_actor_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high))
def variables_in_scope(scope):
'''
Retrieve all the variables in the scope 'scope'
'''
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
# Copy all the online variables to the target networks i.e. target = online
# Needed only at the beginning
init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))]
init_target_op = tf.group(*init_target)
# Soft update
update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))]
update_target_op = tf.group(*update_target)
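    # NB: with e.g. the default tau=0.005 each target weight moves only 0.5% of the way
    # toward its online counterpart per update (target = tau*online + (1-tau)*target),
    # keeping the TD targets slowly moving and stabilizing learning.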
# Critic loss (MSE)
q_loss = tf.reduce_mean((qa_onl - y_ph)**2)
# Actor loss
p_loss = -tf.reduce_mean(qd_onl)
# Optimize the critic
q_opt = tf.train.AdamOptimizer(cr_lr).minimize(q_loss)
# Optimize the actor
p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp'))
def agent_op(o):
a = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]}))
return np.clip(a, env.action_space.low, env.action_space.high)
def agent_noisy_op(o, scale):
action = agent_op(o)
noisy_action = action + np.random.normal(loc=0.0, scale=scale, size=action.shape)
return np.clip(noisy_action, env.action_space.low, env.action_space.high)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
print('Time:', clock_time)
# Set TensorBoard
tf.summary.scalar('loss/q', q_loss)
tf.summary.scalar('loss/p', p_loss)
scalar_summary = tf.summary.merge_all()
hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/DDPG_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# Create a session and initialize the variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(init_target_op)
# Some useful variables..
render_the_game = False
step_count = 0
last_q_update_loss = []
last_p_update_loss = []
ep_time = current_milli_time()
batch_rew = []
# Reset the environment
obs = env.reset()
# Initialize the buffer
buffer = ExperiencedBuffer(buffer_size)
for ep in range(num_epochs):
g_rew = 0
done = False
while not done:
# If not gathered enough experience yet, act randomly
if len(buffer) < min_buffer_size:
act = env.action_space.sample()
else:
act = agent_noisy_op(obs, 0.1)
# Take a step in the environment
obs2, rew, done, _ = env.step(act)
if render_the_game:
env.render()
# Add the transition in the buffer
buffer.add(obs.copy(), rew, act, obs2.copy(), done)
obs = obs2
g_rew += rew
step_count += 1
if len(buffer) > min_buffer_size:
# sample a mini batch from the buffer
mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
# Compute the target values
q_target_mb = sess.run(qd_tar, feed_dict={obs_ph:mb_obs2})
y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb
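                # NB: this is the DDPG Bellman target
                #   y = r + gamma * (1 - done) * Q_target(s', pi_target(s')),
                # where qd_tar already evaluates the target critic on the target actor's action.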
# optimize the critic
train_summary, _, q_train_loss = sess.run([scalar_summary, q_opt, q_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act})
# optimize the actor
_, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs})
# summaries..
file_writer.add_summary(train_summary, step_count)
last_q_update_loss.append(q_train_loss)
last_p_update_loss.append(p_train_loss)
# Soft update of the target networks
sess.run(update_target_op)
# some 'mean' summaries to plot more smooth functions
if step_count % mean_summaries_steps == 0:
summary = tf.Summary()
summary.value.add(tag='loss/mean_q', simple_value=np.mean(last_q_update_loss))
summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss))
file_writer.add_summary(summary, step_count)
file_writer.flush()
last_q_update_loss = []
last_p_update_loss = []
if done:
obs = env.reset()
batch_rew.append(g_rew)
g_rew, render_the_game = 0, False
# Test the actor every 10 epochs
if ep % 10 == 0:
test_mn_rw, test_std_rw = test_agent(env_test, agent_op)
summary = tf.Summary()
summary.value.add(tag='test/reward', simple_value=test_mn_rw)
file_writer.add_summary(summary, step_count)
file_writer.flush()
ep_sec_time = int((current_milli_time()-ep_time) / 1000)
print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time))
ep_time = current_milli_time()
batch_rew = []
if ep % render_cycle == 0:
render_the_game = True
# close everything
file_writer.close()
env.close()
env_test.close()
if __name__ == '__main__':
DDPG('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=3e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64,
min_buffer_size=10000, tau=0.003)
================================================
FILE: Chapter08/TD3.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
from collections import deque
import time
current_milli_time = lambda: int(round(time.time() * 1000))
def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
# CHANGED FROM DDPG!
def deterministic_actor_double_critic(x, a, hidden_sizes, act_dim, max_act=1):
'''
Deterministic Actor-Critic
'''
# Actor
with tf.variable_scope('p_mlp'):
p_means = max_act * mlp(x, hidden_sizes, act_dim, last_activation=tf.tanh)
# First critic
with tf.variable_scope('q1_mlp'):
q1_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None)
with tf.variable_scope('q1_mlp', reuse=True): # Use the weights of the mlp just defined
q1_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None)
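# reuse=True shares the weights with the critic defined just above: the same network is evaluated
# both at the policy action (q1_d, used by the actor loss) and at the buffer action (q1_a, used by
# the critic loss). The same pattern is repeated for the second critic below.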
# Second critic
with tf.variable_scope('q2_mlp'):
q2_d = mlp(tf.concat([x,p_means], axis=-1), hidden_sizes, 1, last_activation=None)
with tf.variable_scope('q2_mlp', reuse=True):
q2_a = mlp(tf.concat([x,a], axis=-1), hidden_sizes, 1, last_activation=None)
return p_means, tf.squeeze(q1_d), tf.squeeze(q1_a), tf.squeeze(q2_d), tf.squeeze(q2_a)
class ExperiencedBuffer():
'''
Experienced buffer
'''
def __init__(self, buffer_size):
# Contains up to 'buffer_size' experience
self.obs_buf = deque(maxlen=buffer_size)
self.rew_buf = deque(maxlen=buffer_size)
self.act_buf = deque(maxlen=buffer_size)
self.obs2_buf = deque(maxlen=buffer_size)
self.done_buf = deque(maxlen=buffer_size)
def add(self, obs, rew, act, obs2, done):
'''
Add a new transition to the buffers
'''
self.obs_buf.append(obs)
self.rew_buf.append(rew)
self.act_buf.append(act)
self.obs2_buf.append(obs2)
self.done_buf.append(done)
def sample_minibatch(self, batch_size):
'''
Sample a mini-batch of size 'batch_size'
'''
mb_indices = np.random.randint(len(self.obs_buf), size=batch_size)
mb_obs = [self.obs_buf[i] for i in mb_indices]
mb_rew = [self.rew_buf[i] for i in mb_indices]
mb_act = [self.act_buf[i] for i in mb_indices]
mb_obs2 = [self.obs2_buf[i] for i in mb_indices]
mb_done = [self.done_buf[i] for i in mb_indices]
return mb_obs, mb_rew, mb_act, mb_obs2, mb_done
def __len__(self):
return len(self.obs_buf)
def test_agent(env_test, agent_op, num_games=10):
'''
Test an agent 'agent_op', 'num_games' times
Return mean and std
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
a_s = agent_op(o)
o, r, d, _ = env_test.step(a_s)
game_r += r
games_r.append(game_r)
return np.mean(games_r), np.std(games_r)
def TD3(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=2000, buffer_size=5000, discount=0.99, render_cycle=10000, mean_summaries_steps=1000,
batch_size=128, min_buffer_size=5000, tau=0.005, target_noise=0.2, expl_noise=0.1, policy_update_freq=2):
# Create an environment for training
env = gym.make(env_name)
# Create an environment for testing the actor
env_test = gym.make(env_name)
tf.reset_default_graph()
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
print('-- Observation space:', obs_dim, ' Action space:', act_dim, '--')
# Create some placeholders
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None, act_dim[0]), dtype=tf.float32, name='act')
y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
# Create an online deterministic actor and a double critic
with tf.variable_scope('online'):
p_onl, qd1_onl, qa1_onl, _, qa2_onl = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high))
# and a target actor and double critic
with tf.variable_scope('target'):
p_tar, _, qa1_tar, _, qa2_tar = deterministic_actor_double_critic(obs_ph, act_ph, hidden_sizes, act_dim[0], np.max(env.action_space.high))
def variables_in_scope(scope):
'''
Retrieve all the variables in the scope 'scope'
'''
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
# Copy all the online variables to the target networks i.e. target = online
# Needed only at the beginning
init_target = [target_var.assign(online_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))]
init_target_op = tf.group(*init_target)
# Soft update
update_target = [target_var.assign(tau*online_var + (1-tau)*target_var) for target_var, online_var in zip(variables_in_scope('target'), variables_in_scope('online'))]
update_target_op = tf.group(*update_target)
# Critics loss (MSE)
q1_loss = tf.reduce_mean((qa1_onl - y_ph)**2)
q2_loss = tf.reduce_mean((qa2_onl - y_ph)**2)
# Actor loss
p_loss = -tf.reduce_mean(qd1_onl)
# Optimize the critics
q1_opt = tf.train.AdamOptimizer(cr_lr).minimize(q1_loss)
q2_opt = tf.train.AdamOptimizer(cr_lr).minimize(q2_loss)
# Optimize the actor
p_opt = tf.train.AdamOptimizer(ac_lr).minimize(p_loss, var_list=variables_in_scope('online/p_mlp'))
def add_normal_noise(x, scale, low_lim=-0.5, high_lim=0.5):
return x + np.clip(np.random.normal(loc=0.0, scale=scale, size=x.shape), low_lim, high_lim)
def agent_op(o):
ac = np.squeeze(sess.run(p_onl, feed_dict={obs_ph:[o]}))
return np.clip(ac, env.action_space.low, env.action_space.high)
def agent_noisy_op(o, scale):
ac = agent_op(o)
return np.clip(add_normal_noise(ac, scale, env.action_space.low, env.action_space.high), env.action_space.low, env.action_space.high)
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, int(now.second))
print('Time:', clock_time)
# Set TensorBoard
tf.summary.scalar('loss/q1', q1_loss)
tf.summary.scalar('loss/q2', q2_loss)
tf.summary.scalar('loss/p', p_loss)
scalar_summary = tf.summary.merge_all()
hyp_str = '-aclr_'+str(ac_lr)+'-crlr_'+str(cr_lr)+'-tau_'+str(tau)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/TD3_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# Create a session and initialize the variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(init_target_op)
# Some useful variables..
render_the_game = False
step_count = 0
last_q1_update_loss = []
last_q2_update_loss = []
last_p_update_loss = []
ep_time = current_milli_time()
batch_rew = []
# Reset the environment
obs = env.reset()
# Initialize the buffer
buffer = ExperiencedBuffer(buffer_size)
for ep in range(num_epochs):
g_rew = 0
done = False
while not done:
# If we haven't gathered enough experience yet, act randomly
if len(buffer) < min_buffer_size:
act = env.action_space.sample()
else:
act = agent_noisy_op(obs, expl_noise)
# Take a step in the environment
obs2, rew, done, _ = env.step(act)
if render_the_game:
env.render()
# Add the transition to the buffer
buffer.add(obs.copy(), rew, act, obs2.copy(), done)
obs = obs2
g_rew += rew
step_count += 1
if len(buffer) > min_buffer_size:
# sample a mini batch from the buffer
mb_obs, mb_rew, mb_act, mb_obs2, mb_done = buffer.sample_minibatch(batch_size)
double_actions = sess.run(p_tar, feed_dict={obs_ph:mb_obs2})
# Target regularization
double_noisy_actions = np.clip(add_normal_noise(double_actions, target_noise), env.action_space.low, env.action_space.high)
# Clipped Double Q-learning
q1_target_mb, q2_target_mb = sess.run([qa1_tar,qa2_tar], feed_dict={obs_ph:mb_obs2, act_ph:double_noisy_actions})
q_target_mb = np.min([q1_target_mb, q2_target_mb], axis=0)
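# Taking the element-wise minimum of the two target critics (clipped double Q-learning)
# counteracts the overestimation bias that a single critic would introduce in the targets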
assert(len(q1_target_mb) == len(q_target_mb))
# Compute the target values
y_r = np.array(mb_rew) + discount*(1-np.array(mb_done))*q_target_mb
# Optimize the critics
train_summary, _, q1_train_loss, _, q2_train_loss = sess.run([scalar_summary, q1_opt, q1_loss, q2_opt, q2_loss], feed_dict={obs_ph:mb_obs, y_ph:y_r, act_ph: mb_act})
# Delayed policy update
if step_count % policy_update_freq == 0:
# Optimize the policy
_, p_train_loss = sess.run([p_opt, p_loss], feed_dict={obs_ph:mb_obs})
# Soft update of the target networks
sess.run(update_target_op)
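# Both the actor update and the soft update of the targets happen only once every
# policy_update_freq critic updates (the "delayed" part of TD3)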
file_writer.add_summary(train_summary, step_count)
last_q1_update_loss.append(q1_train_loss)
last_q2_update_loss.append(q2_train_loss)
last_p_update_loss.append(p_train_loss)
# some 'mean' summaries to plot smoother curves
if step_count % mean_summaries_steps == 0:
summary = tf.Summary()
summary.value.add(tag='loss/mean_q1', simple_value=np.mean(last_q1_update_loss))
summary.value.add(tag='loss/mean_q2', simple_value=np.mean(last_q2_update_loss))
summary.value.add(tag='loss/mean_p', simple_value=np.mean(last_p_update_loss))
file_writer.add_summary(summary, step_count)
file_writer.flush()
last_q1_update_loss = []
last_q2_update_loss = []
last_p_update_loss = []
if done:
obs = env.reset()
batch_rew.append(g_rew)
g_rew, render_the_game = 0, False
# Test the actor every 10 epochs
if ep % 10 == 0:
test_mn_rw, test_std_rw = test_agent(env_test, agent_op)
summary = tf.Summary()
summary.value.add(tag='test/reward', simple_value=test_mn_rw)
file_writer.add_summary(summary, step_count)
file_writer.flush()
ep_sec_time = int((current_milli_time()-ep_time) / 1000)
print('Ep:%4d Rew:%4.2f -- Step:%5d -- Test:%4.2f %4.2f -- Time:%d' % (ep,np.mean(batch_rew), step_count, test_mn_rw, test_std_rw, ep_sec_time))
ep_time = current_milli_time()
batch_rew = []
if ep % render_cycle == 0:
render_the_game = True
# close everything
file_writer.close()
env.close()
env_test.close()
if __name__ == '__main__':
TD3('BipedalWalker-v2', hidden_sizes=[64,64], ac_lr=4e-4, cr_lr=4e-4, buffer_size=200000, mean_summaries_steps=100, batch_size=64,
min_buffer_size=10000, tau=0.005, policy_update_freq=2, target_noise=0.1)
================================================
FILE: Chapter09/ME-TRPO.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
import roboschool
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def softmax_entropy(logits):
'''
Softmax Entropy
'''
return -tf.reduce_sum(tf.nn.softmax(logits, axis=-1) * tf.nn.log_softmax(logits, axis=-1), axis=-1)
def gaussian_log_likelihood(ac, mean, log_std):
'''
Gaussian Log Likelihood
'''
log_p = ((ac-mean)**2 / (tf.exp(log_std)**2+1e-9) + 2*log_std) + np.log(2*np.pi)
return -0.5 * tf.reduce_sum(log_p, axis=-1)
def conjugate_gradient(A, b, x=None, iters=10):
'''
Conjugate gradient method: approximate the solution of Ax=b
It solves Ax=b without ever forming the full matrix, using only matrix-vector products (the Fisher-vector product)
NB: here A is not an explicit matrix but a function that computes the product between the averaged Fisher information matrix and an arbitrary vector
Described in Appendix C.1 of the TRPO paper
'''
if x is None:
x = np.zeros_like(b)
r = A(x) - b
p = -r
for _ in range(iters):
a = np.dot(r, r) / (np.dot(p, A(p))+1e-8)
x += a*p
r_n = r + a*A(p)
b = np.dot(r_n, r_n) / (np.dot(r, r)+1e-8)
p = -r_n + b*p
r = r_n
return x
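# Illustrative sanity check (not part of the training loop): with an explicit symmetric
# positive-definite matrix A, the routine approximately solves Ax = b, e.g.
#   A = np.array([[4., 1.], [1., 3.]])
#   b = np.array([1., 2.])
#   x = conjugate_gradient(lambda v: A.dot(v), b, iters=10)   # A.dot(x) ~= b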
def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p):
'''
Gaussian KL divergence in case of a diagonal covariance matrix
'''
return tf.reduce_mean(tf.reduce_sum(0.5 * (log_std_p - log_std_q + tf.exp(log_std_q - log_std_p) + (mu_q - mu_p)**2 / tf.exp(log_std_p) - 1), axis=1))
def backtracking_line_search(Dkl, delta, old_loss, p=0.8):
'''
Backtracking line search. It looks for the largest coefficient s.t. the constraint on the DKL is satisfied
The chosen step has both to
- improve the non-linear objective
- satisfy the constraint
'''
## Explained in Appendix C of the TRPO paper
a = 1
it = 0
new_dkl, new_loss = Dkl(a)
while (new_dkl > delta) or (new_loss > old_loss):
a *= p
it += 1
new_dkl, new_loss = Dkl(a)
return a
def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
'''
Generalized Advantage Estimation
'''
assert len(rews) == len(v)
vs = np.append(v, v_last)
d = np.array(rews) + gamma*vs[1:] - vs[:-1]
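# d contains the TD residuals d_t = r_t + gamma*V(s_{t+1}) - V(s_t); the advantage is their
# (gamma*lam)-discounted sum, with lam trading bias (lam=0) against variance (lam=1)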
gae_advantage = discounted_rewards(d, 0, gamma*lam)
return gae_advantage
def discounted_rewards(rews, last_sv, gamma):
'''
Discounted reward to go
Parameters:
----------
rews: list of rewards
last_sv: value of the last state
gamma: discount value
'''
rtg = np.zeros_like(rews, dtype=np.float32)
rtg[-1] = rews[-1] + gamma*last_sv
for i in reversed(range(len(rews)-1)):
rtg[i] = rews[i] + gamma*rtg[i+1]
return rtg
def flatten_list(tensor_list):
'''
Flatten a list of tensors
'''
return tf.concat([flatten(t) for t in tensor_list], axis=0)
def flatten(tensor):
'''
Flatten a tensor
'''
return tf.reshape(tensor, shape=(-1,))
def test_agent(env_test, agent_op, num_games=10):
'''
Test an agent 'agent_op', 'num_games' times
Return mean and std
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
a_s, _ = agent_op([o])
o, r, d, _ = env_test.step(a_s)
game_r += r
games_r.append(game_r)
return np.mean(games_r), np.std(games_r)
class Buffer():
'''
Class to store the experience from a unique policy
'''
def __init__(self, gamma=0.99, lam=0.95):
self.gamma = gamma
self.lam = lam
self.adv = []
self.ob = []
self.ac = []
self.rtg = []
def store(self, temp_traj, last_sv):
'''
Add temp_traj values to the buffers and compute the advantage and reward to go
Parameters:
-----------
temp_traj: list where each element is a list that contains: observation, reward, action, state-value
last_sv: value of the last state (Used to Bootstrap)
'''
# store only if there are temporary trajectories
if len(temp_traj) > 0:
self.ob.extend(temp_traj[:,0])
rtg = discounted_rewards(temp_traj[:,1], last_sv, self.gamma)
self.adv.extend(GAE(temp_traj[:,1], temp_traj[:,3], last_sv, self.gamma, self.lam))
self.rtg.extend(rtg)
self.ac.extend(temp_traj[:,2])
def get_batch(self):
# standardize the advantage values
norm_adv = (self.adv - np.mean(self.adv)) / (np.std(self.adv) + 1e-10)
return np.array(self.ob), np.array(np.expand_dims(self.ac,-1)), np.array(norm_adv), np.array(self.rtg)
def __len__(self):
assert(len(self.adv) == len(self.ob) == len(self.ac) == len(self.rtg))
return len(self.ob)
class FullBuffer():
def __init__(self):
self.rew = []
self.obs = []
self.act = []
self.nxt_obs = []
self.done = []
self.train_idx = []
self.valid_idx = []
self.idx = 0
def store(self, obs, act, rew, nxt_obs, done):
self.rew.append(rew)
self.obs.append(obs)
self.act.append(act)
self.nxt_obs.append(nxt_obs)
self.done.append(done)
self.idx += 1
def generate_random_dataset(self):
rnd = np.arange(len(self.obs))
np.random.shuffle(rnd)
self.valid_idx = rnd[ : int(len(self.obs)/3)]
self.train_idx = rnd[int(len(self.obs)/3) : ]
print('Train set:', len(self.train_idx), 'Valid set:', len(self.valid_idx))
def get_training_batch(self):
return np.array(self.obs)[self.train_idx], np.array(np.expand_dims(self.act,-1))[self.train_idx], np.array(self.rew)[self.train_idx], np.array(self.nxt_obs)[self.train_idx], np.array(self.done)[self.train_idx]
def get_valid_batch(self):
return np.array(self.obs)[self.valid_idx], np.array(np.expand_dims(self.act,-1))[self.valid_idx], np.array(self.rew)[self.valid_idx], np.array(self.nxt_obs)[self.valid_idx], np.array(self.done)[self.valid_idx]
def __len__(self):
assert(len(self.rew) == len(self.obs) == len(self.act) == len(self.nxt_obs) == len(self.done))
return len(self.obs)
def simulate_environment(env, policy, simulated_steps):
buffer = Buffer(0.99, 0.95)
# lists to store rewards and length of the trajectories completed
steps = 0
number_episodes = 0
while steps < simulated_steps:
temp_buf = []
obs = env.reset()
number_episodes += 1
done = False
while not done:
act, val = policy([obs])
obs2, rew, done, _ = env.step([act])
temp_buf.append([obs.copy(), rew, np.squeeze(act), np.squeeze(val)])
obs = obs2.copy()
steps += 1
if done:
buffer.store(np.array(temp_buf), 0)
temp_buf = []
if steps == simulated_steps:
break
buffer.store(np.array(temp_buf), np.squeeze(policy([obs])[1]))
print('Sim ep:',number_episodes, end=' ')
return buffer.get_batch()
class NetworkEnv(gym.Wrapper):
def __init__(self, env, model_func, reward_func, done_func, number_models):
gym.Wrapper.__init__(self, env)
self.model_func = model_func
self.reward_func = reward_func
self.done_func = done_func
self.number_models = number_models
self.len_episode = 0
def reset(self, **kwargs):
self.len_episode = 0
self.obs = self.env.reset(**kwargs)
return self.obs
def step(self, action):
# predict the next state on a random model
obs = self.model_func(self.obs, [np.squeeze(action)], np.random.randint(0,self.number_models))
rew = self.reward_func(self.obs, [np.squeeze(action)])
done = self.done_func(obs)
self.len_episode += 1
if self.len_episode >= 990:
done = True
self.obs = obs
return self.obs, rew, done, ""
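# NetworkEnv is the heart of ME-TRPO: env.step is replaced by a learned dynamics model picked at
# random from the ensemble, so the policy is optimized almost entirely on imagined transitions,
# while the reward and termination signals come from the known hand-coded functions passed in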
class StructEnv(gym.Wrapper):
'''
Gym Wrapper to store information like the number of steps and the total reward of the last episode.
'''
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.n_obs = self.env.reset()
self.total_rew = 0
self.len_episode = 0
def reset(self, **kwargs):
self.n_obs = self.env.reset(**kwargs)
self.total_rew = 0
self.len_episode = 0
return self.n_obs.copy()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.total_rew += reward
self.len_episode += 1
return ob, reward, done, info
def get_episode_reward(self):
return self.total_rew
def get_episode_length(self):
return self.len_episode
def pendulum_done(ob):
return np.abs(np.arcsin(np.squeeze(ob[3]))) > .2
def pendulum_reward(ob, ac):
return 1
def restore_model(old_model_variables, m_variables):
# variable used as an index for restoring the model's parameters
it_v2 = tf.Variable(0, trainable=False)
restore_m_params = []
for m_v in m_variables:
upd_m_rsh = tf.reshape(old_model_variables[it_v2 : it_v2+tf.reduce_prod(m_v.shape)], shape=m_v.shape)
restore_m_params.append(m_v.assign(upd_m_rsh))
it_v2 += tf.reduce_prod(m_v.shape)
return tf.group(*restore_m_params)
def METRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma=0.99, lam=0.95, number_envs=1,
critic_iter=10, steps_per_env=100, delta=0.002, algorithm='TRPO', conj_iters=10, minibatch_size=1000,
mb_lr=0.0001, model_batch_size=512, simulated_steps=300, num_ensemble_models=2, model_iter=15):
'''
Model Ensemble Trust Region Policy Optimization
Parameters:
-----------
env_name: Name of the environment
hidden_sizes: list of the number of hidden units for each layer
cr_lr: critic learning rate
num_epochs: number of training epochs
gamma: discount factor
lam: lambda parameter for computing the GAE
number_envs: number of "parallel" synchronous environments
# NB: it isn't distributed across multiple CPUs
critic_iter: Number of SGD iterations on the critic per epoch
steps_per_env: number of steps per environment
# NB: the total number of steps per epoch will be: steps_per_env*number_envs
delta: Maximum KL divergence between two policies. Scalar value
algorithm: type of algorithm. Either 'TRPO' or 'NPO'
conj_iters: number of conjugate gradient iterations
minibatch_size: Batch size used to train the critic
mb_lr: learning rate of the environment model
model_batch_size: batch size of the environment model
simulated_steps: number of simulated steps for each policy update
num_ensemble_models: number of models
model_iter: number of iterations without improvement before stopping training the model
'''
# TODO: add ME-TRPO hyperparameters
tf.reset_default_graph()
# Create a few environments to collect the trajectories
envs = [StructEnv(gym.make(env_name)) for _ in range(number_envs)]
env_test = gym.make(env_name)
#env_test = gym.wrappers.Monitor(env_test, "VIDEOS/", force=True, video_callable=lambda x: x%10 == 0)
low_action_space = envs[0].action_space.low
high_action_space = envs[0].action_space.high
obs_dim = envs[0].observation_space.shape
act_dim = envs[0].action_space.shape[0]
print(envs[0].action_space, envs[0].observation_space)
# Placeholders
act_ph = tf.placeholder(shape=(None,act_dim), dtype=tf.float32, name='act')
obs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='obs')
# NEW
nobs_ph = tf.placeholder(shape=(None, obs_dim[0]), dtype=tf.float32, name='nobs')
ret_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='ret')
adv_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='adv')
old_p_log_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_p_log')
old_mu_ph = tf.placeholder(shape=(None, act_dim), dtype=tf.float32, name='old_mu')
old_log_std_ph = tf.placeholder(shape=(act_dim), dtype=tf.float32, name='old_log_std')
p_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_ph')
# result of the conjugate gradient algorithm
cg_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='cg')
#########################################################
######################## POLICY #########################
#########################################################
old_model_variables = tf.placeholder(shape=(None,), dtype=tf.float32, name='old_model_variables')
# Neural network that represent the policy
with tf.variable_scope('actor_nn'):
p_means = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh)
p_means = tf.clip_by_value(p_means, low_action_space, high_action_space)
log_std = tf.get_variable(name='log_std', initializer=np.ones(act_dim, dtype=np.float32))
# Neural network that represent the value function
with tf.variable_scope('critic_nn'):
s_values = mlp(obs_ph, hidden_sizes, 1, tf.tanh, last_activation=None)
s_values = tf.squeeze(s_values)
# Add "noise" to the predicted mean following the Gaussian distribution with standard deviation e^(log_std)
p_noisy = p_means + tf.random_normal(tf.shape(p_means), 0, 1) * tf.exp(log_std)
# Clip the noisy actions
a_sampl = tf.clip_by_value(p_noisy, low_action_space, high_action_space)
# Compute the gaussian log likelihood
p_log = gaussian_log_likelihood(act_ph, p_means, log_std)
# Measure the divergence
diverg = tf.reduce_mean(tf.exp(old_p_log_ph - p_log))
# ratio
ratio_new_old = tf.exp(p_log - old_p_log_ph)
# TRPO surrogate loss function
p_loss = - tf.reduce_mean(ratio_new_old * adv_ph)
# MSE loss function
v_loss = tf.reduce_mean((ret_ph - s_values)**2)
# Critic optimization
v_opt = tf.train.AdamOptimizer(cr_lr).minimize(v_loss)
def variables_in_scope(scope):
# get all trainable variables in 'scope'
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
# Gather and flatten the actor parameters
p_variables = variables_in_scope('actor_nn')
p_var_flatten = flatten_list(p_variables)
# Gradient of the policy loss with respect to the actor parameters
p_grads = tf.gradients(p_loss, p_variables)
p_grads_flatten = flatten_list(p_grads)
########### RESTORE ACTOR PARAMETERS ###########
p_old_variables = tf.placeholder(shape=(None,), dtype=tf.float32, name='p_old_variables')
# variable used as index for restoring the actor's parameters
it_v1 = tf.Variable(0, trainable=False)
restore_params = []
for p_v in p_variables:
upd_rsh = tf.reshape(p_old_variables[it_v1 : it_v1+tf.reduce_prod(p_v.shape)], shape=p_v.shape)
restore_params.append(p_v.assign(upd_rsh))
it_v1 += tf.reduce_prod(p_v.shape)
restore_params = tf.group(*restore_params)
# gaussian KL divergence of the two policies
dkl_diverg = gaussian_DKL(old_mu_ph, old_log_std_ph, p_means, log_std)
# Jacobian of the KL divergence (Needed for the Fisher matrix-vector product)
dkl_diverg_grad = tf.gradients(dkl_diverg, p_variables)
dkl_matrix_product = tf.reduce_sum(flatten_list(dkl_diverg_grad) * p_ph)
print('dkl_matrix_product', dkl_matrix_product.shape)
# Fisher vector product
# The Fisher-vector product is a way to compute the A matrix without the need of the full A
Fx = flatten_list(tf.gradients(dkl_matrix_product, p_variables))
## Step length
beta_ph = tf.placeholder(shape=(), dtype=tf.float32, name='beta')
# NPG update
npg_update = beta_ph * cg_ph
## alpha is found through line search
alpha = tf.Variable(1., trainable=False)
# TRPO update
trpo_update = alpha * npg_update
#################### POLICY UPDATE ###################
# variable used as an index
it_v = tf.Variable(0, trainable=False)
p_opt = []
# Apply the updates to the policy
for p_v in p_variables:
upd_rsh = tf.reshape(trpo_update[it_v : it_v+tf.reduce_prod(p_v.shape)], shape=p_v.shape)
p_opt.append(p_v.assign_sub(upd_rsh))
it_v += tf.reduce_prod(p_v.shape)
p_opt = tf.group(*p_opt)
#########################################################
######################### MODEL #########################
#########################################################
m_opts = []
m_losses = []
nobs_pred_m = []
act_obs = tf.concat([obs_ph, act_ph], 1)
# computational graph of N models
for i in range(num_ensemble_models):
with tf.variable_scope('model_'+str(i)+'_nn'):
nobs_pred = mlp(act_obs, [64, 64], obs_dim[0], tf.nn.relu, last_activation=None)
nobs_pred_m.append(nobs_pred)
m_loss = tf.reduce_mean((nobs_ph - nobs_pred)**2)
m_losses.append(m_loss)
m_opts.append(tf.train.AdamOptimizer(mb_lr).minimize(m_loss))
##################### RESTORE MODEL ######################
initialize_models = []
models_variables = []
for i in range(num_ensemble_models):
m_variables = variables_in_scope('model_'+str(i)+'_nn')
initialize_models.append(restore_model(old_model_variables, m_variables))
models_variables.append(flatten_list(m_variables))
# Time
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
print('Time:', clock_time)
# Set scalars and histograms for TensorBoard
tf.summary.scalar('p_loss', p_loss, collections=['train'])
tf.summary.scalar('v_loss', v_loss, collections=['train'])
tf.summary.scalar('p_divergence', diverg, collections=['train'])
tf.summary.scalar('ratio_new_old',tf.reduce_mean(ratio_new_old), collections=['train'])
tf.summary.scalar('dkl_diverg', dkl_diverg, collections=['train'])
tf.summary.scalar('alpha', alpha, collections=['train'])
tf.summary.scalar('beta', beta_ph, collections=['train'])
tf.summary.scalar('p_std_mn', tf.reduce_mean(tf.exp(log_std)), collections=['train'])
tf.summary.scalar('s_values_mn', tf.reduce_mean(s_values), collections=['train'])
tf.summary.histogram('p_log', p_log, collections=['train'])
tf.summary.histogram('p_means', p_means, collections=['train'])
tf.summary.histogram('s_values', s_values, collections=['train'])
tf.summary.histogram('adv_ph',adv_ph, collections=['train'])
tf.summary.histogram('log_std',log_std, collections=['train'])
scalar_summary = tf.summary.merge_all('train')
tf.summary.scalar('old_v_loss', v_loss, collections=['pre_train'])
tf.summary.scalar('old_p_loss', p_loss, collections=['pre_train'])
pre_scalar_summary = tf.summary.merge_all('pre_train')
hyp_str = '-spe_'+str(steps_per_env)+'-envs_'+str(number_envs)+'-cr_lr'+str(cr_lr)+'-crit_it_'+str(critic_iter)+'-delta_'+str(delta)+'-conj_iters_'+str(conj_iters)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+algorithm+'_'+clock_time+'_'+hyp_str, tf.get_default_graph())
# create a session
sess = tf.Session()
# initialize the variables
sess.run(tf.global_variables_initializer())
def action_op(o):
return sess.run([p_means, s_values], feed_dict={obs_ph:o})
def action_op_noise(o):
return sess.run([a_sampl, s_values], feed_dict={obs_ph:o})
def model_op(o, a, md_idx):
mo = sess.run(nobs_pred_m[md_idx], feed_dict={obs_ph:[o], act_ph:[a]})
return np.squeeze(mo)
def run_model_loss(model_idx, r_obs, r_act, r_nxt_obs):
return sess.run(m_losses[model_idx], feed_dict={obs_ph:r_obs, act_ph:r_act, nobs_ph:r_nxt_obs})
def run_model_opt_loss(model_idx, r_obs, r_act, r_nxt_obs):
return sess.run([m_opts[model_idx], m_losses[model_idx]], feed_dict={obs_ph:r_obs, act_ph:r_act, nobs_ph:r_nxt_obs})
def model_assign(i, model_variables_to_assign):
'''
Update the i-th model's parameters
'''
return sess.run(initialize_models[i], feed_dict={old_model_variables:model_variables_to_assign})
def policy_update(obs_batch, act_batch, adv_batch, rtg_batch):
# log probabilities, logits and log std of the "old" policy
# "old" policy refer to the policy to optimize and that has been used to sample from the environment
old_p_log, old_p_means, old_log_std = sess.run([p_log, p_means, log_std], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch})
# get also the "old" parameters
old_actor_params = sess.run(p_var_flatten)
# old_p_loss is later used in the line search
# run pre_scalar_summary for a summary before the optimization
old_p_loss, summary = sess.run([p_loss,pre_scalar_summary], feed_dict={obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
file_writer.add_summary(summary, step_count)
def H_f(p):
'''
Run the Fisher-Vector product on 'p' to approximate the Hessian of the DKL
'''
return sess.run(Fx, feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, p_ph:p, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch})
g_f = sess.run(p_grads_flatten, feed_dict={old_mu_ph:old_p_means,obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
## Run the conjugate gradient method so as to obtain an approximation of H^(-1)*g
# Where H in reality isn't the true Hessian of the KL divergence but an approximation of it computed via Fisher-Vector Product (F)
conj_grad = conjugate_gradient(H_f, g_f, iters=conj_iters)
# Compute the step length
beta_np = np.sqrt(2*delta / (1e-10 + np.sum(conj_grad * H_f(conj_grad))))
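# beta = sqrt(2*delta / (g^T H^-1 g)) is the largest step length that satisfies the KL constraint
# under the quadratic approximation; the backtracking line search below shrinks it further if needed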
def DKL(alpha_v):
'''
Compute the KL divergence.
It applies the candidate update, computes the DKL and the new loss, and afterwards restores the old parameters.
'''
sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:alpha_v, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log})
a_res = sess.run([dkl_diverg, p_loss], feed_dict={old_mu_ph:old_p_means, old_log_std_ph:old_log_std, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, ret_ph:rtg_batch, old_p_log_ph:old_p_log})
sess.run(restore_params, feed_dict={p_old_variables: old_actor_params})
return a_res
# Actor optimization step
# Different for TRPO or NPG
# Backtracking line search to find the maximum alpha coefficient s.t. the constraint is valid
best_alpha = backtracking_line_search(DKL, delta, old_p_loss, p=0.8)
sess.run(p_opt, feed_dict={beta_ph:beta_np, alpha:best_alpha, cg_ph:conj_grad, obs_ph:obs_batch, act_ph:act_batch, adv_ph:adv_batch, old_p_log_ph:old_p_log})
lb = len(obs_batch)
shuffled_batch = np.arange(lb)
np.random.shuffle(shuffled_batch)
# Value function optimization steps
for _ in range(critic_iter):
# shuffle the batch on every iteration
np.random.shuffle(shuffled_batch)
for idx in range(0,lb, minibatch_size):
minib = shuffled_batch[idx:min(idx+minibatch_size,lb)]
sess.run(v_opt, feed_dict={obs_ph:obs_batch[minib], ret_ph:rtg_batch[minib]})
def train_model(tr_obs, tr_act, tr_nxt_obs, v_obs, v_act, v_nxt_obs, step_count, model_idx):
# Get validation loss on the old model
mb_valid_loss1 = run_model_loss(model_idx, v_obs, v_act, v_nxt_obs)
# Restore the random weights to have a new, clean neural network
model_assign(model_idx, initial_variables_models[model_idx])
mb_valid_loss = run_model_loss(model_idx, v_obs, v_act, v_nxt_obs)
acc_m_losses = []
last_m_losses = []
md_params = sess.run(models_variables[model_idx])
best_mb = {'iter':0, 'loss':mb_valid_loss, 'params':md_params}
it = 0
lb = len(tr_obs)
shuffled_batch = np.arange(lb)
np.random.shuffle(shuffled_batch)
while best_mb['iter'] > it - model_iter:
# update the model on each mini-batch
last_m_losses = []
for idx in range(0, lb, model_batch_size):
minib = shuffled_batch[idx:min(idx+minibatch_size,lb)]
if len(minib) != minibatch_size:
_, ml = run_model_opt_loss(model_idx, tr_obs[minib], tr_act[minib], tr_nxt_obs[minib])
acc_m_losses.append(ml)
last_m_losses.append(ml)
else:
print('Warning!')
# Check if the loss on the validation set has improved
mb_valid_loss = run_model_loss(model_idx, v_obs, v_act, v_nxt_obs)
if mb_valid_loss < best_mb['loss']:
best_mb['loss'] = mb_valid_loss
best_mb['iter'] = it
best_mb['params'] = sess.run(models_variables[model_idx])
it += 1
# Restore the model with the lower validation loss
model_assign(model_idx, best_mb['params'])
print('Model:{}, iter:{} -- Old Val loss:{:.6f} New Val loss:{:.6f} -- New Train loss:{:.6f}'.format(model_idx, it, mb_valid_loss1, best_mb['loss'], np.mean(last_m_losses)))
summary = tf.Summary()
summary.value.add(tag='supplementary/m_loss', simple_value=np.mean(acc_m_losses))
summary.value.add(tag='supplementary/iterations', simple_value=it)
file_writer.add_summary(summary, step_count)
file_writer.flush()
# variable to store the total number of steps
step_count = 0
model_buffer = FullBuffer()
print('Env batch size:',steps_per_env, ' Batch size:',steps_per_env*number_envs)
# Create a simulated environment
sim_env = NetworkEnv(gym.make(env_name), model_op, pendulum_reward, pendulum_done, num_ensemble_models)
# Get the initial parameters of each model
# These are used in later epochs to re-train the models from scratch on the new dataset
initial_variables_models = []
for model_var in models_variables:
initial_variables_models.append(sess.run(model_var))
for ep in range(num_epochs):
# lists to store rewards and length of the trajectories completed
batch_rew = []
batch_len = []
print('============================', ep, '============================')
# Run the environments serially, temporarily storing the trajectories.
for env in envs:
init_log_std = np.ones(act_dim) * np.log(np.random.rand()*1)
env.reset()
# iterate over a fixed number of steps
for _ in range(steps_per_env):
# run the policy
if ep == 0:
# Sample random action during the first epoch
act = env.action_space.sample()
else:
act = sess.run(a_sampl, feed_dict={obs_ph:[env.n_obs], log_std:init_log_std})
act = np.squeeze(act)
# take a step in the environment
obs2, rew, done, _ = env.step(np.array([act]))
# add the new transition to the temporary buffer
model_buffer.store(env.n_obs.copy(), act, rew, obs2.copy(), done)
env.n_obs = obs2.copy()
step_count += 1
if done:
batch_rew.append(env.get_episode_reward())
batch_len.append(env.get_episode_length())
env.reset()
init_log_std = np.ones(act_dim) * np.log(np.random.rand()*1)
print('Ep:%d Rew:%.2f -- Step:%d' % (ep, np.mean(batch_rew), step_count))
############################################################
###################### MODEL LEARNING ######################
############################################################
# Initialize randomly a training and validation set
model_buffer.generate_random_dataset()
# get both datasets
train_obs, train_act, _, train_nxt_obs, _ = model_buffer.get_training_batch()
valid_obs, valid_act, _, valid_nxt_obs, _ = model_buffer.get_valid_batch()
print('Log Std policy:', sess.run(log_std))
for i in range(num_ensemble_models):
# train the dynamic model on the datasets just sampled
train_model(train_obs, train_act, train_nxt_obs, valid_obs, valid_act, valid_nxt_obs, step_count, i)
############################################################
###################### POLICY LEARNING ######################
############################################################
best_sim_test = np.zeros(num_ensemble_models)
for it in range(80):
print('\t Policy it', it, end='.. ')
##################### MODEL SIMULATION #####################
obs_batch, act_batch, adv_batch, rtg_batch = simulate_environment(sim_env, action_op_noise, simulated_steps)
################# TRPO UPDATE ################
policy_update(obs_batch, act_batch, adv_batch, rtg_batch)
# Testing the policy on a real environment
mn_test = test_agent(env_test, action_op, num_games=10)[0]
print(' Test score: ', np.round(mn_test, 2))
summary = tf.Summary()
summary.value.add(tag='test/performance', simple_value=mn_test)
file_writer.add_summary(summary, step_count)
file_writer.flush()
# Test the policy on simulated environment.
if (it+1) % 5 == 0:
print('Simulated test:', end=' -- ')
sim_rewards = []
for i in range(num_ensemble_models):
sim_m_env = NetworkEnv(gym.make(env_name), model_op, pendulum_reward, pendulum_done, i+1)
mn_sim_rew, _ = test_agent(sim_m_env, action_op, num_games=5)
sim_rewards.append(mn_sim_rew)
print(mn_sim_rew, end=' -- ')
print("")
sim_rewards = np.array(sim_rewards)
# stop training if the policy hasn't improved
if (np.sum(best_sim_test >= sim_rewards) > int(num_ensemble_models*0.7)) \
or (len(sim_rewards[sim_rewards >= 990]) > int(num_ensemble_models*0.7)):
break
else:
best_sim_test = sim_rewards
# closing environments..
for env in envs:
env.close()
file_writer.close()
if __name__ == '__main__':
METRPO('RoboschoolInvertedPendulum-v1', hidden_sizes=[32,32], cr_lr=1e-3, gamma=0.99, lam=0.95, num_epochs=7, steps_per_env=300,
number_envs=1, critic_iter=10, delta=0.01, algorithm='TRPO', conj_iters=10, minibatch_size=5000,
mb_lr=0.00001, model_batch_size=50, simulated_steps=50000, num_ensemble_models=10, model_iter=15)
================================================
FILE: Chapter10/DAgger.py
================================================
import numpy as np
import tensorflow as tf
from datetime import datetime
import time
from ple.games.flappybird import FlappyBird
from ple import PLE
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def flappy_to_list(fd):
'''
Return the state dictionary as a list
'''
return fd['player_y'], fd['player_vel'], fd['next_pipe_dist_to_player'], fd['next_pipe_top_y'], \
fd['next_pipe_bottom_y'], fd['next_next_pipe_dist_to_player'], fd['next_next_pipe_top_y'], \
fd['next_next_pipe_bottom_y']
def flappy_game_state(bol):
'''
Normalize the game state
'''
stat = flappy_to_list(bol.getGameState())
stat = (np.array(stat, dtype=np.float32) / 300.0) - 0.5
return stat
def no_op(env, n_act=5):
for _ in range(n_act):
env.act(119 if np.random.randn() < 0.5 else None)
def expert():
'''
Load the computational graph and pretrained weights of the expert
'''
graph = tf.get_default_graph()
sess_expert = tf.Session(graph=graph)
saver = tf.train.import_meta_graph('expert/model.ckpt.meta')
saver.restore(sess_expert,tf.train.latest_checkpoint('expert/'))
p_argmax = graph.get_tensor_by_name('actor_nn/max_act:0')
obs_ph = graph.get_tensor_by_name('obs:0')
def expert_policy(state):
act = sess_expert.run(p_argmax, feed_dict={obs_ph:[state]})
return np.squeeze(act)
return expert_policy
def test_agent(policy, file_writer=None, test_games=10, step=0):
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)
env.init()
test_rewards = []
for _ in range(test_games):
env.reset_game()
no_op(env)
game_rew = 0
while not env.game_over():
state = flappy_game_state(env)
action = 119 if policy(state) == 1 else None
for _ in range(2):
game_rew += env.act(action)
test_rewards.append(game_rew)
if file_writer is not None:
summary = tf.Summary()
summary.value.add(tag='test_performance', simple_value=game_rew)
file_writer.add_summary(summary, step)
file_writer.flush()
return test_rewards
def DAgger(hidden_sizes=[32,32], dagger_iterations=20, p_lr=1e-3, step_iterations=1000, batch_size=128, train_epochs=20, obs_dim=8, act_dim=2):
tf.reset_default_graph()
############################## EXPERT ###############################
# load the expert and return a function that predict the expert action given a state
expert_policy = expert()
print('Expert performance: ', np.mean(test_agent(expert_policy)))
#################### LEARNER COMPUTATIONAL GRAPH ####################
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs')
act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
# Multi-layer perceptron
p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=None)
act_max = tf.math.argmax(p_logits, axis=1)
act_onehot = tf.one_hot(act_ph, depth=act_dim)
# softmax cross entropy loss
p_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=act_onehot, logits=p_logits))
# Adam optimizer
p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss)
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
file_writer = tf.summary.FileWriter('log_dir/FlappyBird/DAgger_'+clock_time, tf.get_default_graph())
sess = tf.Session()
sess.run(tf.global_variables_initializer())
def learner_policy(state):
action = sess.run(act_max, feed_dict={obs_ph:[state]})
return np.squeeze(action)
X = []
y = []
env = FlappyBird()
env = PLE(env, fps=30, display_screen=False)
env.init()
#################### DAgger iterations ####################
for it in range(dagger_iterations):
sess.run(tf.global_variables_initializer())
env.reset_game()
no_op(env)
game_rew = 0
rewards = []
###################### Populate the dataset #####################
for _ in range(step_iterations):
# get the current state from the environment
state = flappy_game_state(env)
# As the iterations continue, use more and more actions sampled from the learner
if np.random.rand() < (1 - it/5):
action = expert_policy(state)
else:
action = learner_policy(state)
action = 119 if action == 1 else None
rew = env.act(action)
rew += env.act(action)
# Add the state and the expert action to the dataset
X.append(state)
y.append(expert_policy(state))
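# Key DAgger step: the dataset is always labeled with the expert's action, even when the executed
# action came from the learner, so the learner is supervised by the expert on states drawn from its own distribution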
game_rew += rew
# Whenever the game stops, reset the environment and reinitialize the variables
if env.game_over():
env.reset_game()
no_op(env)
rewards.append(game_rew)
game_rew = 0
##################### Training #####################
# Calculate the number of minibatches
n_batches = int(np.floor(len(X)/batch_size))
# shuffle the dataset
shuffle = np.arange(len(X))
np.random.shuffle(shuffle)
shuffled_X = np.array(X)[shuffle]
shuffled_y = np.array(y)[shuffle]
for _ in range(train_epochs):
ep_loss = []
# Train the model on each minibatch in the dataset
for b in range(n_batches):
p_start = b*batch_size
# mini-batch training
tr_loss, _ = sess.run([p_loss, p_opt], feed_dict={
obs_ph:shuffled_X[p_start:p_start+batch_size],
act_ph:shuffled_y[p_start:p_start+batch_size]})
ep_loss.append(tr_loss)
agent_tests = test_agent(learner_policy, file_writer, step=len(X))
print('Ep:', it, np.mean(ep_loss), 'Test:', np.mean(agent_tests))
if __name__ == "__main__":
DAgger(hidden_sizes=[16,16], dagger_iterations=10, p_lr=1e-4, step_iterations=100, batch_size=50, train_epochs=2000)
================================================
FILE: Chapter10/expert/checkpoint
================================================
model_checkpoint_path: "model.ckpt"
all_model_checkpoint_paths: "model.ckpt"
================================================
FILE: Chapter11/ES.py
================================================
import numpy as np
import tensorflow as tf
from datetime import datetime
import time
import gym
import multiprocessing as mp
import scipy.stats as ss
import contextlib
import numpy as np
@contextlib.contextmanager
def temp_seed(seed):
state = np.random.get_state()
np.random.seed(seed)
try:
yield
finally:
np.random.set_state(state)
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
def test_agent(env_test, agent_op, num_games=1):
'''
Test an agent 'agent_op', 'num_games' times
Return the list of episode rewards and the total number of steps
'''
games_r = []
steps = 0
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
a_s = agent_op(o)
o, r, d, _ = env_test.step(a_s)
game_r += r
steps += 1
games_r.append(game_r)
return games_r, steps
def worker(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, worker_name, params_queue, output_queue):
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
import tensorflow as tf
# set an initial seed common to all the workers
tf.random.set_random_seed(initial_seed)
np.random.seed(initial_seed)
with tf.device("/cpu:" + worker_name):
obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs_ph')
new_weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='new_weights_ph')
def variables_in_scope(scope):
# get all trainable variables in 'scope'
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
with tf.variable_scope('nn_' + worker_name):
acts = mlp(obs_ph, hidden_sizes, act_dim, tf.tanh, last_activation=tf.tanh)
agent_variables = variables_in_scope('nn_' + worker_name)
agent_variables_flatten = flatten_list(agent_variables)
# Update the agent parameters with new weights new_weights_ph
it_v1 = tf.Variable(0, trainable=False)
update_weights = []
for a_v in agent_variables:
upd_rsh = tf.reshape(new_weights_ph[it_v1 : it_v1+tf.reduce_prod(a_v.shape)], shape=a_v.shape)
update_weights.append(a_v.assign(upd_rsh))
it_v1 += tf.reduce_prod(a_v.shape)
# Reshape the new_weights_ph following the neural network shape
it_v2 = tf.Variable(0, trainable=False)
vars_grads_list = []
for a_v in agent_variables:
vars_grads_list.append(tf.reshape(new_weights_ph[it_v2 : it_v2+tf.reduce_prod(a_v.shape)], shape=a_v.shape))
it_v2 += tf.reduce_prod(a_v.shape)
# Create the optimizer
opt = tf.train.AdamOptimizer(lr)
# Apply the "gradients" using Adam
apply_g = opt.apply_gradients([(g, v) for g, v in zip(vars_grads_list, agent_variables)])
def agent_op(o):
a = np.squeeze(sess.run(acts, feed_dict={obs_ph:[o]}))
return np.clip(a, env.action_space.low, env.action_space.high)
def evaluation_on_noise(noise):
'''
Evaluate the agent with the noise
'''
# Get the original weights that will be restored after the evaluation
original_weights = sess.run(agent_variables_flatten)
# Update the weights of the agent/individual by adding the extra noise noise*STD_NOISE
sess.run(update_weights, feed_dict={new_weights_ph:original_weights + noise*std_noise})
# Test the agent with the new weights
rewards, steps = test_agent(env, agent_op)
# Restore the original weights
sess.run(update_weights, feed_dict={new_weights_ph:original_weights})
return np.mean(rewards), steps
config_proto = tf.ConfigProto(device_count={'CPU': 4}, allow_soft_placement=True)
sess = tf.Session(config=config_proto)
sess.run(tf.global_variables_initializer())
agent_flatten_shape = sess.run(agent_variables_flatten).shape
while True:
for _ in range(indiv_per_worker):
seed = np.random.randint(1e7)
with temp_seed(seed):
# sample, for each weight of the agent, from a normal distribution
sampled_noise = np.random.normal(size=agent_flatten_shape)
# Mirrored sampling
pos_rew, stp1 = evaluation_on_noise(sampled_noise)
neg_rew, stp2 = evaluation_on_noise(-sampled_noise)
# Put the returns and seeds on the queue
# Note that here we are just sending the seed (a scalar value), not the complete perturbation sampled_noise
output_queue.put([[pos_rew, neg_rew], seed, stp1+stp2])
# Get all the returns and seed from each other worker
batch_return, batch_seed = params_queue.get()
batch_noise = []
for seed in batch_seed:
# reconstruct the perturbations from the seed
with temp_seed(seed):
sampled_noise = np.random.normal(size=agent_flatten_shape)
batch_noise.append(sampled_noise)
batch_noise.append(-sampled_noise)
# Compute the stochastic gradient estimate
vars_grads = np.zeros(agent_flatten_shape)
for n, r in zip(batch_noise, batch_return):
vars_grads += n * r
vars_grads /= len(batch_noise) * std_noise
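# This is the ES gradient estimate: grad ~= 1/(N*sigma) * sum_i R_i * eps_i, where the eps_i are the
# (mirrored) perturbations reconstructed from the seeds and the R_i are the rank-normalized returns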
# run Adam optimization on the estimated gradient just computed
sess.run(apply_g, feed_dict={new_weights_ph:-vars_grads})
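# Note the minus sign: apply_gradients performs a descent step (theta <- theta - lr*grad),
# so feeding -vars_grads turns it into an ascent step on the estimated return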
def normalized_rank(rewards):
'''
Rank the rewards and normalize them.
'''
ranked = ss.rankdata(rewards)
norm = (ranked - 1) / (len(ranked) - 1)
norm -= 0.5
return norm
def flatten(tensor):
'''
Flatten a tensor
'''
return tf.reshape(tensor, shape=(-1,))
def flatten_list(tensor_list):
'''
Flatten a list of tensors
'''
return tf.concat([flatten(t) for t in tensor_list], axis=0)
def ES(env_name, hidden_sizes=[8,8], number_iter=1000, num_workers=4, lr=0.01, indiv_per_worker=10, std_noise=0.01):
initial_seed = np.random.randint(1e7)
# Create a queue for the output values (single returns and seeds values)
output_queue = mp.Queue(maxsize=num_workers*indiv_per_worker)
# Create a queue for the input parameters (batch returns and batch seeds)
params_queue = mp.Queue(maxsize=num_workers)
now = datetime.now()
clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
hyp_str = '-numworkers_'+str(num_workers)+'-lr_'+str(lr)
file_writer = tf.summary.FileWriter('log_dir/'+env_name+'/'+clock_time+'_'+hyp_str, tf.get_default_graph())
processes = []
# Create a parallel process for each worker
for widx in range(num_workers):
p = mp.Process(target=worker, args=(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_per_worker, str(widx), params_queue, output_queue))
p.start()
processes.append(p)
tot_steps = 0
# Iterate over all the training iterations
for n_iter in range(number_iter):
batch_seed = []
batch_return = []
# Wait until enough candidate individuals are evaluated
for _ in range(num_workers*indiv_per_worker):
p_rews, p_seed, p_steps = output_queue.get()
batch_seed.append(p_seed)
batch_return.extend(p_rews)
tot_steps += p_steps
print('Iter: {} Reward: {:.2f}'.format(n_iter, np.mean(batch_return)))
# Let's save the population's performance
summary = tf.Summary()
for r in batch_return:
summary.value.add(tag='performance', simple_value=r)
file_writer.add_summary(summary, tot_steps)
file_writer.flush()
# Rank and normalize the returns
batch_return = normalized_rank(batch_return)
# Put on the queue all the returns and seed so that each worker can optimize the neural network
for _ in range(num_workers):
params_queue.put([batch_return, batch_seed])
# terminate all workers
for p in processes:
p.terminate()
if __name__ == '__main__':
ES('LunarLanderContinuous-v2', hidden_sizes=[32,32], number_iter=200, num_workers=4, lr=0.02, indiv_per_worker=12, std_noise=0.05)
================================================
FILE: Chapter12/ESBAS.py
================================================
import numpy as np
import tensorflow as tf
import gym
from datetime import datetime
from collections import deque
import time
import sys
gym.logger.set_level(40)
current_milli_time = lambda: int(round(time.time() * 1000))
def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activation=None):
'''
Multi-layer perceptron
'''
for l in hidden_layers:
x = tf.layers.dense(x, units=l, activation=activation)
return tf.layers.dense(x, units=output_layer, activation=last_activation)
class ExperienceBuffer():
'''
Experience Replay Buffer
'''
def __init__(self, buffer_size):
self.obs_buf = deque(maxlen=buffer_size)
self.rew_buf = deque(maxlen=buffer_size)
self.act_buf = deque(maxlen=buffer_size)
self.obs2_buf = deque(maxlen=buffer_size)
self.done_buf = deque(maxlen=buffer_size)
def add(self, obs, rew, act, obs2, done):
# Add a new transition to the buffers
self.obs_buf.append(obs)
self.rew_buf.append(rew)
self.act_buf.append(act)
self.obs2_buf.append(obs2)
self.done_buf.append(done)
def sample_minibatch(self, batch_size):
# Sample a minibatch of size batch_size
mb_indices = np.random.randint(len(self.obs_buf), size=batch_size)
mb_obs = [self.obs_buf[i] for i in mb_indices]
mb_rew = [self.rew_buf[i] for i in mb_indices]
mb_act = [self.act_buf[i] for i in mb_indices]
mb_obs2 = [self.obs2_buf[i] for i in mb_indices]
mb_done = [self.done_buf[i] for i in mb_indices]
return mb_obs, mb_rew, mb_act, mb_obs2, mb_done
def __len__(self):
return len(self.obs_buf)
def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
'''
Calculate the target value y for each transition
'''
max_av = np.max(av, axis=1)
# if the episode terminates, y takes the value r
# otherwise, q-learning step
ys = []
for r, d, av in zip(mini_batch_rw, mini_batch_done, max_av):
if d:
ys.append(r)
else:
q_step = r + discounted_value * av
ys.append(q_step)
assert len(ys) == len(mini_batch_rw)
return ys
def greedy(action_values):
'''
Greedy policy
'''
return np.argmax(action_values)
def eps_greedy(action_values, eps=0.1):
'''
Eps-greedy policy
'''
if np.random.uniform(0,1) < eps:
# Choose a uniform random action
return np.random.randint(len(action_values))
else:
# Choose the greedy action
return np.argmax(action_values)
def test_agent(env_test, agent_op, num_games=20, summary=None):
'''
Test an agent
'''
games_r = []
for _ in range(num_games):
d = False
game_r = 0
o = env_test.reset()
while not d:
a = greedy(np.squeeze(agent_op(o)))
o, r, d, _ = env_test.step(a)
game_r += r
if summary is not None:
summary.value.add(tag='test_performance', simple_value=game_r)
games_r.append(game_r)
return games_r
class DQN_optimization:
def __init__(self, obs_dim, act_dim, hidden_layers, lr, discount):
self.obs_dim = obs_dim
self.act_dim = act_dim
self.hidden_layers = hidden_layers
self.lr = lr
self.discount = discount
self.__build_graph()
def __build_graph(self):
self.g = tf.Graph()
with self.g.as_default():
# Create all the placeholders
self.obs_ph = tf.placeholder(shape=(None, self.obs_dim[0]), dtype=tf.float32, name='obs')
self.act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')
self.y_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
# Create the target network
with tf.variable_scope('target_network'):
self.target_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None)
target_vars = tf.trainable_variables()
# Create the online network (i.e. the behavior policy)
with tf.variable_scope('online_network'):
self.online_qv = mlp(self.obs_ph, self.hidden_layers, self.act_dim, tf.nn.relu, last_activation=None)
train_vars = tf.trainable_variables()
# Update the target network by assigning to it the variables of the online network
# Note that the target network and the online network have the same exact architecture
update_target = [train_vars[i].assign(train_vars[i+len(target_vars)]) for i in range(len(train_vars) - len(target_vars))]
self.update_target_op = tf.group(*update_target)
# One hot encoding of the action
act_onehot = tf.one_hot(self.act_ph, depth=self.act_dim)
# We are interested only in the Q-values of those actions
q_values = tf.reduce_sum(act_onehot * self.online_qv, axis=1)
# MSE loss function
self.v_loss = tf.reduce_mean((self.y_ph - q_values)**2)
# Adam optimizer that minimizes the loss v_loss
self.v_op
================================================
SYMBOL INDEX (232 symbols across 17 files)
================================================
FILE: Chapter03/frozenlake8x8_policyiteration.py
function eval_state_action (line 4) | def eval_state_action(V, s, a, gamma=0.99):
function policy_evaluation (line 7) | def policy_evaluation(V, policy, eps=0.0001):
function policy_improvement (line 23) | def policy_improvement(V, policy):
function run_episodes (line 38) | def run_episodes(env, policy, num_games=100):
FILE: Chapter03/frozenlake8x8_valueiteration.py
function eval_state_action (line 4) | def eval_state_action(V, s, a, gamma=0.99):
function value_iteration (line 7) | def value_iteration(eps=0.0001):
function run_episodes (line 30) | def run_episodes(env, V, num_games=100):
FILE: Chapter04/SARSA Q_learning Taxi-v2.py
function eps_greedy (line 5) | def eps_greedy(Q, s, eps=0.1):
function greedy (line 17) | def greedy(Q, s):
function run_episodes (line 26) | def run_episodes(env, Q, num_episodes=100, to_print=False):
function Q_learning (line 52) | def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, ep...
function SARSA (line 95) | def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_dec...
FILE: Chapter05/DQN_Atari.py
function cnn (line 16) | def cnn(x):
function fnn (line 25) | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_acti...
function qnet (line 33) | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_...
class ExperienceBuffer (line 43) | class ExperienceBuffer():
method __init__ (line 47) | def __init__(self, buffer_size):
method add (line 55) | def add(self, obs, rew, act, obs2, done):
method sample_minibatch (line 64) | def sample_minibatch(self, batch_size):
method __len__ (line 76) | def __len__(self):
function q_target_values (line 80) | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
function greedy (line 100) | def greedy(action_values):
function eps_greedy (line 106) | def eps_greedy(action_values, eps=0.1):
function test_agent (line 117) | def test_agent(env_test, agent_op, num_games=20):
function scale_frames (line 141) | def scale_frames(frames):
function DQN (line 147) | def DQN(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_si...
FILE: Chapter05/DQN_variations_Atari.py
function cnn (line 17) | def cnn(x):
function fnn (line 26) | def fnn(x, hidden_layers, output_layer, activation=tf.nn.relu, last_acti...
function qnet (line 34) | def qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.relu, last_...
function greedy (line 43) | def greedy(action_values):
function eps_greedy (line 49) | def eps_greedy(action_values, eps=0.1):
function q_target_values (line 60) | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
function test_agent (line 80) | def test_agent(env_test, agent_op, num_games=20):
function scale_frames (line 104) | def scale_frames(frames):
function dueling_qnet (line 111) | def dueling_qnet(x, hidden_layers, output_size, fnn_activation=tf.nn.rel...
function double_q_target_values (line 123) | def double_q_target_values(mini_batch_rw, mini_batch_done, target_qv, on...
class MultiStepExperienceBuffer (line 145) | class MultiStepExperienceBuffer():
method __init__ (line 149) | def __init__(self, buffer_size, n_step, gamma):
method add (line 162) | def add(self, obs, rew, act, obs2, done):
method sample_minibatch (line 198) | def sample_minibatch(self, batch_size):
method __len__ (line 211) | def __len__(self):
function DQN_with_variations (line 214) | def DQN_with_variations(env_name, extensions_hyp, hidden_sizes=[32], lr=...
FILE: Chapter05/atari_wrappers.py
class NoopResetEnv (line 12) | class NoopResetEnv(gym.Wrapper):
method __init__ (line 13) | def __init__(self, env, noop_max=30):
method reset (line 23) | def reset(self, **kwargs):
method step (line 38) | def step(self, ac):
class LazyFrames (line 41) | class LazyFrames(object):
method __init__ (line 42) | def __init__(self, frames):
method _force (line 51) | def _force(self):
method __array__ (line 57) | def __array__(self, dtype=None):
method __len__ (line 63) | def __len__(self):
method __getitem__ (line 66) | def __getitem__(self, i):
class FireResetEnv (line 69) | class FireResetEnv(gym.Wrapper):
method __init__ (line 70) | def __init__(self, env):
method reset (line 76) | def reset(self, **kwargs):
method step (line 86) | def step(self, ac):
class MaxAndSkipEnv (line 90) | class MaxAndSkipEnv(gym.Wrapper):
method __init__ (line 91) | def __init__(self, env, skip=4):
method step (line 98) | def step(self, action):
method reset (line 115) | def reset(self, **kwargs):
class WarpFrame (line 120) | class WarpFrame(gym.ObservationWrapper):
method __init__ (line 121) | def __init__(self, env):
method observation (line 129) | def observation(self, frame):
class FrameStack (line 136) | class FrameStack(gym.Wrapper):
method __init__ (line 137) | def __init__(self, env, k):
method reset (line 149) | def reset(self):
method step (line 155) | def step(self, action):
method _get_ob (line 160) | def _get_ob(self):
class ScaledFloatFrame (line 165) | class ScaledFloatFrame(gym.ObservationWrapper):
method __init__ (line 166) | def __init__(self, env):
method observation (line 170) | def observation(self, observation):
function make_env (line 176) | def make_env(env_name, fire=True, frames_num=2, noop_num=30, skip_frames...
FILE: Chapter06/AC.py
function mlp (line 8) | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activ...
function softmax_entropy (line 16) | def softmax_entropy(logits):
function discounted_rewards (line 22) | def discounted_rewards(rews, last_sv, gamma):
class Buffer (line 38) | class Buffer():
method __init__ (line 42) | def __init__(self, gamma=0.99):
method store (line 49) | def store(self, temp_traj, last_sv):
method get_batch (line 66) | def get_batch(self):
method __len__ (line 69) | def __len__(self):
function AC (line 73) | def AC(env_name, hidden_sizes=[32], ac_lr=5e-3, cr_lr=8e-3, num_epochs=5...
FILE: Chapter06/REINFORCE.py
function mlp (line 8) | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activ...
function softmax_entropy (line 16) | def softmax_entropy(logits):
function discounted_rewards (line 23) | def discounted_rewards(rews, gamma):
class Buffer (line 38) | class Buffer():
method __init__ (line 42) | def __init__(self, gamma=0.99):
method store (line 48) | def store(self, temp_traj):
method get_batch (line 63) | def get_batch(self):
method __len__ (line 67) | def __len__(self):
function REINFORCE (line 72) | def REINFORCE(env_name, hidden_sizes=[32], lr=5e-3, num_epochs=50, gamma...
FILE: Chapter06/REINFORCE_baseline.py
function mlp (line 8) | def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activ...
function softmax_entropy (line 16) | def softmax_entropy(logits):
function discounted_rewards (line 23) | def discounted_rewards(rews, gamma):
class Buffer (line 38) | class Buffer():
method __init__ (line 42) | def __init__(self, gamma=0.99):
method store (line 49) | def store(self, temp_traj):
method get_batch (line 66) | def get_batch(self):
method __len__ (line 70) | def __len__(self):
function REINFORCE_baseline (line 75) | def REINFORCE_baseline(env_name, hidden_sizes=[32], p_lr=5e-3, vf_lr=8e-...
FILE: Chapter07/PPO.py
function mlp (line 8) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
function softmax_entropy (line 16) | def softmax_entropy(logits):
function clipped_surrogate_obj (line 22) | def clipped_surrogate_obj(new_p, old_p, adv, eps):
function GAE (line 29) | def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
function discounted_rewards (line 39) | def discounted_rewards(rews, last_sv, gamma):
class StructEnv (line 56) | class StructEnv(gym.Wrapper):
method __init__ (line 60) | def __init__(self, env):
method reset (line 66) | def reset(self, **kwargs):
method step (line 72) | def step(self, action):
method get_episode_reward (line 78) | def get_episode_reward(self):
method get_episode_length (line 81) | def get_episode_length(self):
class Buffer (line 84) | class Buffer():
method __init__ (line 88) | def __init__(self, gamma=0.99, lam=0.95):
method store (line 96) | def store(self, temp_traj, last_sv):
method get_batch (line 113) | def get_batch(self):
method __len__ (line 118) | def __len__(self):
function gaussian_log_likelihood (line 122) | def gaussian_log_likelihood(x, mean, log_std):
function PPO (line 129) | def PPO(env_name, hidden_sizes=[32], cr_lr=5e-3, ac_lr=5e-3, num_epochs=...
FILE: Chapter07/TRPO.py
function mlp (line 7) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
function softmax_entropy (line 15) | def softmax_entropy(logits):
function gaussian_log_likelihood (line 22) | def gaussian_log_likelihood(ac, mean, log_std):
function conjugate_gradient (line 30) | def conjugate_gradient(A, b, x=None, iters=10):
function gaussian_DKL (line 52) | def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p):
function backtracking_line_search (line 59) | def backtracking_line_search(Dkl, delta, old_loss, p=0.8):
function GAE (line 81) | def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
function discounted_rewards (line 91) | def discounted_rewards(rews, last_sv, gamma):
class Buffer (line 107) | class Buffer():
method __init__ (line 111) | def __init__(self, gamma=0.99, lam=0.95):
method store (line 119) | def store(self, temp_traj, last_sv):
method get_batch (line 136) | def get_batch(self):
method __len__ (line 141) | def __len__(self):
function flatten_list (line 145) | def flatten_list(tensor_list):
function flatten (line 151) | def flatten(tensor):
class StructEnv (line 158) | class StructEnv(gym.Wrapper):
method __init__ (line 162) | def __init__(self, env):
method reset (line 168) | def reset(self, **kwargs):
method step (line 174) | def step(self, action):
method get_episode_reward (line 180) | def get_episode_reward(self):
method get_episode_length (line 183) | def get_episode_length(self):
function TRPO (line 187) | def TRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma=0...
FILE: Chapter08/DDPG.py
function mlp (line 10) | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_acti...
function deterministic_actor_critic (line 18) | def deterministic_actor_critic(x, a, hidden_sizes, act_dim, max_act):
class ExperiencedBuffer (line 36) | class ExperiencedBuffer():
method __init__ (line 40) | def __init__(self, buffer_size):
method add (line 49) | def add(self, obs, rew, act, obs2, done):
method sample_minibatch (line 60) | def sample_minibatch(self, batch_size):
method __len__ (line 74) | def __len__(self):
function test_agent (line 77) | def test_agent(env_test, agent_op, num_games=10):
function DDPG (line 98) | def DDPG(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs...
FILE: Chapter08/TD3.py
function mlp (line 10) | def mlp(x, hidden_layers, output_layer, activation=tf.nn.relu, last_acti...
function deterministic_actor_double_critic (line 19) | def deterministic_actor_double_critic(x, a, hidden_sizes, act_dim, max_a...
class ExperiencedBuffer (line 42) | class ExperiencedBuffer():
method __init__ (line 46) | def __init__(self, buffer_size):
method add (line 55) | def add(self, obs, rew, act, obs2, done):
method sample_minibatch (line 66) | def sample_minibatch(self, batch_size):
method __len__ (line 80) | def __len__(self):
function test_agent (line 83) | def test_agent(env_test, agent_op, num_games=10):
function TD3 (line 107) | def TD3(env_name, hidden_sizes=[32], ac_lr=1e-2, cr_lr=1e-2, num_epochs=...
FILE: Chapter09/ME-TRPO.py
function mlp (line 8) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
function softmax_entropy (line 16) | def softmax_entropy(logits):
function gaussian_log_likelihood (line 22) | def gaussian_log_likelihood(ac, mean, log_std):
function conjugate_gradient (line 29) | def conjugate_gradient(A, b, x=None, iters=10):
function gaussian_DKL (line 51) | def gaussian_DKL(mu_q, log_std_q, mu_p, log_std_p):
function backtracking_line_search (line 57) | def backtracking_line_search(Dkl, delta, old_loss, p=0.8):
function GAE (line 77) | def GAE(rews, v, v_last, gamma=0.99, lam=0.95):
function discounted_rewards (line 87) | def discounted_rewards(rews, last_sv, gamma):
function flatten_list (line 103) | def flatten_list(tensor_list):
function flatten (line 109) | def flatten(tensor):
function test_agent (line 116) | def test_agent(env_test, agent_op, num_games=10):
class Buffer (line 135) | class Buffer():
method __init__ (line 139) | def __init__(self, gamma=0.99, lam=0.95):
method store (line 147) | def store(self, temp_traj, last_sv):
method get_batch (line 164) | def get_batch(self):
method __len__ (line 169) | def __len__(self):
class FullBuffer (line 174) | class FullBuffer():
method __init__ (line 175) | def __init__(self):
method store (line 187) | def store(self, obs, act, rew, nxt_obs, done):
method generate_random_dataset (line 196) | def generate_random_dataset(self):
method get_training_batch (line 203) | def get_training_batch(self):
method get_valid_batch (line 207) | def get_valid_batch(self):
method __len__ (line 210) | def __len__(self):
function simulate_environment (line 216) | def simulate_environment(env, policy, simulated_steps):
class NetworkEnv (line 253) | class NetworkEnv(gym.Wrapper):
method __init__ (line 254) | def __init__(self, env, model_func, reward_func, done_func, number_mod...
method reset (line 262) | def reset(self, **kwargs):
method step (line 268) | def step(self, action):
class StructEnv (line 283) | class StructEnv(gym.Wrapper):
method __init__ (line 287) | def __init__(self, env):
method reset (line 293) | def reset(self, **kwargs):
method step (line 299) | def step(self, action):
method get_episode_reward (line 305) | def get_episode_reward(self):
method get_episode_length (line 308) | def get_episode_length(self):
function pendulum_done (line 311) | def pendulum_done(ob):
function pendulum_reward (line 314) | def pendulum_reward(ob, ac):
function restore_model (line 318) | def restore_model(old_model_variables, m_variables):
function METRPO (line 331) | def METRPO(env_name, hidden_sizes=[32], cr_lr=5e-3, num_epochs=50, gamma...
FILE: Chapter10/DAgger.py
function mlp (line 9) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
function flappy_to_list (line 17) | def flappy_to_list(fd):
function flappy_game_state (line 25) | def flappy_game_state(bol):
function no_op (line 33) | def no_op(env, n_act=5):
function expert (line 38) | def expert():
function test_agent (line 58) | def test_agent(policy, file_writer=None, test_games=10, step=0):
function DAgger (line 90) | def DAgger(hidden_sizes=[32,32], dagger_iterations=20, p_lr=1e-3, step_i...
FILE: Chapter11/ES.py
function temp_seed (line 13) | def temp_seed(seed):
function mlp (line 21) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
function test_agent (line 31) | def test_agent(env_test, agent_op, num_games=1):
function worker (line 53) | def worker(env_name, initial_seed, hidden_sizes, lr, std_noise, indiv_pe...
function normalized_rank (line 173) | def normalized_rank(rewards):
function flatten (line 183) | def flatten(tensor):
function flatten_list (line 189) | def flatten_list(tensor_list):
function ES (line 197) | def ES(env_name, hidden_sizes=[8,8], number_iter=1000, num_workers=4, lr...
FILE: Chapter12/ESBAS.py
function mlp (line 15) | def mlp(x, hidden_layers, output_layer, activation=tf.tanh, last_activat...
class ExperienceBuffer (line 24) | class ExperienceBuffer():
method __init__ (line 28) | def __init__(self, buffer_size):
method add (line 36) | def add(self, obs, rew, act, obs2, done):
method sample_minibatch (line 45) | def sample_minibatch(self, batch_size):
method __len__ (line 57) | def __len__(self):
function q_target_values (line 61) | def q_target_values(mini_batch_rw, mini_batch_done, av, discounted_value):
function greedy (line 80) | def greedy(action_values):
function eps_greedy (line 86) | def eps_greedy(action_values, eps=0.1):
function test_agent (line 97) | def test_agent(env_test, agent_op, num_games=20, summary=None):
class DQN_optimization (line 122) | class DQN_optimization:
method __init__ (line 123) | def __init__(self, obs_dim, act_dim, hidden_layers, lr, discount):
method __build_graph (line 133) | def __build_graph(self):
method __create_session (line 172) | def __create_session(self):
method act (line 179) | def act(self, o):
method optimize (line 185) | def optimize(self, mb_obs, mb_rew, mb_act, mb_obs2, mb_done):
method update_target_network (line 193) | def update_target_network(self):
class UCB1 (line 198) | class UCB1:
method __init__ (line 199) | def __init__(self, algos, epsilon):
method choose_algorithm (line 207) | def choose_algorithm(self):
method update (line 216) | def update(self, idx_algo, traj_return):
function ESBAS (line 224) | def ESBAS(env_name, hidden_sizes=[32], lr=1e-2, num_epochs=2000, buffer_...
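The UCB1 class indexed above (Chapter12/ESBAS.py) is the bandit that picks which DQN configuration to run next in ESBAS. For reference, a minimal generic UCB1 selection rule with the same choose_algorithm/update interface is sketched below; the class name, parameter names, and exploration coefficient are illustrative assumptions, not necessarily the exact implementation in ESBAS.py.

import numpy as np

class GenericUCB1:
    '''Minimal UCB1 bandit over a fixed set of algorithms (illustrative sketch).'''
    def __init__(self, n_algos, exploration_coeff=1.0):
        self.exploration_coeff = exploration_coeff
        self.counts = np.zeros(n_algos)        # times each algorithm has been selected
        self.mean_returns = np.zeros(n_algos)  # running average return of each algorithm
        self.t = 0                             # total number of selections so far

    def choose_algorithm(self):
        # Play every algorithm once before applying the UCB rule
        if np.any(self.counts == 0):
            return int(np.argmin(self.counts))
        ucb = self.mean_returns + np.sqrt(self.exploration_coeff * np.log(self.t) / self.counts)
        return int(np.argmax(ucb))

    def update(self, idx_algo, traj_return):
        self.t += 1
        self.counts[idx_algo] += 1
        # Incremental running-mean update of the observed return
        self.mean_returns[idx_algo] += (traj_return - self.mean_returns[idx_algo]) / self.counts[idx_algo]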