Repository: sherjilozair/dqn Branch: master Commit: ee621e10169b Files: 8 Total size: 17.6 KB Directory structure: gitextract_4u5kapam/ ├── .gitignore ├── LICENSE ├── README.md ├── dqn.py ├── dqn.sh ├── plot.py ├── run.sh └── util.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ #Ipython Notebook .ipynb_checkpoints ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2016 Sherjil Ozair Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # dqn This is a very basic DQN (with experience replay) implementation, which uses OpenAI's gym environment and Keras/Theano neural networks. # Requirements - gym - keras - theano - numpy and all their dependencies. # Usage To run, `python example.py `. It runs `MsPacman-v0` if no env is specified. Uncomment the `env.render()` line to see the game while training, however, this is likely to make training slow. Currently, it assumes that the observation is an image, i.e. a 3d array, which is the case for all Atari games, and other Atari-like environments. # Purpose This is meant to be a very simple implementation, to be used as a starter code. I aimed it to be easy-to-comprehend rather than feature-complete. Pull requests welcome! # References - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf # TODO - Extend to other environemnts. Currently only works for Atari and Atari-like environments where the observation space is a 3D Box. ================================================ FILE: dqn.py ================================================ #!/usr/bin/env python from __future__ import division, print_function, unicode_literals # Handle arguments (before slow imports so --help can be fast) import argparse parser = argparse.ArgumentParser( description="Train a DQN net for Atari games.") # Important hparams parser.add_argument("-g", "--game", type=str, default="Pong") parser.add_argument("-n", "--number-steps", type=int, default=1000000, help="total number of training steps") parser.add_argument("-e", "--explore-steps", type=int, default=100000, help="total number of explorartion steps") parser.add_argument("-c", "--copy-steps", type=int, default=4096, help="number of training steps between copies of online DQN to target DQN") parser.add_argument("-l", "--learn-freq", type=int, default=4, help="number of game steps between each training step") # Irrelevant hparams parser.add_argument("-s", "--save-steps", type=int, default=10000, help="number of training steps between saving checkpoints") parser.add_argument("-r", "--render", action="store_true", default=False, help="render the game during training or testing") parser.add_argument("-t", "--test", action="store_true", default=False, help="test (no learning and minimal epsilon)") parser.add_argument("-v", "--verbosity", action="count", default=1, help="increase output verbosity") parser.add_argument("-j", "--jobid", default="123123", help="SLURM job ID") args = parser.parse_args() from collections import deque import gym import numpy as np import os import tensorflow as tf import sys import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns sns.set() from util import wrap_dqn env = wrap_dqn(gym.make("{}NoFrameskip-v4".format(args.game))) def q_network(net, name, reuse=False): with tf.variable_scope(name, reuse=reuse) as scope: initializer = tf.contrib.layers.variance_scaling_initializer() for n_maps, kernel_size, strides, padding, activation in zip( [32, 64, 64], [(8,8), (4,4), (3,3)], [4, 2, 1], ["SAME"] * 3 , [tf.nn.relu] * 3): net = tf.layers.conv2d(net, filters=n_maps, kernel_size=kernel_size, strides=strides, padding=padding, activation=activation, kernel_initializer=initializer) net = tf.layers.dense(tf.contrib.layers.flatten(net), 256, activation=tf.nn.relu, kernel_initializer=initializer) net = tf.layers.dense(net, env.action_space.n, kernel_initializer=initializer) trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name) return net, trainable_vars # Now for the training operations learning_rate = 1e-4 training_start = 10000 # start training after 10,000 game steps discount_rate = 0.99 batch_size = 64 with tf.variable_scope("train"): X_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4]) X_next_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4]) X_action = tf.placeholder(tf.int32, shape=[None]) X_done = tf.placeholder(tf.float32, shape=[None]) X_rewards = tf.placeholder(tf.float32, shape=[None]) online_q_values, online_vars = q_network(X_state, name="q_networks/online") target_q_values, target_vars = q_network(X_next_state, name="q_networks/online", reuse=True) max_target_q_values = tf.reduce_max(target_q_values, axis=1) target = X_rewards + (1. - X_done) * discount_rate * max_target_q_values q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, env.action_space.n), axis=1) error = tf.abs(q_value - tf.stop_gradient(target)) clipped_error = tf.clip_by_value(error, 0.0, 1.0) linear_error = 2 * (error - clipped_error) loss = tf.reduce_mean(tf.square(clipped_error) + linear_error) global_step = tf.Variable(0, trainable=False, name='global_step') optimizer = tf.train.AdamOptimizer(learning_rate) training_op = optimizer.minimize(loss, global_step=global_step) # We need an operation to copy the online DQN to the target DQN copy_ops = [target_var.assign(online_var) for target_var, online_var in zip(target_vars, online_vars)] copy_online_to_target = tf.group(*copy_ops) init = tf.global_variables_initializer() saver = tf.train.Saver() # Let's implement a simple replay memory replay_memory = deque([], maxlen=10000) def sample_memories(batch_size): indices = np.random.permutation(len(replay_memory))[:batch_size] cols = [[], [], [], [], []] # state, action, reward, next_state, continue for idx in indices: memory = replay_memory[idx] for col, value in zip(cols, memory): col.append(value) cols = [np.array(col) for col in cols] return cols # And on to the epsilon-greedy policy with decaying epsilon eps_min = 0.01 eps_max = 1.0 if not args.test else eps_min def epsilon_greedy(q_values, step): epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step / args.explore_steps) if np.random.rand() < epsilon: return np.random.randint(env.action_space.n) # random action else: return np.argmax(q_values) # optimal action done = True # env needs to be reset # We will keep track of the max Q-Value over time and compute the mean per game loss_val = np.infty game_length = 0 total_max_q = 0 mean_max_q = 0.0 returnn = 0.0 returns = [] steps = [] path = os.path.join(args.jobid, "model") with tf.Session() as sess: if os.path.isfile(path + ".index"): saver.restore(sess, path) else: init.run() copy_online_to_target.run() for step in range(args.number_steps): training_iter = global_step.eval() if done: # game over, start again if args.verbosity > 0: print("Step {}/{} ({:.1f})% Training iters {} " "Loss {:5f} Mean Max-Q {:5f} Return: {:5f}".format( step, args.number_steps, step * 100 / args.number_steps, training_iter, loss_val, mean_max_q, returnn)) sys.stdout.flush() state = env.reset() if args.render: env.render() # Online DQN evaluates what to do q_values = online_q_values.eval(feed_dict={X_state: [state]}) action = epsilon_greedy(q_values, step) # Online DQN plays next_state, reward, done, info = env.step(action) returnn += reward # Let's memorize what happened replay_memory.append((state, action, reward, next_state, done)) state = next_state if args.test: continue # Compute statistics for tracking progress (not shown in the book) total_max_q += q_values.max() game_length += 1 if done: steps.append(step) returns.append(returnn) returnn = 0. mean_max_q = total_max_q / game_length total_max_q = 0.0 game_length = 0 if step < training_start or step % args.learn_freq != 0: continue # only train after warmup period and at regular intervals # Sample memories and train the online DQN X_state_val, X_action_val, X_rewards_val, X_next_state_val, X_done_val = sample_memories(batch_size) _, loss_val = sess.run([training_op, loss], {X_state: X_state_val, X_action: X_action_val, X_rewards: X_rewards_val, X_done: X_done_val, X_next_state: X_next_state_val}) # Regularly copy the online DQN to the target DQN if step % args.copy_steps == 0: copy_online_to_target.run() # And save regularly if step % args.save_steps == 0: saver.save(sess, path) np.save(os.path.join(args.jobid, "{}.npy".format(args.jobid)), np.array((steps, returns))) ================================================ FILE: dqn.sh ================================================ #!/bin/bash source activate tfgpu python dqn.py $@ --jobid=$SLURM_JOB_ID ================================================ FILE: plot.py ================================================ import numpy as np import sys import re import subprocess def get_job_name(jobid): cmd = "sacct --format=\"JobName%30\" -j {}".format(jobid) result = subprocess.check_output(cmd, shell=True) return str(result).split("\\n")[2].strip() jobids = sys.argv[2:] exp_name = [get_job_name(jobid) for jobid in jobids] exp_data = [np.load("{}/{}.npy".format(jobid, jobid)) for jobid in jobids] N = int(sys.argv[1]) import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns sns.set() exp_data = map(lambda d: np.convolve(d, np.ones((N,))/N, mode='valid'), exp_data) plt.figure(figsize=(16, 8)) for data in exp_data: plt.plot(data) plt.xlabel('#episodes') plt.ylabel('returns') plt.legend(exp_name, loc='best') plotpath = "plot_{}.png".format("_".join(jobids)) print ("Saved to {}".format(plotpath)) plt.savefig(plotpath) ================================================ FILE: run.sh ================================================ set -ex sbatch --gres=gpu:1 --time=2:59:00 --mem=4gb --job-name=Pong --account=rpp-bengioy dqn.sh --game=Pong ================================================ FILE: util.py ================================================ import gym import numpy as np from collections import deque from gym import spaces from PIL import Image class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): """Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. """ gym.Wrapper.__init__(self, env) self.noop_max = noop_max self.override_num_noops = None assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def _reset(self): """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset() if self.override_num_noops is not None: noops = self.override_num_noops else: noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 assert noops > 0 obs = None for _ in range(noops): obs, _, done, _ = self.env.step(0) if done: obs = self.env.reset() return obs class FireResetEnv(gym.Wrapper): def __init__(self, env): """Take action on reset for environments that are fixed until firing.""" gym.Wrapper.__init__(self, env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 def _reset(self): self.env.reset() obs, _, done, _ = self.env.step(1) if done: self.env.reset() obs, _, done, _ = self.env.step(2) if done: self.env.reset() return obs class EpisodicLifeEnv(gym.Wrapper): def __init__(self, env): """Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. """ gym.Wrapper.__init__(self, env) self.lives = 0 self.was_real_done = True def _step(self, action): obs, reward, done, info = self.env.step(action) self.was_real_done = done # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() if lives < self.lives and lives > 0: # for Qbert somtimes we stay in lives == 0 condtion for a few frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. done = True self.lives = lives return obs, reward, done, info def _reset(self): """Reset only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. """ if self.was_real_done: obs = self.env.reset() else: # no-op step to advance from terminal/lost life state obs, _, _, _ = self.env.step(0) self.lives = self.env.unwrapped.ale.lives() return obs class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): """Return only every `skip`-th frame""" gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) self._obs_buffer = deque(maxlen=2) self._skip = skip def _step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 done = None for _ in range(self._skip): obs, reward, done, info = self.env.step(action) self._obs_buffer.append(obs) total_reward += reward if done: break max_frame = np.max(np.stack(self._obs_buffer), axis=0) return max_frame, total_reward, done, info def _reset(self): """Clear past frame buffer and init. to first obs. from inner env.""" self._obs_buffer.clear() obs = self.env.reset() self._obs_buffer.append(obs) return obs class ClipRewardEnv(gym.RewardWrapper): def _reward(self, reward): """Bin reward to {+1, 0, -1} by its sign.""" return np.sign(reward) class WarpFrame(gym.ObservationWrapper): def __init__(self, env): """Warp frames to 84x84 as done in the Nature paper and later work.""" gym.ObservationWrapper.__init__(self, env) self.res = 84 self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1)) def _observation(self, obs): frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32')) frame = np.array(Image.fromarray(frame).resize((self.res, self.res), resample=Image.BILINEAR), dtype=np.uint8) return frame.reshape((self.res, self.res, 1)) class FrameStack(gym.Wrapper): def __init__(self, env, k): """Buffer observations and stack across channels (last axis).""" gym.Wrapper.__init__(self, env) self.k = k self.frames = deque([], maxlen=k) shp = env.observation_space.shape assert shp[2] == 1 # can only stack 1-channel frames self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k)) def _reset(self): """Clear buffer and re-fill by duplicating the first observation.""" ob = self.env.reset() for _ in range(self.k): self.frames.append(ob) return self._observation() def _step(self, action): ob, reward, done, info = self.env.step(action) self.frames.append(ob) return self._observation(), reward, done, info def _observation(self): assert len(self.frames) == self.k return np.concatenate(self.frames, axis=2) class ScaledFloatFrame(gym.ObservationWrapper): def _observation(self, obs): # careful! This undoes the memory optimization, use # with smaller replay buffers only. return np.array(obs).astype(np.float32) / 255.0 def wrap_dqn(env): """Apply a common set of wrappers for Atari games.""" assert 'NoFrameskip' in env.spec.id env = EpisodicLifeEnv(env) env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) if 'FIRE' in env.unwrapped.get_action_meanings(): env = FireResetEnv(env) env = WarpFrame(env) env = FrameStack(env, 4) env = ClipRewardEnv(env) env = ScaledFloatFrame(env) return env