[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*,cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n#Ipython Notebook\n.ipynb_checkpoints\n"
  },
  {
    "path": "LICENSE",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2016 Sherjil Ozair\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# dqn\nThis is a very basic DQN (with experience replay) implementation, which uses OpenAI's gym environment and Keras/Theano neural networks. \n\n# Requirements\n- gym\n- keras\n- theano\n- numpy\n\nand all their dependencies.\n\n# Usage\nTo run, `python example.py <env_name>`. It runs `MsPacman-v0` if no env is specified.\nUncomment the `env.render()` line to see the game while training, however, this is likely to make training slow.\n\nCurrently, it assumes that the observation is an image, i.e. a 3d array, which is the case for all Atari games, and other Atari-like environments.\n\n# Purpose\nThis is meant to be a very simple implementation, to be used as a starter code. I aimed it to be easy-to-comprehend rather than feature-complete.\n\nPull requests welcome!\n\n# References\n- https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf\n\n# TODO\n- Extend to other environemnts. Currently only works for Atari and Atari-like environments where the observation space is a 3D Box.\n"
  },
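  {
    "path": "requirements.txt",
    "content": "# Suggested dependencies -- a sketch, not pinned by the original author.\n# The code targets the TensorFlow 1.x graph API (tf.placeholder, tf.layers,\n# tf.contrib) and an older gym release whose wrappers still use the\n# underscore-prefixed _step/_reset methods, so recent releases of these\n# packages will not work without changes.\ngym[atari]\ntensorflow<2\nnumpy\nmatplotlib\nseaborn\nPillow\n"
  },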
  {
    "path": "dqn.py",
    "content": "#!/usr/bin/env python\n\nfrom __future__ import division, print_function, unicode_literals\n\n# Handle arguments (before slow imports so --help can be fast)\nimport argparse\nparser = argparse.ArgumentParser(\n    description=\"Train a DQN net for Atari games.\")\n\n# Important hparams\nparser.add_argument(\"-g\", \"--game\", type=str, default=\"Pong\")\nparser.add_argument(\"-n\", \"--number-steps\", type=int, default=1000000, help=\"total number of training steps\")\nparser.add_argument(\"-e\", \"--explore-steps\", type=int, default=100000, help=\"total number of explorartion steps\")\nparser.add_argument(\"-c\", \"--copy-steps\", type=int, default=4096, help=\"number of training steps between copies of online DQN to target DQN\")\nparser.add_argument(\"-l\", \"--learn-freq\", type=int, default=4, help=\"number of game steps between each training step\")\n\n# Irrelevant hparams\nparser.add_argument(\"-s\", \"--save-steps\", type=int, default=10000, help=\"number of training steps between saving checkpoints\")\nparser.add_argument(\"-r\", \"--render\", action=\"store_true\", default=False, help=\"render the game during training or testing\")\nparser.add_argument(\"-t\", \"--test\", action=\"store_true\", default=False, help=\"test (no learning and minimal epsilon)\")\nparser.add_argument(\"-v\", \"--verbosity\", action=\"count\", default=1, help=\"increase output verbosity\")\nparser.add_argument(\"-j\", \"--jobid\", default=\"123123\", help=\"SLURM job ID\")\n\nargs = parser.parse_args()\n\nfrom collections import deque\nimport gym\nimport numpy as np\nimport os\nimport tensorflow as tf\nimport sys\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set()\n\nfrom util import wrap_dqn\n\nenv = wrap_dqn(gym.make(\"{}NoFrameskip-v4\".format(args.game)))\n\ndef q_network(net, name, reuse=False):\n    with tf.variable_scope(name, reuse=reuse) as scope:\n        initializer = tf.contrib.layers.variance_scaling_initializer()\n        for n_maps, kernel_size, strides, padding, activation in zip(\n                [32, 64, 64], [(8,8), (4,4), (3,3)], [4, 2, 1],\n                [\"SAME\"] * 3 , [tf.nn.relu] * 3):\n            net = tf.layers.conv2d(net, filters=n_maps, kernel_size=kernel_size, strides=strides, \n                padding=padding, activation=activation, kernel_initializer=initializer)\n        net = tf.layers.dense(tf.contrib.layers.flatten(net), 256, activation=tf.nn.relu, kernel_initializer=initializer)\n        net = tf.layers.dense(net, env.action_space.n, kernel_initializer=initializer)\n\n    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)\n    return net, trainable_vars\n\n# Now for the training operations\nlearning_rate = 1e-4\ntraining_start = 10000  # start training after 10,000 game steps\ndiscount_rate = 0.99\nbatch_size = 64\n\nwith tf.variable_scope(\"train\"):\n    X_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4])\n    X_next_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4])\n    X_action = tf.placeholder(tf.int32, shape=[None])\n    X_done = tf.placeholder(tf.float32, shape=[None])\n    X_rewards = tf.placeholder(tf.float32, shape=[None])\n    online_q_values, online_vars = q_network(X_state, name=\"q_networks/online\")\n    target_q_values, target_vars = q_network(X_next_state, name=\"q_networks/online\", reuse=True)\n    max_target_q_values = tf.reduce_max(target_q_values, axis=1)\n    target = X_rewards + (1. 
- X_done) * discount_rate * max_target_q_values\n    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, env.action_space.n), axis=1)\n    error = tf.abs(q_value - tf.stop_gradient(target))\n    clipped_error = tf.clip_by_value(error, 0.0, 1.0)\n    linear_error = 2 * (error - clipped_error)\n    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)\n\n    global_step = tf.Variable(0, trainable=False, name='global_step')\n    optimizer = tf.train.AdamOptimizer(learning_rate)\n    training_op = optimizer.minimize(loss, global_step=global_step)\n\n# We need an operation to copy the online DQN to the target DQN\ncopy_ops = [target_var.assign(online_var)\n            for target_var, online_var in zip(target_vars, online_vars)]\ncopy_online_to_target = tf.group(*copy_ops)\n\ninit = tf.global_variables_initializer()\nsaver = tf.train.Saver()\n\n# Let's implement a simple replay memory\nreplay_memory = deque([], maxlen=10000)\n\ndef sample_memories(batch_size):\n    indices = np.random.permutation(len(replay_memory))[:batch_size]\n    cols = [[], [], [], [], []] # state, action, reward, next_state, continue\n    for idx in indices:\n        memory = replay_memory[idx]\n        for col, value in zip(cols, memory):\n            col.append(value)\n    cols = [np.array(col) for col in cols]\n    return cols\n\n# And on to the epsilon-greedy policy with decaying epsilon\neps_min = 0.01\neps_max = 1.0 if not args.test else eps_min\n\ndef epsilon_greedy(q_values, step):\n    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step / args.explore_steps)\n    if np.random.rand() < epsilon:\n        return np.random.randint(env.action_space.n) # random action\n    else:\n        return np.argmax(q_values) # optimal action\n\ndone = True # env needs to be reset\n\n# We will keep track of the max Q-Value over time and compute the mean per game\nloss_val = np.infty\ngame_length = 0\ntotal_max_q = 0\nmean_max_q = 0.0\nreturnn = 0.0\nreturns = []\nsteps = []\npath = os.path.join(args.jobid, \"model\")\nwith tf.Session() as sess:\n    if os.path.isfile(path + \".index\"):\n        saver.restore(sess, path)\n    else:\n        init.run()\n        copy_online_to_target.run()\n    for step in range(args.number_steps):\n        training_iter = global_step.eval() \n        if done: # game over, start again\n            if args.verbosity > 0:\n                print(\"Step {}/{} ({:.1f})% Training iters {}   \"\n                      \"Loss {:5f}    Mean Max-Q {:5f}   Return: {:5f}\".format(\n                step, args.number_steps, step * 100 / args.number_steps,\n                training_iter, loss_val, mean_max_q, returnn))\n                sys.stdout.flush()\n            state = env.reset()\n        if args.render:\n            env.render()\n\n        # Online DQN evaluates what to do\n        q_values = online_q_values.eval(feed_dict={X_state: [state]})\n        action = epsilon_greedy(q_values, step)\n\n        # Online DQN plays\n        next_state, reward, done, info = env.step(action)\n        returnn += reward\n\n        # Let's memorize what happened\n        replay_memory.append((state, action, reward, next_state, done))\n        state = next_state\n\n        if args.test:\n            continue\n\n        # Compute statistics for tracking progress (not shown in the book)\n        total_max_q += q_values.max()\n        game_length += 1\n        if done:\n            steps.append(step)\n            returns.append(returnn)\n            returnn = 0.\n            mean_max_q = total_max_q / 
game_length\n            total_max_q = 0.0\n            game_length = 0\n\n        if step < training_start or step % args.learn_freq != 0:\n            continue # only train after warmup period and at regular intervals\n        \n        # Sample memories and train the online DQN\n        X_state_val, X_action_val, X_rewards_val, X_next_state_val, X_done_val = sample_memories(batch_size)\n        \n        _, loss_val = sess.run([training_op, loss],\n        {X_state: X_state_val, \n        X_action: X_action_val, \n        X_rewards: X_rewards_val,\n        X_done: X_done_val,\n        X_next_state: X_next_state_val})\n\n        # Regularly copy the online DQN to the target DQN\n        if step % args.copy_steps == 0:\n            copy_online_to_target.run()\n\n        # And save regularly\n        if step % args.save_steps == 0:\n            saver.save(sess, path)\n            np.save(os.path.join(args.jobid, \"{}.npy\".format(args.jobid)), np.array((steps, returns)))\n\n"
  },
  {
    "path": "dqn.sh",
    "content": "#!/bin/bash\nsource activate tfgpu\npython dqn.py $@ --jobid=$SLURM_JOB_ID\n\n    \n"
  },
  {
    "path": "plot.py",
    "content": "import numpy as np\nimport sys\nimport re\nimport subprocess\n\ndef get_job_name(jobid):\n    cmd = \"sacct --format=\\\"JobName%30\\\" -j {}\".format(jobid)\n    result = subprocess.check_output(cmd, shell=True)\n    return str(result).split(\"\\\\n\")[2].strip()\n\njobids = sys.argv[2:]\nexp_name = [get_job_name(jobid) for jobid in jobids]\nexp_data = [np.load(\"{}/{}.npy\".format(jobid, jobid)) for jobid in jobids]\n\nN = int(sys.argv[1])\n\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set()\n\nexp_data = map(lambda d: np.convolve(d, np.ones((N,))/N, mode='valid'), exp_data)\n\nplt.figure(figsize=(16, 8))\n\nfor data in exp_data:\n    plt.plot(data)\n\nplt.xlabel('#episodes')\nplt.ylabel('returns')\nplt.legend(exp_name, loc='best')\n\nplotpath = \"plot_{}.png\".format(\"_\".join(jobids))\nprint (\"Saved to {}\".format(plotpath))\nplt.savefig(plotpath)\n"
  },
  {
    "path": "run.sh",
    "content": "set -ex\n\nsbatch --gres=gpu:1 --time=2:59:00 --mem=4gb --job-name=Pong --account=rpp-bengioy dqn.sh --game=Pong\n"
  },
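  {
    "path": "test_util.py",
    "content": "\"\"\"Minimal smoke test for the wrappers in util.py.\n\nThis is a sketch added for illustration (not part of the original training\npipeline); it assumes a working Atari gym installation compatible with the\nold-style wrapper API used in util.py.\n\"\"\"\nimport gym\nimport numpy as np\n\nfrom util import wrap_dqn\n\nif __name__ == \"__main__\":\n    env = wrap_dqn(gym.make(\"PongNoFrameskip-v4\"))\n    obs = env.reset()\n    # WarpFrame + FrameStack should yield an 84x84x4 observation...\n    assert obs.shape == (84, 84, 4)\n    # ...and ScaledFloatFrame rescales pixels to [0, 1].\n    assert obs.dtype == np.float32 and obs.max() <= 1.0\n    obs, reward, done, info = env.step(env.action_space.sample())\n    assert obs.shape == (84, 84, 4)\n    # ClipRewardEnv keeps rewards in {-1, 0, +1}.\n    assert reward in (-1.0, 0.0, 1.0)\n    print(\"wrappers OK\")\n"
  },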
  {
    "path": "util.py",
    "content": "import gym\nimport numpy as np\nfrom collections import deque\nfrom gym import spaces\nfrom PIL import Image\n\n\nclass NoopResetEnv(gym.Wrapper):\n    def __init__(self, env, noop_max=30):\n        \"\"\"Sample initial states by taking random number of no-ops on reset.\n        No-op is assumed to be action 0.\n        \"\"\"\n        gym.Wrapper.__init__(self, env)\n        self.noop_max = noop_max\n        self.override_num_noops = None\n        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'\n\n    def _reset(self):\n        \"\"\" Do no-op action for a number of steps in [1, noop_max].\"\"\"\n        self.env.reset()\n        if self.override_num_noops is not None:\n            noops = self.override_num_noops\n        else:\n            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101\n        assert noops > 0\n        obs = None\n        for _ in range(noops):\n            obs, _, done, _ = self.env.step(0)\n            if done:\n                obs = self.env.reset()\n        return obs\n\nclass FireResetEnv(gym.Wrapper):\n    def __init__(self, env):\n        \"\"\"Take action on reset for environments that are fixed until firing.\"\"\"\n        gym.Wrapper.__init__(self, env)\n        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'\n        assert len(env.unwrapped.get_action_meanings()) >= 3\n\n    def _reset(self):\n        self.env.reset()\n        obs, _, done, _ = self.env.step(1)\n        if done:\n            self.env.reset()\n        obs, _, done, _ = self.env.step(2)\n        if done:\n            self.env.reset()\n        return obs\n\nclass EpisodicLifeEnv(gym.Wrapper):\n    def __init__(self, env):\n        \"\"\"Make end-of-life == end-of-episode, but only reset on true game over.\n        Done by DeepMind for the DQN and co. 
since it helps value estimation.\n        \"\"\"\n        gym.Wrapper.__init__(self, env)\n        self.lives = 0\n        self.was_real_done  = True\n\n    def _step(self, action):\n        obs, reward, done, info = self.env.step(action)\n        self.was_real_done = done\n        # check current lives, make loss of life terminal,\n        # then update lives to handle bonus lives\n        lives = self.env.unwrapped.ale.lives()\n        if lives < self.lives and lives > 0:\n            # for Qbert somtimes we stay in lives == 0 condtion for a few frames\n            # so its important to keep lives > 0, so that we only reset once\n            # the environment advertises done.\n            done = True\n        self.lives = lives\n        return obs, reward, done, info\n\n    def _reset(self):\n        \"\"\"Reset only when lives are exhausted.\n        This way all states are still reachable even though lives are episodic,\n        and the learner need not know about any of this behind-the-scenes.\n        \"\"\"\n        if self.was_real_done:\n            obs = self.env.reset()\n        else:\n            # no-op step to advance from terminal/lost life state\n            obs, _, _, _ = self.env.step(0)\n        self.lives = self.env.unwrapped.ale.lives()\n        return obs\n\nclass MaxAndSkipEnv(gym.Wrapper):\n    def __init__(self, env, skip=4):\n        \"\"\"Return only every `skip`-th frame\"\"\"\n        gym.Wrapper.__init__(self, env)\n        # most recent raw observations (for max pooling across time steps)\n        self._obs_buffer = deque(maxlen=2)\n        self._skip       = skip\n\n    def _step(self, action):\n        \"\"\"Repeat action, sum reward, and max over last observations.\"\"\"\n        total_reward = 0.0\n        done = None\n        for _ in range(self._skip):\n            obs, reward, done, info = self.env.step(action)\n            self._obs_buffer.append(obs)\n            total_reward += reward\n            if done:\n                break\n        max_frame = np.max(np.stack(self._obs_buffer), axis=0)\n\n        return max_frame, total_reward, done, info\n\n    def _reset(self):\n        \"\"\"Clear past frame buffer and init. to first obs. 
from inner env.\"\"\"\n        self._obs_buffer.clear()\n        obs = self.env.reset()\n        self._obs_buffer.append(obs)\n        return obs\n\nclass ClipRewardEnv(gym.RewardWrapper):\n    def _reward(self, reward):\n        \"\"\"Bin reward to {+1, 0, -1} by its sign.\"\"\"\n        return np.sign(reward)\n\nclass WarpFrame(gym.ObservationWrapper):\n    def __init__(self, env):\n        \"\"\"Warp frames to 84x84 as done in the Nature paper and later work.\"\"\"\n        gym.ObservationWrapper.__init__(self, env)\n        self.res = 84\n        self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1))\n\n    def _observation(self, obs):\n        frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))\n        frame = np.array(Image.fromarray(frame).resize((self.res, self.res),\n            resample=Image.BILINEAR), dtype=np.uint8)\n        return frame.reshape((self.res, self.res, 1))\n\nclass FrameStack(gym.Wrapper):\n    def __init__(self, env, k):\n        \"\"\"Buffer observations and stack across channels (last axis).\"\"\"\n        gym.Wrapper.__init__(self, env)\n        self.k = k\n        self.frames = deque([], maxlen=k)\n        shp = env.observation_space.shape\n        assert shp[2] == 1  # can only stack 1-channel frames\n        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k))\n\n    def _reset(self):\n        \"\"\"Clear buffer and re-fill by duplicating the first observation.\"\"\"\n        ob = self.env.reset()\n        for _ in range(self.k): self.frames.append(ob)\n        return self._observation()\n\n    def _step(self, action):\n        ob, reward, done, info = self.env.step(action)\n        self.frames.append(ob)\n        return self._observation(), reward, done, info\n\n    def _observation(self):\n        assert len(self.frames) == self.k\n        return np.concatenate(self.frames, axis=2)\n\n\nclass ScaledFloatFrame(gym.ObservationWrapper):\n    def _observation(self, obs):\n        # careful! This undoes the memory optimization, use\n        # with smaller replay buffers only.\n        return np.array(obs).astype(np.float32) / 255.0\n\n\ndef wrap_dqn(env):\n    \"\"\"Apply a common set of wrappers for Atari games.\"\"\"\n    assert 'NoFrameskip' in env.spec.id\n    env = EpisodicLifeEnv(env)\n    env = NoopResetEnv(env, noop_max=30)\n    env = MaxAndSkipEnv(env, skip=4)\n    if 'FIRE' in env.unwrapped.get_action_meanings():\n        env = FireResetEnv(env)\n    env = WarpFrame(env)\n    env = FrameStack(env, 4)\n    env = ClipRewardEnv(env)\n    env = ScaledFloatFrame(env)\n    return env\n\n\n"
  }
]