Full Code of sherjilozair/dqn for AI

Repository: sherjilozair/dqn
Branch: master
Commit: ee621e10169b
Files: 8
Total size: 17.6 KB

Directory structure:
sherjilozair-dqn/

├── .gitignore
├── LICENSE
├── README.md
├── dqn.py
├── dqn.sh
├── plot.py
├── run.sh
└── util.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

#Ipython Notebook
.ipynb_checkpoints


================================================
FILE: LICENSE
================================================
The MIT License (MIT)

Copyright (c) 2016 Sherjil Ozair

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# dqn
This is a very basic DQN (with experience replay) implementation, which uses OpenAI's gym environments and a TensorFlow neural network.

# Requirements
- gym
- tensorflow
- numpy
- matplotlib
- seaborn
- Pillow

and all their dependencies.
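On most setups, `pip install gym tensorflow numpy matplotlib seaborn Pillow` should cover these; Atari environments may additionally need the `gym[atari]` extra.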

# Usage
To run, `python dqn.py --game <GameName>`. It runs `Pong` if no game is specified.
Pass `--render` to see the game while training; however, this is likely to make training slow.

Currently, it assumes that the observation is an image, i.e. a 3D array, which is the case for all Atari games and other Atari-like environments.

# Purpose
This is meant to be a very simple implementation, to be used as starter code. I aimed for it to be easy to comprehend rather than feature-complete.

Pull requests welcome!

# References
- https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
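
For orientation, the training step in `dqn.py` minimizes a Huber-style penalty on the temporal-difference error against a periodically copied target network (a sketch in the paper's notation):

$$ L(\theta) = \mathbb{E}\Big[\, H\big( Q_\theta(s, a) - \big[\, r + (1 - d)\,\gamma \max_{a'} Q_{\theta^-}(s', a') \,\big] \big) \Big] $$

where $H$ is the Huber function, $\gamma$ is the discount rate, $d$ indicates a terminal transition, and $\theta^-$ are the target-network weights.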

# TODO
- Extend to other environments. Currently this only works for Atari and Atari-like environments where the observation space is a 3D Box.


================================================
FILE: dqn.py
================================================
#!/usr/bin/env python

from __future__ import division, print_function, unicode_literals

# Handle arguments (before slow imports so --help can be fast)
import argparse
parser = argparse.ArgumentParser(
    description="Train a DQN net for Atari games.")

# Important hparams
parser.add_argument("-g", "--game", type=str, default="Pong")
parser.add_argument("-n", "--number-steps", type=int, default=1000000, help="total number of training steps")
parser.add_argument("-e", "--explore-steps", type=int, default=100000, help="total number of explorartion steps")
parser.add_argument("-c", "--copy-steps", type=int, default=4096, help="number of training steps between copies of online DQN to target DQN")
parser.add_argument("-l", "--learn-freq", type=int, default=4, help="number of game steps between each training step")

# Irrelevant hparams
parser.add_argument("-s", "--save-steps", type=int, default=10000, help="number of training steps between saving checkpoints")
parser.add_argument("-r", "--render", action="store_true", default=False, help="render the game during training or testing")
parser.add_argument("-t", "--test", action="store_true", default=False, help="test (no learning and minimal epsilon)")
parser.add_argument("-v", "--verbosity", action="count", default=1, help="increase output verbosity")
parser.add_argument("-j", "--jobid", default="123123", help="SLURM job ID")

args = parser.parse_args()

from collections import deque
import gym
import numpy as np
import os
import tensorflow as tf
import sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from util import wrap_dqn

env = wrap_dqn(gym.make("{}NoFrameskip-v4".format(args.game)))

def q_network(net, name, reuse=False):
    with tf.variable_scope(name, reuse=reuse) as scope:
        initializer = tf.contrib.layers.variance_scaling_initializer()
        for n_maps, kernel_size, strides, padding, activation in zip(
                [32, 64, 64], [(8,8), (4,4), (3,3)], [4, 2, 1],
                ["SAME"] * 3 , [tf.nn.relu] * 3):
            net = tf.layers.conv2d(net, filters=n_maps, kernel_size=kernel_size, strides=strides, 
                padding=padding, activation=activation, kernel_initializer=initializer)
        net = tf.layers.dense(tf.contrib.layers.flatten(net), 256, activation=tf.nn.relu, kernel_initializer=initializer)
        net = tf.layers.dense(net, env.action_space.n, kernel_initializer=initializer)

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
    return net, trainable_vars
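# The conv stack above (32/64/64 filters, 8/4/3 kernels, 4/2/1 strides) follows the
# Mnih et al. architecture, with a 256-unit dense layer instead of the paper's 512.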

# Now for the training operations
learning_rate = 1e-4
training_start = 10000  # start training after 10,000 game steps
discount_rate = 0.99
batch_size = 64

with tf.variable_scope("train"):
    X_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4])
    X_next_state = tf.placeholder(tf.float32, shape=[None, 84, 84, 4])
    X_action = tf.placeholder(tf.int32, shape=[None])
    X_done = tf.placeholder(tf.float32, shape=[None])
    X_rewards = tf.placeholder(tf.float32, shape=[None])
    online_q_values, online_vars = q_network(X_state, name="q_networks/online")
    # the target network needs its own variables; reusing the online scope here
    # would make target_vars identical to online_vars and copy_online_to_target a no-op
    target_q_values, target_vars = q_network(X_next_state, name="q_networks/target")
    max_target_q_values = tf.reduce_max(target_q_values, axis=1)
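    # Bellman target: r + gamma * max_a' Q_target(s', a'); the (1 - done) factor
    # below zeroes the bootstrapped term at terminal transitions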
    target = X_rewards + (1. - X_done) * discount_rate * max_target_q_values
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, env.action_space.n), axis=1)
    error = tf.abs(q_value - tf.stop_gradient(target))
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)
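    # note: square(clipped_error) + linear_error equals twice the Huber loss:
    # error^2 for |error| <= 1 and 2*|error| - 1 beyond, keeping gradients bounded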

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss, global_step=global_step)

# We need an operation to copy the online DQN to the target DQN
copy_ops = [target_var.assign(online_var)
            for target_var, online_var in zip(target_vars, online_vars)]
copy_online_to_target = tf.group(*copy_ops)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Let's implement a simple replay memory
replay_memory = deque([], maxlen=10000)
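# maxlen=10000 evicts the oldest transitions automatically once the buffer is full;
# the Nature paper used a one-million-transition buffer, but a small one keeps
# host memory modest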

def sample_memories(batch_size):
    indices = np.random.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []] # state, action, reward, next_state, continue
    for idx in indices:
        memory = replay_memory[idx]
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols
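# note: random indexing into a deque is O(n) per access, which is fine at this
# capacity; a list or ring buffer scales better for larger replay memories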

# And on to the epsilon-greedy policy with decaying epsilon
eps_min = 0.01
eps_max = 1.0 if not args.test else eps_min

def epsilon_greedy(q_values, step):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step / args.explore_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(env.action_space.n) # random action
    else:
        return np.argmax(q_values) # optimal action
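# epsilon decays linearly from eps_max to eps_min over the first explore_steps
# game steps, then stays at eps_min for the rest of training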

done = True # env needs to be reset

# We will keep track of the max Q-Value over time and compute the mean per game
loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0
returnn = 0.0
returns = []
steps = []
path = os.path.join(args.jobid, "model")
if not os.path.isdir(args.jobid):
    os.makedirs(args.jobid)  # ensure the checkpoint/stats directory exists
with tf.Session() as sess:
    if os.path.isfile(path + ".index"):
        saver.restore(sess, path)
    else:
        init.run()
        copy_online_to_target.run()
    for step in range(args.number_steps):
        training_iter = global_step.eval() 
        if done: # game over, start again
            if args.verbosity > 0:
                print("Step {}/{} ({:.1f})% Training iters {}   "
                      "Loss {:5f}    Mean Max-Q {:5f}   Return: {:5f}".format(
                step, args.number_steps, step * 100 / args.number_steps,
                training_iter, loss_val, mean_max_q, returnn))
                sys.stdout.flush()
            state = env.reset()
        if args.render:
            env.render()

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        next_state, reward, done, info = env.step(action)
        returnn += reward

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, done))
        state = next_state

        if args.test:
            continue

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            steps.append(step)
            returns.append(returnn)
            returnn = 0.
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0

        if step < training_start or step % args.learn_freq != 0:
            continue # only train after warmup period and at regular intervals
        
        # Sample memories and train the online DQN
        X_state_val, X_action_val, X_rewards_val, X_next_state_val, X_done_val = sample_memories(batch_size)
        
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val,
            X_action: X_action_val,
            X_rewards: X_rewards_val,
            X_done: X_done_val,
            X_next_state: X_next_state_val})

        # Regularly copy the online DQN to the target DQN
        if step % args.copy_steps == 0:
            copy_online_to_target.run()

        # And save regularly
        if step % args.save_steps == 0:
            saver.save(sess, path)
            np.save(os.path.join(args.jobid, "{}.npy".format(args.jobid)), np.array((steps, returns)))



================================================
FILE: dqn.sh
================================================
#!/bin/bash
source activate tfgpu
python dqn.py "$@" --jobid=$SLURM_JOB_ID


================================================
FILE: plot.py
================================================
import numpy as np
import sys
import subprocess

def get_job_name(jobid):
    cmd = "sacct --format=\"JobName%30\" -j {}".format(jobid)
    result = subprocess.check_output(cmd, shell=True)
    # decode the bytes output and take the third line, which holds the job name
    return result.decode().split("\n")[2].strip()

jobids = sys.argv[2:]
exp_name = [get_job_name(jobid) for jobid in jobids]
exp_data = [np.load("{}/{}.npy".format(jobid, jobid)) for jobid in jobids]

N = int(sys.argv[1])
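# expected invocation: python plot.py <smoothing-window> <jobid> [<jobid> ...]
# e.g. (hypothetical job ids): python plot.py 100 12345 12346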

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# each saved array has shape (2, n_episodes): row 0 holds steps, row 1 returns;
# smooth the returns with a length-N moving average
exp_data = [np.convolve(d[1], np.ones((N,)) / N, mode='valid') for d in exp_data]

plt.figure(figsize=(16, 8))

for data in exp_data:
    plt.plot(data)

plt.xlabel('#episodes')
plt.ylabel('returns')
plt.legend(exp_name, loc='best')

plotpath = "plot_{}.png".format("_".join(jobids))
print ("Saved to {}".format(plotpath))
plt.savefig(plotpath)


================================================
FILE: run.sh
================================================
#!/bin/bash
set -ex

sbatch --gres=gpu:1 --time=2:59:00 --mem=4gb --job-name=Pong --account=rpp-bengioy dqn.sh --game=Pong


================================================
FILE: util.py
================================================
import gym
import numpy as np
from collections import deque
from gym import spaces
from PIL import Image


class NoopResetEnv(gym.Wrapper):
    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        self.override_num_noops = None
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def _reset(self):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset()
        if self.override_num_noops is not None:
            noops = self.override_num_noops
        else:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(0)
            if done:
                obs = self.env.reset()
        return obs

class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def _reset(self):
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset()
        return obs

class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done = True

    def _step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert, sometimes we stay in the lives == 0 condition for a few
            # frames, so it's important to keep lives > 0 so that we only reset
            # once the environment advertises done.
            done = True
        self.lives = lives
        return obs, reward, done, info

    def _reset(self):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs = self.env.reset()
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs

class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = deque(maxlen=2)
        self._skip       = skip

    def _step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)

        return max_frame, total_reward, done, info

    def _reset(self):
        """Clear past frame buffer and init. to first obs. from inner env."""
        self._obs_buffer.clear()
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs

class ClipRewardEnv(gym.RewardWrapper):
    def _reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)

class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env):
        """Warp frames to 84x84 as done in the Nature paper and later work."""
        gym.ObservationWrapper.__init__(self, env)
        self.res = 84
        self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1))

    def _observation(self, obs):
        frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32'))
        frame = np.array(Image.fromarray(frame).resize((self.res, self.res),
            resample=Image.BILINEAR), dtype=np.uint8)
        return frame.reshape((self.res, self.res, 1))

class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Buffer observations and stack across channels (last axis)."""
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        assert shp[2] == 1  # can only stack 1-channel frames
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k))

    def _reset(self):
        """Clear buffer and re-fill by duplicating the first observation."""
        ob = self.env.reset()
        for _ in range(self.k): self.frames.append(ob)
        return self._observation()

    def _step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._observation(), reward, done, info

    def _observation(self):
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)


class ScaledFloatFrame(gym.ObservationWrapper):
    def _observation(self, obs):
        # careful! This undoes the memory optimization, use
        # with smaller replay buffers only.
        return np.array(obs).astype(np.float32) / 255.0


def wrap_dqn(env):
    """Apply a common set of wrappers for Atari games."""
    assert 'NoFrameskip' in env.spec.id
    env = EpisodicLifeEnv(env)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    env = ClipRewardEnv(env)
    env = ScaledFloatFrame(env)
    return env
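# example (shapes follow from the wrappers above):
#   env = wrap_dqn(gym.make("PongNoFrameskip-v4"))
#   obs = env.reset()  # float32, shape (84, 84, 4), values in [0, 1]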


================================================
SYMBOL INDEX (31 symbols across 3 files)
================================================

FILE: dqn.py
  function q_network (line 42) | def q_network(net, name, reuse=False):
  function sample_memories (line 93) | def sample_memories(batch_size):
  function epsilon_greedy (line 107) | def epsilon_greedy(q_values, step):

FILE: plot.py
  function get_job_name (line 6) | def get_job_name(jobid):

FILE: util.py
  class NoopResetEnv (line 8) | class NoopResetEnv(gym.Wrapper):
    method __init__ (line 9) | def __init__(self, env, noop_max=30):
    method _reset (line 18) | def _reset(self):
  class FireResetEnv (line 33) | class FireResetEnv(gym.Wrapper):
    method __init__ (line 34) | def __init__(self, env):
    method _reset (line 40) | def _reset(self):
  class EpisodicLifeEnv (line 50) | class EpisodicLifeEnv(gym.Wrapper):
    method __init__ (line 51) | def __init__(self, env):
    method _step (line 59) | def _step(self, action):
    method _reset (line 73) | def _reset(self):
  class MaxAndSkipEnv (line 86) | class MaxAndSkipEnv(gym.Wrapper):
    method __init__ (line 87) | def __init__(self, env, skip=4):
    method _step (line 94) | def _step(self, action):
    method _reset (line 108) | def _reset(self):
  class ClipRewardEnv (line 115) | class ClipRewardEnv(gym.RewardWrapper):
    method _reward (line 116) | def _reward(self, reward):
  class WarpFrame (line 120) | class WarpFrame(gym.ObservationWrapper):
    method __init__ (line 121) | def __init__(self, env):
    method _observation (line 127) | def _observation(self, obs):
  class FrameStack (line 133) | class FrameStack(gym.Wrapper):
    method __init__ (line 134) | def __init__(self, env, k):
    method _reset (line 143) | def _reset(self):
    method _step (line 149) | def _step(self, action):
    method _observation (line 154) | def _observation(self):
  class ScaledFloatFrame (line 159) | class ScaledFloatFrame(gym.ObservationWrapper):
    method _observation (line 160) | def _observation(self, obs):
  function wrap_dqn (line 166) | def wrap_dqn(env):
