[
  {
    "path": "CartPole-A3C.py",
    "content": "# OpenGym CartPole-v0 with A3C on GPU\n# -----------------------------------\n#\n# A3C implementation with GPU optimizer threads.\n# \n# Made as part of blog series Let's make an A3C, available at\n# https://jaromiru.com/2017/02/16/lets-make-an-a3c-theory/\n#\n# author: Jaromir Janisch, 2017\n\nimport numpy as np\nimport tensorflow as tf\n\nimport gym, time, random, threading\n\nfrom keras.models import *\nfrom keras.layers import *\nfrom keras import backend as K\n\n#-- constants\nENV = 'CartPole-v0'\n\nRUN_TIME = 30\nTHREADS = 8\nOPTIMIZERS = 2\nTHREAD_DELAY = 0.001\n\nGAMMA = 0.99\n\nN_STEP_RETURN = 8\nGAMMA_N = GAMMA ** N_STEP_RETURN\n\nEPS_START = 0.4\nEPS_STOP  = .15\nEPS_STEPS = 75000\n\nMIN_BATCH = 32\nLEARNING_RATE = 5e-3\n\nLOSS_V = .5\t\t\t# v loss coefficient\nLOSS_ENTROPY = .01 \t# entropy coefficient\n\n#---------\nclass Brain:\n\ttrain_queue = [ [], [], [], [], [] ]\t# s, a, r, s', s' terminal mask\n\tlock_queue = threading.Lock()\n\n\tdef __init__(self):\n\t\tself.session = tf.Session()\n\t\tK.set_session(self.session)\n\t\tK.manual_variable_initialization(True)\n\n\t\tself.model = self._build_model()\n\t\tself.graph = self._build_graph(self.model)\n\n\t\tself.session.run(tf.global_variables_initializer())\n\t\tself.default_graph = tf.get_default_graph()\n\n\t\tself.default_graph.finalize()\t# avoid modifications\n\n\tdef _build_model(self):\n\n\t\tl_input = Input( batch_shape=(None, NUM_STATE) )\n\t\tl_dense = Dense(16, activation='relu')(l_input)\n\n\t\tout_actions = Dense(NUM_ACTIONS, activation='softmax')(l_dense)\n\t\tout_value   = Dense(1, activation='linear')(l_dense)\n\n\t\tmodel = Model(inputs=[l_input], outputs=[out_actions, out_value])\n\t\tmodel._make_predict_function()\t# have to initialize before threading\n\n\t\treturn model\n\n\tdef _build_graph(self, model):\n\t\ts_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))\n\t\ta_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))\n\t\tr_t = 
tf.placeholder(tf.float32, shape=(None, 1)) # not immediate, but discounted n step reward\n\t\t\n\t\tp, v = model(s_t)\n\n\t\tlog_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10)\n\t\tadvantage = r_t - v\n\n\t\tloss_policy = - log_prob * tf.stop_gradient(advantage)\t\t\t\t\t\t\t\t\t# maximize policy\n\t\tloss_value  = LOSS_V * tf.square(advantage)\t\t\t\t\t\t\t\t\t\t\t\t# minimize value error\n\t\tentropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keep_dims=True)\t# maximize entropy (regularization)\n\n\t\tloss_total = tf.reduce_mean(loss_policy + loss_value + entropy)\n\n\t\toptimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)\n\t\tminimize = optimizer.minimize(loss_total)\n\n\t\treturn s_t, a_t, r_t, minimize\n\n\tdef optimize(self):\n\t\tif len(self.train_queue[0]) < MIN_BATCH:\n\t\t\ttime.sleep(0)\t# yield\n\t\t\treturn\n\n\t\twith self.lock_queue:\n\t\t\tif len(self.train_queue[0]) < MIN_BATCH:\t# more thread could have passed without lock\n\t\t\t\treturn \t\t\t\t\t\t\t\t\t# we can't yield inside lock\n\n\t\t\ts, a, r, s_, s_mask = self.train_queue\n\t\t\tself.train_queue = [ [], [], [], [], [] ]\n\n\t\ts = np.vstack(s)\n\t\ta = np.vstack(a)\n\t\tr = np.vstack(r)\n\t\ts_ = np.vstack(s_)\n\t\ts_mask = np.vstack(s_mask)\n\n\t\tif len(s) > 5*MIN_BATCH: print(\"Optimizer alert! 
Minimizing batch of %d\" % len(s))\n\n\t\tv = self.predict_v(s_)\n\t\tr = r + GAMMA_N * v * s_mask\t# set v to 0 where s_ is terminal state\n\t\t\n\t\ts_t, a_t, r_t, minimize = self.graph\n\t\tself.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})\n\n\tdef train_push(self, s, a, r, s_):\n\t\twith self.lock_queue:\n\t\t\tself.train_queue[0].append(s)\n\t\t\tself.train_queue[1].append(a)\n\t\t\tself.train_queue[2].append(r)\n\n\t\t\tif s_ is None:\n\t\t\t\tself.train_queue[3].append(NONE_STATE)\n\t\t\t\tself.train_queue[4].append(0.)\n\t\t\telse:\t\n\t\t\t\tself.train_queue[3].append(s_)\n\t\t\t\tself.train_queue[4].append(1.)\n\n\tdef predict(self, s):\n\t\twith self.default_graph.as_default():\n\t\t\tp, v = self.model.predict(s)\n\t\t\treturn p, v\n\n\tdef predict_p(self, s):\n\t\twith self.default_graph.as_default():\n\t\t\tp, v = self.model.predict(s)\t\t\n\t\t\treturn p\n\n\tdef predict_v(self, s):\n\t\twith self.default_graph.as_default():\n\t\t\tp, v = self.model.predict(s)\t\t\n\t\t\treturn v\n\n#---------\nframes = 0\nclass Agent:\n\tdef __init__(self, eps_start, eps_end, eps_steps):\n\t\tself.eps_start = eps_start\n\t\tself.eps_end   = eps_end\n\t\tself.eps_steps = eps_steps\n\n\t\tself.memory = []\t# used for n_step return\n\t\tself.R = 0.\n\n\tdef getEpsilon(self):\n\t\tif(frames >= self.eps_steps):\n\t\t\treturn self.eps_end\n\t\telse:\n\t\t\treturn self.eps_start + frames * (self.eps_end - self.eps_start) / self.eps_steps\t# linearly interpolate\n\n\tdef act(self, s):\n\t\teps = self.getEpsilon()\t\t\t\n\t\tglobal frames; frames = frames + 1\n\n\t\tif random.random() < eps:\n\t\t\treturn random.randint(0, NUM_ACTIONS-1)\n\n\t\telse:\n\t\t\ts = np.array([s])\n\t\t\tp = brain.predict_p(s)[0]\n\n\t\t\t# a = np.argmax(p)\n\t\t\ta = np.random.choice(NUM_ACTIONS, p=p)\n\n\t\t\treturn a\n\t\n\tdef train(self, s, a, r, s_):\n\t\tdef get_sample(memory, n):\n\t\t\ts, a, _, _  = memory[0]\n\t\t\t_, _, _, s_ = memory[n-1]\n\n\t\t\treturn s, a, self.R, 
s_\n\n\t\ta_cats = np.zeros(NUM_ACTIONS)\t# turn action into one-hot representation\n\t\ta_cats[a] = 1 \n\n\t\tself.memory.append( (s, a_cats, r, s_) )\n\n\t\tself.R = ( self.R + r * GAMMA_N ) / GAMMA\n\n\t\tif s_ is None:\n\t\t\twhile len(self.memory) > 0:\n\t\t\t\tn = len(self.memory)\n\t\t\t\ts, a, r, s_ = get_sample(self.memory, n)\n\t\t\t\tbrain.train_push(s, a, r, s_)\n\n\t\t\t\tself.R = ( self.R - self.memory[0][2] ) / GAMMA\n\t\t\t\tself.memory.pop(0)\t\t\n\n\t\t\tself.R = 0\n\n\t\tif len(self.memory) >= N_STEP_RETURN:\n\t\t\ts, a, r, s_ = get_sample(self.memory, N_STEP_RETURN)\n\t\t\tbrain.train_push(s, a, r, s_)\n\n\t\t\tself.R = self.R - self.memory[0][2]\n\t\t\tself.memory.pop(0)\t\n\t\n\t# possible edge case - if an episode ends in <N steps, the computation is incorrect\n\t\t\n#---------\nclass Environment(threading.Thread):\n\tstop_signal = False\n\n\tdef __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):\n\t\tthreading.Thread.__init__(self)\n\n\t\tself.render = render\n\t\tself.env = gym.make(ENV)\n\t\tself.agent = Agent(eps_start, eps_end, eps_steps)\n\n\tdef runEpisode(self):\n\t\ts = self.env.reset()\n\n\t\tR = 0\n\t\twhile True:         \n\t\t\ttime.sleep(THREAD_DELAY) # yield \n\n\t\t\tif self.render: self.env.render()\n\n\t\t\ta = self.agent.act(s)\n\t\t\ts_, r, done, info = self.env.step(a)\n\n\t\t\tif done: # terminal state\n\t\t\t\ts_ = None\n\n\t\t\tself.agent.train(s, a, r, s_)\n\n\t\t\ts = s_\n\t\t\tR += r\n\n\t\t\tif done or self.stop_signal:\n\t\t\t\tbreak\n\n\t\tprint(\"Total R:\", R)\n\n\tdef run(self):\n\t\twhile not self.stop_signal:\n\t\t\tself.runEpisode()\n\n\tdef stop(self):\n\t\tself.stop_signal = True\n\n#---------\nclass Optimizer(threading.Thread):\n\tstop_signal = False\n\n\tdef __init__(self):\n\t\tthreading.Thread.__init__(self)\n\n\tdef run(self):\n\t\twhile not self.stop_signal:\n\t\t\tbrain.optimize()\n\n\tdef stop(self):\n\t\tself.stop_signal = True\n\n#-- main\nenv_test = 
Environment(render=True, eps_start=0., eps_end=0.)\nNUM_STATE = env_test.env.observation_space.shape[0]\nNUM_ACTIONS = env_test.env.action_space.n\nNONE_STATE = np.zeros(NUM_STATE)\n\nbrain = Brain()\t# brain is global in A3C\n\nenvs = [Environment() for i in range(THREADS)]\nopts = [Optimizer() for i in range(OPTIMIZERS)]\n\nfor o in opts:\n\to.start()\n\nfor e in envs:\n\te.start()\n\ntime.sleep(RUN_TIME)\n\nfor e in envs:\n\te.stop()\nfor e in envs:\n\te.join()\n\nfor o in opts:\n\to.stop()\nfor o in opts:\n\to.join()\n\nprint(\"Training finished\")\nenv_test.run()"
  },
  {
    "path": "CartPole-DQN.py",
    "content": "# OpenGym CartPole-v0\n# -------------------\n#\n# This code demonstrates use a full DQN implementation\n# to solve OpenGym CartPole-v0 problem.\n#\n# Made as part of blog series Let's make a DQN, available at: \n# https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/\n# \n# author: Jaromir Janisch, 2016\n\nimport random, numpy, math, gym, sys\nfrom keras import backend as K\n\nimport tensorflow as tf\n\n#----------\nHUBER_LOSS_DELTA = 1.0\nLEARNING_RATE = 0.00025\n\n#----------\ndef huber_loss(y_true, y_pred):\n    err = y_true - y_pred\n\n    cond = K.abs(err) < HUBER_LOSS_DELTA\n    L2 = 0.5 * K.square(err)\n    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)\n\n    loss = tf.where(cond, L2, L1)   # Keras does not cover where function in tensorflow :-(\n\n    return K.mean(loss)\n\n#-------------------- BRAIN ---------------------------\nfrom keras.models import Sequential\nfrom keras.layers import *\nfrom keras.optimizers import *\n\nclass Brain:\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.model = self._createModel()\n        self.model_ = self._createModel() \n\n    def _createModel(self):\n        model = Sequential()\n\n        model.add(Dense(units=64, activation='relu', input_dim=stateCnt))\n        model.add(Dense(units=actionCnt, activation='linear'))\n\n        opt = RMSprop(lr=LEARNING_RATE)\n        model.compile(loss=huber_loss, optimizer=opt)\n\n        return model\n\n    def train(self, x, y, epochs=1, verbose=0):\n        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)\n\n    def predict(self, s, target=False):\n        if target:\n            return self.model_.predict(s)\n        else:\n            return self.model.predict(s)\n\n    def predictOne(self, s, target=False):\n        return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()\n\n    def updateTargetModel(self):\n        
self.model_.set_weights(self.model.get_weights())\n\n#-------------------- MEMORY --------------------------\nclass Memory:   # stored as ( s, a, r, s_ )\n    samples = []\n\n    def __init__(self, capacity):\n        self.capacity = capacity\n\n    def add(self, sample):\n        self.samples.append(sample)        \n\n        if len(self.samples) > self.capacity:\n            self.samples.pop(0)\n\n    def sample(self, n):\n        n = min(n, len(self.samples))\n        return random.sample(self.samples, n)\n\n    def isFull(self):\n        return len(self.samples) >= self.capacity\n\n#-------------------- AGENT ---------------------------\nMEMORY_CAPACITY = 100000\nBATCH_SIZE = 64\n\nGAMMA = 0.99\n\nMAX_EPSILON = 1\nMIN_EPSILON = 0.01\nLAMBDA = 0.001      # speed of decay\n\nUPDATE_TARGET_FREQUENCY = 1000\n\nclass Agent:\n    steps = 0\n    epsilon = MAX_EPSILON\n\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.brain = Brain(stateCnt, actionCnt)\n        self.memory = Memory(MEMORY_CAPACITY)\n        \n    def act(self, s):\n        if random.random() < self.epsilon:\n            return random.randint(0, self.actionCnt-1)\n        else:\n            return numpy.argmax(self.brain.predictOne(s))\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        self.memory.add(sample)        \n\n        if self.steps % UPDATE_TARGET_FREQUENCY == 0:\n            self.brain.updateTargetModel()\n\n        # debug the Q function in point S\n        if self.steps % 100 == 0:\n            S = numpy.array([-0.01335408, -0.04600273, -0.00677248, 0.01517507])\n            pred = agent.brain.predictOne(S)\n            print(pred[0])\n            sys.stdout.flush()\n\n        # slowly decrease Epsilon based on our experience\n        self.steps += 1\n        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)\n\n    def replay(self):    \n        batch = 
self.memory.sample(BATCH_SIZE)\n        batchLen = len(batch)\n\n        no_state = numpy.zeros(self.stateCnt)\n\n        states = numpy.array([ o[0] for o in batch ])\n        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])\n\n        p = self.brain.predict(states)\n        p_ = self.brain.predict(states_, target=True)\n\n        x = numpy.zeros((batchLen, self.stateCnt))\n        y = numpy.zeros((batchLen, self.actionCnt))\n        \n        for i in range(batchLen):\n            o = batch[i]\n            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]\n            \n            t = p[i]\n            if s_ is None:\n                t[a] = r\n            else:\n                t[a] = r + GAMMA * numpy.amax(p_[i])\n\n            x[i] = s\n            y[i] = t\n\n        self.brain.train(x, y)\n\n\nclass RandomAgent:\n    memory = Memory(MEMORY_CAPACITY)\n\n    def __init__(self, actionCnt):\n        self.actionCnt = actionCnt\n\n    def act(self, s):\n        return random.randint(0, self.actionCnt-1)\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        self.memory.add(sample)\n\n    def replay(self):\n        pass\n\n#-------------------- ENVIRONMENT ---------------------\nclass Environment:\n    def __init__(self, problem):\n        self.problem = problem\n        self.env = gym.make(problem)\n\n    def run(self, agent):\n        s = self.env.reset()\n        R = 0 \n\n        while True:            \n            # self.env.render()\n\n            a = agent.act(s)\n\n            s_, r, done, info = self.env.step(a)\n\n            if done: # terminal state\n                s_ = None\n\n            agent.observe( (s, a, r, s_) )\n            agent.replay()            \n\n            s = s_\n            R += r\n\n            if done:\n                break\n\n        # print(\"Total reward:\", R)\n\n#-------------------- MAIN ----------------------------\nPROBLEM = 'CartPole-v0'\nenv = Environment(PROBLEM)\n\nstateCnt  = 
env.env.observation_space.shape[0]\nactionCnt = env.env.action_space.n\n\nagent = Agent(stateCnt, actionCnt)\nrandomAgent = RandomAgent(actionCnt)\n\ntry:\n    while randomAgent.memory.isFull() == False:\n        env.run(randomAgent)\n\n    agent.memory.samples = randomAgent.memory.samples\n    randomAgent = None\n\n    while True:\n        env.run(agent)\nfinally:\n    agent.brain.model.save(\"cartpole-dqn.h5\")\n"
  },
  {
    "path": "CartPole-basic.py",
    "content": "# OpenGym CartPole-v0\n# -------------------\n#\n# This code demonstrates use of a basic Q-network (without target network)\n# to solve OpenGym CartPole-v0 problem.\n#\n# Made as part of blog series Let's make a DQN, available at: \n# https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/\n# \n# author: Jaromir Janisch, 2016\n\n\n#--- enable this to run on GPU\n# import os    \n# os.environ['THEANO_FLAGS'] = \"device=gpu,floatX=float32\"  \n\nimport random, numpy, math, gym\n\n#-------------------- BRAIN ---------------------------\nfrom keras.models import Sequential\nfrom keras.layers import *\nfrom keras.optimizers import *\n\nclass Brain:\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.model = self._createModel()\n        # self.model.load_weights(\"cartpole-basic.h5\")\n\n    def _createModel(self):\n        model = Sequential()\n\n        model.add(Dense(output_dim=64, activation='relu', input_dim=stateCnt))\n        model.add(Dense(output_dim=actionCnt, activation='linear'))\n\n        opt = RMSprop(lr=0.00025)\n        model.compile(loss='mse', optimizer=opt)\n\n        return model\n\n    def train(self, x, y, epoch=1, verbose=0):\n        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)\n\n    def predict(self, s):\n        return self.model.predict(s)\n\n    def predictOne(self, s):\n        return self.predict(s.reshape(1, self.stateCnt)).flatten()\n\n#-------------------- MEMORY --------------------------\nclass Memory:   # stored as ( s, a, r, s_ )\n    samples = []\n\n    def __init__(self, capacity):\n        self.capacity = capacity\n\n    def add(self, sample):\n        self.samples.append(sample)        \n\n        if len(self.samples) > self.capacity:\n            self.samples.pop(0)\n\n    def sample(self, n):\n        n = min(n, len(self.samples))\n        return random.sample(self.samples, 
n)\n\n#-------------------- AGENT ---------------------------\nMEMORY_CAPACITY = 100000\nBATCH_SIZE = 64\n\nGAMMA = 0.99\n\nMAX_EPSILON = 1\nMIN_EPSILON = 0.01\nLAMBDA = 0.001      # speed of decay\n\nclass Agent:\n    steps = 0\n    epsilon = MAX_EPSILON\n\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.brain = Brain(stateCnt, actionCnt)\n        self.memory = Memory(MEMORY_CAPACITY)\n        \n    def act(self, s):\n        if random.random() < self.epsilon:\n            return random.randint(0, self.actionCnt-1)\n        else:\n            return numpy.argmax(self.brain.predictOne(s))\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        self.memory.add(sample)        \n\n        # slowly decrease Epsilon based on our experience\n        self.steps += 1\n        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)\n\n    def replay(self):    \n        batch = self.memory.sample(BATCH_SIZE)\n        batchLen = len(batch)\n\n        no_state = numpy.zeros(self.stateCnt)\n\n        states = numpy.array([ o[0] for o in batch ])\n        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])\n\n        p = self.brain.predict(states)\n        p_ = self.brain.predict(states_)\n\n        x = numpy.zeros((batchLen, self.stateCnt))\n        y = numpy.zeros((batchLen, self.actionCnt))\n        \n        for i in range(batchLen):\n            o = batch[i]\n            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]\n            \n            t = p[i]\n            if s_ is None:\n                t[a] = r\n            else:\n                t[a] = r + GAMMA * numpy.amax(p_[i])\n\n            x[i] = s\n            y[i] = t\n\n        self.brain.train(x, y)\n\n#-------------------- ENVIRONMENT ---------------------\nclass Environment:\n    def __init__(self, problem):\n        self.problem = problem\n        self.env = 
gym.make(problem)\n\n    def run(self, agent):\n        s = self.env.reset()\n        R = 0 \n\n        while True:            \n            self.env.render()\n\n            a = agent.act(s)\n\n            s_, r, done, info = self.env.step(a)\n\n            if done: # terminal state\n                s_ = None\n\n            agent.observe( (s, a, r, s_) )\n            agent.replay()            \n\n            s = s_\n            R += r\n\n            if done:\n                break\n\n        print(\"Total reward:\", R)\n\n#-------------------- MAIN ----------------------------\nPROBLEM = 'CartPole-v0'\nenv = Environment(PROBLEM)\n\nstateCnt  = env.env.observation_space.shape[0]\nactionCnt = env.env.action_space.n\n\nagent = Agent(stateCnt, actionCnt)\n\ntry:\n    while True:\n        env.run(agent)\nfinally:\n    agent.brain.model.save(\"cartpole-basic.h5\")\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2018 Jaromír Janisch\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "MountainCar-basic.py",
    "content": "# OpenGym MountainCar-v0\n# -------------------\n#\n# This code demonstrates debugging of a basic Q-network (without target network)\n# in an OpenGym MountainCar-v0 environment.\n#\n# Made as part of blog series Let's make a DQN, available at: \n# https://jaromiru.com/2016/10/12/lets-make-a-dqn-debugging/\n# \n# author: Jaromir Janisch, 2016\n\n\n#--- enable this to run on GPU\n# import os    \n# os.environ['THEANO_FLAGS'] = \"device=gpu,floatX=float32\"  \n\nimport random, numpy, math, gym\n\n#-------------------- UTILITIES -----------------------\nimport matplotlib.pyplot as plt\nfrom matplotlib import colors\nimport sys\n\ndef printQ(agent):\n    P = [\n        [-0.15955113,  0.        ], # s_start\n\n        [ 0.83600049,  0.27574312], # s'' -> s'\n        [ 0.85796947,  0.28245832], # s' -> s\n        [ 0.88062271,  0.29125591], # s -> terminal\n    ]\n\n    pred = agent.brain.predict( numpy.array(P) )\n\n    for o in pred:\n        sys.stdout.write(str(o[1])+\" \")\n\n    print(\";\")\n    sys.stdout.flush()\n\ndef mapBrain(brain, res):\n    s = numpy.zeros( (res * res, 2) )\n    i = 0\n\n    for i1 in range(res):\n        for i2 in range(res):            \n            s[i] = numpy.array( [ 2 * (i1 - res / 2) / res, 2 * (i2 - res / 2) / res ] )\n            i += 1\n\n    mapV = numpy.amax(brain.predict(s), axis=1).reshape( (res, res) )\n    mapA = numpy.argmax(brain.predict(s), axis=1).reshape( (res, res) )\n\n    return (mapV, mapA)\n\ndef displayBrain(brain, res=50):    \n    mapV, mapA = mapBrain(brain, res)\n\n    plt.close()\n    plt.show()  \n\n    fig = plt.figure(figsize=(5,7))\n    fig.add_subplot(211)\n\n    plt.imshow(mapV)\n    plt.colorbar(orientation='vertical')\n\n    fig.add_subplot(212)\n\n    cmap = colors.ListedColormap(['blue', 'red'])\n    bounds=[-0.5,0.5,1.5]\n    norm = colors.BoundaryNorm(bounds, cmap.N)\n\n    plt.imshow(mapA, cmap=cmap, norm=norm)        \n    cb = plt.colorbar(orientation='vertical', 
ticks=[0,1])\n\n    plt.pause(0.001)\n\n#-------------------- BRAIN ---------------------------\nfrom keras.models import Sequential\nfrom keras.layers import *\nfrom keras.optimizers import *\n\nclass Brain:\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.model = self._createModel()\n        # self.model.load_weights(\"MountainCar-basic.h5\")\n\n    def _createModel(self):\n        model = Sequential()\n\n        model.add(Dense(output_dim=64, activation='relu', input_dim=stateCnt))\n        model.add(Dense(output_dim=actionCnt, activation='linear'))\n\n        opt = RMSprop(lr=0.00025)\n        model.compile(loss='mse', optimizer=opt)\n\n        return model\n\n    def train(self, x, y, epoch=1, verbose=0):\n        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)\n\n    def predict(self, s):\n        return self.model.predict(s)\n\n    def predictOne(self, s):\n        return self.predict(s.reshape(1, self.stateCnt)).flatten()\n\n#-------------------- MEMORY --------------------------\nclass Memory:   # stored as ( s, a, r, s_ )\n    samples = []\n\n    def __init__(self, capacity):\n        self.capacity = capacity\n\n    def add(self, sample):\n        self.samples.append(sample)        \n\n        if len(self.samples) > self.capacity:\n            self.samples.pop(0)\n\n    def sample(self, n):\n        n = min(n, len(self.samples))\n        return random.sample(self.samples, n)\n\n    def isFull(self):\n        return len(self.samples) >= self.capacity\n\n#-------------------- AGENT ---------------------------\nMEMORY_CAPACITY = 100000\nBATCH_SIZE = 64\n\nGAMMA = 0.99\n\nMAX_EPSILON = 1\nMIN_EPSILON = 0.1\nLAMBDA = 0.001      # speed of decay\n\nclass Agent:\n    steps = 0\n    epsilon = MAX_EPSILON\n\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.brain = Brain(stateCnt, 
actionCnt)\n        self.memory = Memory(MEMORY_CAPACITY)\n        \n    def act(self, s):\n        if random.random() < self.epsilon:\n            return random.randint(0, self.actionCnt-1)\n        else:\n            return numpy.argmax(self.brain.predictOne(s))\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        self.memory.add(sample)        \n\n        # ----- debug\n        if self.steps % 1000 == 0:\n            printQ(self)\n\n        if self.steps % 10000 == 0:\n            displayBrain(self.brain)\n\n        # slowly decrease Epsilon based on our eperience\n        self.steps += 1\n        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)\n\n    def replay(self):    \n        batch = self.memory.sample(BATCH_SIZE)\n        batchLen = len(batch)\n\n        no_state = numpy.zeros(self.stateCnt)\n\n        states = numpy.array([ o[0] for o in batch ])\n        states_ = numpy.array([ (no_state if o[3] is None else o[3]) for o in batch ])\n\n        p = agent.brain.predict(states)\n        p_ = agent.brain.predict(states_)\n\n        x = numpy.zeros((batchLen, self.stateCnt))\n        y = numpy.zeros((batchLen, self.actionCnt))\n        \n        for i in range(batchLen):\n            o = batch[i]\n            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]\n            \n            t = p[i]\n            if s_ is None:\n                t[a] = r\n            else:\n                t[a] = r + GAMMA * numpy.amax(p_[i])\n\n            x[i] = s\n            y[i] = t\n\n        self.brain.train(x, y)\n\nclass RandomAgent:\n    memory = Memory(MEMORY_CAPACITY)\n\n    def __init__(self, actionCnt):\n        self.actionCnt = actionCnt\n\n    def act(self, s):\n        return random.randint(0, self.actionCnt-1)\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        self.memory.add(sample)\n\n    def replay(self):\n        pass\n\n#-------------------- ENVIRONMENT ---------------------\nclass 
Environment:\n    def __init__(self, problem):\n        self.problem = problem\n        self.env = gym.make(problem)\n\n        high = self.env.observation_space.high\n        low = self.env.observation_space.low\n\n        self.mean = (high + low) / 2\n        self.spread = abs(high - low) / 2\n\n    def normalize(self, s):\n        return (s - self.mean) / self.spread\n\n    def run(self, agent):\n        s = self.env.reset()\n        s = self.normalize(s)\n        R = 0 \n\n        while True:            \n            # self.env.render()\n\n            a = agent.act(s)    # map actions; 0 = left, 2 = right                      \n            if a == 0: \n                a_ = 0\n            elif a == 1: \n                a_ = 2\n\n            s_, r, done, info = self.env.step(a_)\n            s_ = self.normalize(s_)\n\n            if done: # terminal state\n                s_ = None\n\n            agent.observe( (s, a, r, s_) )\n            agent.replay()            \n\n            s = s_\n            R += r\n\n            if done:\n                break\n\n        # print(\"Total reward:\", R)\n\n#-------------------- MAIN ----------------------------\nPROBLEM = 'MountainCar-v0'\nenv = Environment(PROBLEM)\n\nstateCnt  = env.env.observation_space.shape[0]\nactionCnt = 2 #env.env.action_space.n\n\nagent = Agent(stateCnt, actionCnt)\nrandomAgent = RandomAgent(actionCnt)\n\ntry:\n    while randomAgent.memory.isFull() == False:\n        env.run(randomAgent)\n\n    agent.memory = randomAgent.memory\n    randomAgent = None\n\n    while True:\n        env.run(agent)\nfinally:\n    agent.brain.model.save(\"MountainCar-basic.h5\")\n"
  },
  {
    "path": "Seaquest-DDQN-PER.py",
    "content": "# OpenGym Seaquest-v0\n# -------------------\n#\n# This code demonstrates a Double DQN network with Priority Experience Replay\n# in an OpenGym Seaquest-v0 environment.\n#\n# Made as part of blog series Let's make a DQN, available at: \n# https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/\n# \n# author: Jaromir Janisch, 2016\n\nimport random, numpy, math, gym, scipy\nfrom SumTree import SumTree\n\nIMAGE_WIDTH = 84\nIMAGE_HEIGHT = 84\nIMAGE_STACK = 2\n\nHUBER_LOSS_DELTA = 2.0\nLEARNING_RATE = 0.00025\n\n#-------------------- UTILITIES -----------------------\ndef huber_loss(y_true, y_pred):\n    err = y_true - y_pred\n\n    cond = K.abs(err) < HUBER_LOSS_DELTA\n    L2 = 0.5 * K.square(err)\n    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)\n\n    loss = tf.where(cond, L2, L1)   # Keras does not cover where function in tensorflow :-(\n\n    return K.mean(loss)\n\ndef processImage( img ):\n    rgb = scipy.misc.imresize(img, (IMAGE_WIDTH, IMAGE_HEIGHT), interp='bilinear')\n\n    r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]\n    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b     # extract luminance\n\n    o = gray.astype('float32') / 128 - 1    # normalize\n    return o\n\n#-------------------- BRAIN ---------------------------\nfrom keras.models import Sequential\nfrom keras.layers import *\nfrom keras.optimizers import *\n\nclass Brain:\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.model = self._createModel()\n        self.model_ = self._createModel()  # target network\n\n    def _createModel(self):\n        model = Sequential()\n\n        model.add(Conv2D(32, (8, 8), strides=(4,4), activation='relu', input_shape=(self.stateCnt), data_format='channels_first'))\n        model.add(Conv2D(64, (4, 4), strides=(2,2), activation='relu'))\n        model.add(Conv2D(64, (3, 3), activation='relu'))\n        
model.add(Flatten())\n        model.add(Dense(units=512, activation='relu'))\n\n        model.add(Dense(units=actionCnt, activation='linear'))\n\n        opt = RMSprop(lr=LEARNING_RATE)\n        model.compile(loss=huber_loss, optimizer=opt)\n\n        return model\n\n    def train(self, x, y, epochs=1, verbose=0):\n        self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)\n\n    def predict(self, s, target=False):\n        if target:\n            return self.model_.predict(s)\n        else:\n            return self.model.predict(s)\n\n    def predictOne(self, s, target=False):\n        return self.predict(s.reshape(1, IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT), target).flatten()\n\n    def updateTargetModel(self):\n        self.model_.set_weights(self.model.get_weights())\n\n#-------------------- MEMORY --------------------------\nclass Memory:   # stored as ( s, a, r, s_ ) in SumTree\n    e = 0.01\n    a = 0.6\n\n    def __init__(self, capacity):\n        self.tree = SumTree(capacity)\n\n    def _getPriority(self, error):\n        return (error + self.e) ** self.a\n\n    def add(self, error, sample):\n        p = self._getPriority(error)\n        self.tree.add(p, sample) \n\n    def sample(self, n):\n        batch = []\n        segment = self.tree.total() / n\n\n        for i in range(n):\n            a = segment * i\n            b = segment * (i + 1)\n\n            s = random.uniform(a, b)\n            (idx, p, data) = self.tree.get(s)\n            batch.append( (idx, data) )\n\n        return batch\n\n    def update(self, idx, error):\n        p = self._getPriority(error)\n        self.tree.update(idx, p)\n\n#-------------------- AGENT ---------------------------\nMEMORY_CAPACITY = 200000\n\nBATCH_SIZE = 32\n\nGAMMA = 0.99\n\nMAX_EPSILON = 1\nMIN_EPSILON = 0.1\n\nEXPLORATION_STOP = 500000   # at this step epsilon will be 0.01\nLAMBDA = - math.log(0.01) / EXPLORATION_STOP  # speed of decay\n\nUPDATE_TARGET_FREQUENCY = 10000\n\nclass Agent:\n    
steps = 0\n    epsilon = MAX_EPSILON\n\n    def __init__(self, stateCnt, actionCnt):\n        self.stateCnt = stateCnt\n        self.actionCnt = actionCnt\n\n        self.brain = Brain(stateCnt, actionCnt)\n        # self.memory = Memory(MEMORY_CAPACITY)\n        \n    def act(self, s):\n        if random.random() < self.epsilon:\n            return random.randint(0, self.actionCnt-1)\n        else:\n            return numpy.argmax(self.brain.predictOne(s))\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        x, y, errors = self._getTargets([(0, sample)])\n        self.memory.add(errors[0], sample)\n\n        if self.steps % UPDATE_TARGET_FREQUENCY == 0:\n            self.brain.updateTargetModel()\n\n        # slowly decrease Epsilon based on our experience\n        self.steps += 1\n        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)\n\n    def _getTargets(self, batch):\n        no_state = numpy.zeros(self.stateCnt)\n\n        states = numpy.array([ o[1][0] for o in batch ])\n        states_ = numpy.array([ (no_state if o[1][3] is None else o[1][3]) for o in batch ])\n\n        p = agent.brain.predict(states)\n\n        p_ = agent.brain.predict(states_, target=False)\n        pTarget_ = agent.brain.predict(states_, target=True)\n\n        x = numpy.zeros((len(batch), IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT))\n        y = numpy.zeros((len(batch), self.actionCnt))\n        errors = numpy.zeros(len(batch))\n        \n        for i in range(len(batch)):\n            o = batch[i][1]\n            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]\n            \n            t = p[i]\n            oldVal = t[a]\n            if s_ is None:\n                t[a] = r\n            else:\n                t[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ]  # double DQN\n\n            x[i] = s\n            y[i] = t\n            errors[i] = abs(oldVal - t[a])\n\n        return (x, y, errors)\n\n    def replay(self):    \n        
batch = self.memory.sample(BATCH_SIZE)\n        x, y, errors = self._getTargets(batch)\n\n        #update errors\n        for i in range(len(batch)):\n            idx = batch[i][0]\n            self.memory.update(idx, errors[i])\n\n        self.brain.train(x, y)\n\nclass RandomAgent:\n    memory = Memory(MEMORY_CAPACITY)\n    exp = 0\n\n    def __init__(self, actionCnt):\n        self.actionCnt = actionCnt\n\n    def act(self, s):\n        return random.randint(0, self.actionCnt-1)\n\n    def observe(self, sample):  # in (s, a, r, s_) format\n        error = abs(sample[2])  # reward\n        self.memory.add(error, sample)\n        self.exp += 1\n\n    def replay(self):\n        pass\n\n#-------------------- ENVIRONMENT ---------------------\nclass Environment:\n    def __init__(self, problem):\n        self.problem = problem\n        self.env = gym.make(problem)\n\n    def run(self, agent):                \n        img = self.env.reset()\n        w = processImage(img)\n        s = numpy.array([w, w])\n\n        R = 0\n        while True:         \n            # self.env.render()\n            a = agent.act(s)\n\n            r = 0\n            img, r, done, info = self.env.step(a)\n            s_ = numpy.array([s[1], processImage(img)]) #last two screens\n\n            r = np.clip(r, -1, 1)   # clip reward to [-1, 1]\n\n            if done: # terminal state\n                s_ = None\n\n            agent.observe( (s, a, r, s_) )\n            agent.replay()            \n\n            s = s_\n            R += r\n\n            if done:\n                break\n\n        print(\"Total reward:\", R)\n\n#-------------------- MAIN ----------------------------\nPROBLEM = 'Seaquest-v0'\nenv = Environment(PROBLEM)\n\nstateCnt  = (IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT)\nactionCnt = env.env.action_space.n\n\nagent = Agent(stateCnt, actionCnt)\nrandomAgent = RandomAgent(actionCnt)\n\ntry:\n    print(\"Initialization with random agent...\")\n    while randomAgent.exp < 
MEMORY_CAPACITY:\n        env.run(randomAgent)\n        print(randomAgent.exp, \"/\", MEMORY_CAPACITY)\n\n    agent.memory = randomAgent.memory\n\n    randomAgent = None\n\n    print(\"Starting learning\")\n    while True:\n        env.run(agent)\nfinally:\n    agent.brain.model.save(\"Seaquest-DQN-PER.h5\")\n"
  },
  {
    "path": "SumTree.py",
    "content": "import numpy\n\nclass SumTree:\n    write = 0\n\n    def __init__(self, capacity):\n        self.capacity = capacity\n        self.tree = numpy.zeros( 2*capacity - 1 )\n        self.data = numpy.zeros( capacity, dtype=object )\n\n    def _propagate(self, idx, change):\n        parent = (idx - 1) // 2\n\n        self.tree[parent] += change\n\n        if parent != 0:\n            self._propagate(parent, change)\n\n    def _retrieve(self, idx, s):\n        left = 2 * idx + 1\n        right = left + 1\n\n        if left >= len(self.tree):\n            return idx\n\n        if s <= self.tree[left]:\n            return self._retrieve(left, s)\n        else:\n            return self._retrieve(right, s-self.tree[left])\n\n    def total(self):\n        return self.tree[0]\n\n    def add(self, p, data):\n        idx = self.write + self.capacity - 1\n\n        self.data[self.write] = data\n        self.update(idx, p)\n\n        self.write += 1\n        if self.write >= self.capacity:\n            self.write = 0\n\n    def update(self, idx, p):\n        change = p - self.tree[idx]\n\n        self.tree[idx] = p\n        self._propagate(idx, change)\n\n    def get(self, s):\n        idx = self._retrieve(0, s)\n        dataIdx = idx - self.capacity + 1\n\n        return (idx, self.tree[idx], self.data[dataIdx])"
  }
]