Repository: jaromiru/AI-blog
Branch: master
Commit: 348628b10505
Files: 7
Total size: 34.7 KB
Directory structure:
gitextract_jgc2t_gv/
├── CartPole-A3C.py
├── CartPole-DQN.py
├── CartPole-basic.py
├── LICENSE
├── MountainCar-basic.py
├── Seaquest-DDQN-PER.py
└── SumTree.py
================================================
FILE CONTENTS
================================================
================================================
FILE: CartPole-A3C.py
================================================
# OpenGym CartPole-v0 with A3C on GPU
# -----------------------------------
#
# A3C implementation with GPU optimizer threads.
#
# Made as part of blog series Let's make an A3C, available at
# https://jaromiru.com/2017/02/16/lets-make-an-a3c-theory/
#
# author: Jaromir Janisch, 2017
import numpy as np
import tensorflow as tf
import gym, time, random, threading
from keras.models import *
from keras.layers import *
from keras import backend as K
#-- constants
# Gym environment id; every Environment thread creates its own instance of it.
ENV = 'CartPole-v0'
# Seconds the main thread sleeps while the worker threads are training.
RUN_TIME = 30
# Number of Environment (actor) threads started by the main script.
THREADS = 8
# Number of Optimizer threads started by the main script.
OPTIMIZERS = 2
# Seconds an Environment thread sleeps before every step (yields to other threads).
THREAD_DELAY = 0.001
# Per-step discount factor.
GAMMA = 0.99
# Number of steps accumulated into a single n-step return.
N_STEP_RETURN = 8
# Discount applied to the value estimate of the state reached after N_STEP_RETURN steps.
GAMMA_N = GAMMA ** N_STEP_RETURN
# Default exploration schedule: epsilon is interpolated linearly from EPS_START
# to EPS_STOP over the first EPS_STEPS frames, then stays at EPS_STOP.
EPS_START = 0.4
EPS_STOP = .15
EPS_STEPS = 75000
# Minimum number of queued samples before Brain.optimize runs a training step.
MIN_BATCH = 32
# Learning rate handed to the RMSProp optimizer.
LEARNING_RATE = 5e-3
LOSS_V = .5 # weight of the value (critic) loss in the total loss
LOSS_ENTROPY = .01 # weight of the entropy regularization term in the total loss
#---------
class Brain:
    """Shared actor-critic network and the queue of samples waiting for training.

    Environment threads feed samples through ``train_push``; Optimizer threads
    drain them through ``optimize``. Relies on the module-level names
    NUM_STATE, NUM_ACTIONS and NONE_STATE being defined before construction.
    """

    def __init__(self):
        # Queue and lock are per-instance state. They used to be mutable class
        # attributes, which every Brain instance would have silently shared.
        self.train_queue = [[], [], [], [], []]  # s, a, r, s', s' terminal mask
        self.lock_queue = threading.Lock()

        self.session = tf.Session()
        K.set_session(self.session)
        K.manual_variable_initialization(True)

        self.model = self._build_model()
        self.graph = self._build_graph(self.model)

        self.session.run(tf.global_variables_initializer())
        self.default_graph = tf.get_default_graph()

        self.default_graph.finalize()  # avoid modifications

    def _build_model(self):
        """Build the Keras model: one hidden layer feeding a policy head and a value head."""
        l_input = Input(batch_shape=(None, NUM_STATE))
        l_dense = Dense(16, activation='relu')(l_input)

        out_actions = Dense(NUM_ACTIONS, activation='softmax')(l_dense)
        out_value = Dense(1, activation='linear')(l_dense)

        model = Model(inputs=[l_input], outputs=[out_actions, out_value])
        model._make_predict_function()  # have to initialize before threading

        return model

    def _build_graph(self, model):
        """Build the loss and the training op.

        Returns the tuple ``(s_t, a_t, r_t, minimize)``: the three input
        placeholders (states, one-hot actions, n-step returns) and the op that
        performs one RMSProp step on the total loss.
        """
        s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))
        a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
        r_t = tf.placeholder(tf.float32, shape=(None, 1))  # not immediate, but discounted n step reward

        p, v = model(s_t)

        # probability of the action that was actually taken; epsilon avoids log(0)
        log_prob = tf.log(tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10)
        advantage = r_t - v

        # the advantage is treated as a constant in the policy term
        loss_policy = - log_prob * tf.stop_gradient(advantage)  # maximize policy
        loss_value = LOSS_V * tf.square(advantage)  # minimize value error
        # sum(p * log p) is the negative entropy, so minimizing it maximizes entropy
        entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keep_dims=True)

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

        optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize

    def optimize(self):
        """Train on everything queued so far, if at least MIN_BATCH samples are waiting."""
        if len(self.train_queue[0]) < MIN_BATCH:
            time.sleep(0)  # yield
            return

        with self.lock_queue:
            if len(self.train_queue[0]) < MIN_BATCH:  # more thread could have passed without lock
                return  # we can't yield inside lock

            # take ownership of the queued samples and leave an empty queue behind
            s, a, r, s_, s_mask = self.train_queue
            self.train_queue = [[], [], [], [], []]

        s = np.vstack(s)
        a = np.vstack(a)
        r = np.vstack(r)
        s_ = np.vstack(s_)
        s_mask = np.vstack(s_mask)

        if len(s) > 5*MIN_BATCH: print("Optimizer alert! Minimizing batch of %d" % len(s))

        # bootstrap the n-step return with the value of the last state
        v = self.predict_v(s_)
        r = r + GAMMA_N * v * s_mask  # set v to 0 where s_ is terminal state

        s_t, a_t, r_t, minimize = self.graph
        self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})

    def train_push(self, s, a, r, s_):
        """Queue one sample; ``s_`` is None when the sample ends in a terminal state."""
        with self.lock_queue:
            self.train_queue[0].append(s)
            self.train_queue[1].append(a)
            self.train_queue[2].append(r)

            if s_ is None:
                # placeholder state plus mask 0 so its value is ignored in optimize()
                self.train_queue[3].append(NONE_STATE)
                self.train_queue[4].append(0.)
            else:
                self.train_queue[3].append(s_)
                self.train_queue[4].append(1.)

    def predict(self, s):
        """Return ``(p, v)``: action probabilities and state values for the batch ``s``."""
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        """Return only the action probabilities for the batch ``s``."""
        return self.predict(s)[0]

    def predict_v(self, s):
        """Return only the state values for the batch ``s``."""
        return self.predict(s)[1]
#---------
# Global count of Agent.act calls across all agents; Agent.getEpsilon reads it
# to position the exploration schedule.
frames = 0
class Agent:
    """Epsilon-greedy actor that converts its experience into n-step samples.

    Relies on the module-level names ``brain``, ``frames``, NUM_ACTIONS,
    GAMMA, GAMMA_N and N_STEP_RETURN.
    """

    def __init__(self, eps_start, eps_end, eps_steps):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_steps = eps_steps

        self.memory = []  # used for n_step return
        self.R = 0.       # running discounted return of the samples in memory

    def getEpsilon(self):
        """Return the exploration rate for the current global frame count."""
        if frames >= self.eps_steps:
            return self.eps_end
        # linearly interpolate
        return self.eps_start + frames * (self.eps_end - self.eps_start) / self.eps_steps

    def act(self, s):
        """Pick an action for state ``s`` and advance the global frame counter."""
        global frames
        eps = self.getEpsilon()
        frames = frames + 1

        if random.random() < eps:
            return random.randint(0, NUM_ACTIONS-1)

        s = np.array([s])
        p = brain.predict_p(s)[0]
        # sample from the policy instead of taking the argmax
        return np.random.choice(NUM_ACTIONS, p=p)

    def _get_sample(self, n):
        """Return (first state, first action, current return, state after n steps)."""
        s, a, _, _ = self.memory[0]
        _, _, _, s_ = self.memory[n-1]
        return s, a, self.R, s_

    def train(self, s, a, r, s_):
        """Record one transition and push every n-step sample that is now complete.

        ``s_`` is None when the transition ended the episode.
        """
        a_cats = np.zeros(NUM_ACTIONS)  # turn action into one-hot representation
        a_cats[a] = 1

        self.memory.append((s, a_cats, r, s_))

        # Incremental update: once N_STEP_RETURN rewards are held, R equals
        # r0 + GAMMA*r1 + ... + GAMMA^(N-1)*r(N-1).
        self.R = (self.R + r * GAMMA_N) / GAMMA

        if s_ is None:
            # With only n < N rewards accumulated, the update above leaves R
            # scaled by GAMMA^(N-n). Undo that so short episodes also push
            # correctly discounted returns.
            missing = N_STEP_RETURN - len(self.memory)
            if missing > 0:
                self.R = self.R / (GAMMA ** missing)

            # flush every remaining (shorter) return of the finished episode
            while len(self.memory) > 0:
                n = len(self.memory)
                s, a, r, s_ = self._get_sample(n)
                brain.train_push(s, a, r, s_)

                self.R = (self.R - self.memory[0][2]) / GAMMA
                self.memory.pop(0)

            self.R = 0

        if len(self.memory) >= N_STEP_RETURN:
            s, a, r, s_ = self._get_sample(N_STEP_RETURN)
            brain.train_push(s, a, r, s_)

            # drop the oldest reward; the division by GAMMA happens on the next update
            self.R = self.R - self.memory[0][2]
            self.memory.pop(0)
#---------
class Environment(threading.Thread):
    """Worker thread that plays episodes of ENV with its own Agent."""

    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        super().__init__()

        self.render = render
        self.env = gym.make(ENV)
        self.agent = Agent(eps_start, eps_end, eps_steps)

    def runEpisode(self):
        """Play one episode, feeding every transition to the agent, then print its reward sum."""
        state = self.env.reset()
        total_reward = 0
        finished = False

        while not finished:
            time.sleep(THREAD_DELAY)  # yield

            if self.render:
                self.env.render()

            action = self.agent.act(state)
            observation, reward, done, info = self.env.step(action)

            # a terminal transition is marked by a missing next state
            next_state = None if done else observation

            self.agent.train(state, action, reward, next_state)

            state = next_state
            total_reward += reward

            # the episode also ends early when the thread was asked to stop
            finished = done or self.stop_signal

        print("Total R:", total_reward)

    def run(self):
        """Thread body: keep playing episodes until stop() is called."""
        while True:
            if self.stop_signal:
                break
            self.runEpisode()

    def stop(self):
        """Ask the thread to finish; takes effect at the next check of stop_signal."""
        self.stop_signal = True
#---------
class Optimizer(threading.Thread):
    """Worker thread that repeatedly calls the global brain's optimize()."""

    stop_signal = False

    def __init__(self):
        super().__init__()

    def run(self):
        """Thread body: optimize until stop() is called."""
        while True:
            if self.stop_signal:
                break
            brain.optimize()

    def stop(self):
        """Ask the thread to finish after its current optimize() call."""
        self.stop_signal = True
#-- main
# Rendering, purely greedy environment; also used to read the problem dimensions.
env_test = Environment(render=True, eps_start=0., eps_end=0.)
NUM_STATE = env_test.env.observation_space.shape[0]
NUM_ACTIONS = env_test.env.action_space.n
NONE_STATE = np.zeros(NUM_STATE)

brain = Brain()  # brain is global in A3C

envs = [Environment() for _ in range(THREADS)]
opts = [Optimizer() for _ in range(OPTIMIZERS)]

# optimizers first, so they are already draining the queue when actors start
for optimizer_thread in opts:
    optimizer_thread.start()

for env_thread in envs:
    env_thread.start()

time.sleep(RUN_TIME)

# stop the actors and wait for them before stopping the optimizers
for env_thread in envs:
    env_thread.stop()
for env_thread in envs:
    env_thread.join()

for optimizer_thread in opts:
    optimizer_thread.stop()
for optimizer_thread in opts:
    optimizer_thread.join()

print("Training finished")
# run() is called directly, so the test environment plays in the main thread
env_test.run()
================================================
FILE: CartPole-DQN.py
================================================
# OpenGym CartPole-v0
# -------------------
#
# This code demonstrates use of a full DQN implementation
# to solve OpenGym CartPole-v0 problem.
#
# Made as part of blog series Let's make a DQN, available at:
# https://jaromiru.com/2016/09/27/lets-make-a-dqn-theory/
#
# author: Jaromir Janisch, 2016
import random, numpy, math, gym, sys
from keras import backend as K
import tensorflow as tf
#----------
# Absolute error below which huber_loss is quadratic and at or above which it is linear.
HUBER_LOSS_DELTA = 1.0
# Learning rate passed to RMSprop in Brain._createModel.
LEARNING_RATE = 0.00025
#----------
def huber_loss(y_true, y_pred):
    """Return the mean Huber loss between ``y_true`` and ``y_pred``.

    Errors with magnitude below HUBER_LOSS_DELTA are penalized quadratically,
    all others linearly.
    """
    diff = y_true - y_pred
    magnitude = K.abs(diff)

    quadratic = 0.5 * K.square(diff)
    linear = HUBER_LOSS_DELTA * (magnitude - 0.5 * HUBER_LOSS_DELTA)

    # Keras does not cover where function in tensorflow :-(
    return K.mean(tf.where(magnitude < HUBER_LOSS_DELTA, quadratic, linear))
#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
class Brain:
    """Online Q-network plus a target network.

    ``model`` is the network that gets trained; ``model_`` is the target
    network, whose weights only change in ``updateTargetModel``.
    """

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel()

    def _createModel(self):
        """Build and compile one network: a 64-unit hidden layer and one linear output per action."""
        model = Sequential()

        # Sizes come from this instance rather than from module-level globals
        # that merely happen to share the names stateCnt/actionCnt.
        model.add(Dense(units=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        """Fit the online network on inputs ``x`` and targets ``y``."""
        self.model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        """Predict Q-values for the batch ``s`` with the target (``target=True``) or online network."""
        if target:
            return self.model_.predict(s)
        return self.model.predict(s)

    def predictOne(self, s, target=False):
        """Predict Q-values for a single state and return them as a flat array."""
        return self.predict(s.reshape(1, self.stateCnt), target=target).flatten()

    def updateTargetModel(self):
        """Copy the online network's weights into the target network."""
        self.model_.set_weights(self.model.get_weights())
#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO replay buffer of transition tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list. A class-level ``samples = []`` would be one list
        # shared by every Memory object.
        self.samples = []

    def add(self, sample):
        """Append ``sample``, discarding the oldest entry once capacity is exceeded."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        """Return True once the buffer holds at least ``capacity`` samples."""
        return len(self.samples) >= self.capacity
#-------------------- AGENT ---------------------------
# Capacity of the replay memories created by Agent and RandomAgent.
MEMORY_CAPACITY = 100000
# Number of samples Agent.replay requests from memory per training step.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Epsilon decays exponentially from MAX_EPSILON towards MIN_EPSILON.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001 # speed of decay
# The target network is refreshed every this many observed steps.
UPDATE_TARGET_FREQUENCY = 1000
class Agent:
    """DQN agent: epsilon-greedy acting, replay memory and target-network learning."""

    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample``, refresh the target network when due and decay epsilon."""
        self.memory.add(sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # debug the Q function in point S
        if self.steps % 100 == 0:
            S = numpy.array([-0.01335408, -0.04600273, -0.00677248, 0.01517507])
            # use this agent's own brain; the global name ``agent`` need not
            # exist or refer to this instance
            pred = self.brain.predictOne(S)
            print(pred[0])
            sys.stdout.flush()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and train the online network on its Q-targets."""
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        # stand-in for the missing next state of terminal transitions
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([o[0] for o in batch])
        states_ = numpy.array([(no_state if o[3] is None else o[3]) for o in batch])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))

        for i in range(batchLen):
            o = batch[i]
            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]

            # start from the current prediction so only the taken action gets a new target
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
class RandomAgent:
    """Agent that acts uniformly at random and only records what it observes."""

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt
        # Created per instance. As a class attribute the buffer was allocated
        # at class-definition time and shared by every RandomAgent.
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a uniformly random action index; the state is ignored."""
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample`` in the replay memory."""
        self.memory.add(sample)

    def replay(self):
        """Do nothing; present so Environment.run can treat both agent types alike."""
        pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
    """Thin wrapper that runs one gym episode at a time with a given agent."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode, letting ``agent`` act, observe and replay on every step."""
        state = self.env.reset()
        total_reward = 0
        done = False

        while not done:
            # self.env.render()

            action = agent.act(state)
            observation, reward, done, info = self.env.step(action)

            # a terminal transition is marked by a missing next state
            next_state = None if done else observation

            agent.observe((state, action, reward, next_state))
            agent.replay()

            state = next_state
            total_reward += reward

        # print("Total reward:", total_reward)
#-------------------- MAIN ----------------------------
PROBLEM = 'CartPole-v0'
env = Environment(PROBLEM)

stateCnt = env.env.observation_space.shape[0]
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    # fill the replay memory with random experience before learning starts
    while not randomAgent.memory.isFull():
        env.run(randomAgent)

    # hand the collected samples over to the learning agent
    agent.memory.samples = randomAgent.memory.samples
    randomAgent = None

    # train until interrupted
    while True:
        env.run(agent)
finally:
    # the model is saved however the loop ends
    agent.brain.model.save("cartpole-dqn.h5")
================================================
FILE: CartPole-basic.py
================================================
# OpenGym CartPole-v0
# -------------------
#
# This code demonstrates use of a basic Q-network (without target network)
# to solve OpenGym CartPole-v0 problem.
#
# Made as part of blog series Let's make a DQN, available at:
# https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
#
# author: Jaromir Janisch, 2016
#--- enable this to run on GPU
# import os
# os.environ['THEANO_FLAGS'] = "device=gpu,floatX=float32"
import random, numpy, math, gym
#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
class Brain:
    """Single Q-network (no target network) for CartPole."""

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("cartpole-basic.h5")

    def _createModel(self):
        """Build and compile the network: a 64-unit hidden layer and one linear output per action."""
        model = Sequential()

        # Sizes come from this instance, not from module-level globals, and the
        # layers use the Keras 2 ``units`` keyword like the sibling DQN scripts.
        model.add(Dense(units=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        """Fit the network on inputs ``x`` and targets ``y`` for ``epoch`` epochs."""
        # Keras 2 renamed ``nb_epoch`` to ``epochs``
        self.model.fit(x, y, batch_size=64, epochs=epoch, verbose=verbose)

    def predict(self, s):
        """Predict Q-values for the batch of states ``s``."""
        return self.model.predict(s)

    def predictOne(self, s):
        """Predict Q-values for a single state and return them as a flat array."""
        return self.predict(s.reshape(1, self.stateCnt)).flatten()
#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO replay buffer of transition tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list. A class-level ``samples = []`` would be one list
        # shared by every Memory object.
        self.samples = []

    def add(self, sample):
        """Append ``sample``, discarding the oldest entry once capacity is exceeded."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)
#-------------------- AGENT ---------------------------
# Capacity of the replay memory created by Agent.
MEMORY_CAPACITY = 100000
# Number of samples Agent.replay requests from memory per training step.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Epsilon decays exponentially from MAX_EPSILON towards MIN_EPSILON.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAMBDA = 0.001 # speed of decay
class Agent:
    """Q-learning agent with replay memory and an exponentially decaying epsilon."""

    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        explore = random.random() < self.epsilon
        if explore:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample`` and decay epsilon by one step."""
        self.memory.add(sample)

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and train the network on its Q-targets."""
        batch = self.memory.sample(BATCH_SIZE)
        count = len(batch)

        # stand-in for the missing next state of terminal transitions
        blank = numpy.zeros(self.stateCnt)

        states = numpy.array([entry[0] for entry in batch])
        next_states = numpy.array([(blank if entry[3] is None else entry[3]) for entry in batch])

        q_now = self.brain.predict(states)
        q_next = self.brain.predict(next_states)

        x = numpy.zeros((count, self.stateCnt))
        y = numpy.zeros((count, self.actionCnt))

        for row, entry in enumerate(batch):
            state, action, reward, next_state = entry[0], entry[1], entry[2], entry[3]

            # start from the current prediction so only the taken action gets a new target
            target = q_now[row]
            if next_state is None:
                target[action] = reward
            else:
                target[action] = reward + GAMMA * numpy.amax(q_next[row])

            x[row] = state
            y[row] = target

        self.brain.train(x, y)
#-------------------- ENVIRONMENT ---------------------
class Environment:
    """Thin wrapper that runs one rendered gym episode at a time with a given agent."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode with ``agent`` and print the total reward collected."""
        state = self.env.reset()
        total_reward = 0
        done = False

        while not done:
            self.env.render()

            action = agent.act(state)
            observation, reward, done, info = self.env.step(action)

            # a terminal transition is marked by a missing next state
            next_state = None if done else observation

            agent.observe((state, action, reward, next_state))
            agent.replay()

            state = next_state
            total_reward += reward

        print("Total reward:", total_reward)
#-------------------- MAIN ----------------------------
PROBLEM = 'CartPole-v0'
env = Environment(PROBLEM)

# problem dimensions, read from the gym spaces
stateCnt = env.env.observation_space.shape[0]
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)

try:
    # train until interrupted
    while True:
        env.run(agent)
finally:
    # the model is saved however the loop ends
    agent.brain.model.save("cartpole-basic.h5")
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2018 Jaromír Janisch
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MountainCar-basic.py
================================================
# OpenGym MountainCar-v0
# -------------------
#
# This code demonstrates debugging of a basic Q-network (without target network)
# in an OpenGym MountainCar-v0 environment.
#
# Made as part of blog series Let's make a DQN, available at:
# https://jaromiru.com/2016/10/12/lets-make-a-dqn-debugging/
#
# author: Jaromir Janisch, 2016
#--- enable this to run on GPU
# import os
# os.environ['THEANO_FLAGS'] = "device=gpu,floatX=float32"
import random, numpy, math, gym
#-------------------- UTILITIES -----------------------
import matplotlib.pyplot as plt
from matplotlib import colors
import sys
def printQ(agent):
    """Print the second Q-value of four fixed probe states on one line, ended by ';'."""
    probes = numpy.array([
        [-0.15955113, 0.        ],  # s_start
        [ 0.83600049, 0.27574312],  # s'' -> s'
        [ 0.85796947, 0.28245832],  # s' -> s
        [ 0.88062271, 0.29125591],  # s -> terminal
    ])

    predictions = agent.brain.predict(probes)

    # each value is followed by a single space, exactly as when written one by one
    sys.stdout.write("".join(str(row[1]) + " " for row in predictions))

    print(";")
    sys.stdout.flush()
def mapBrain(brain, res):
    """Evaluate ``brain`` on a ``res`` x ``res`` grid of 2-D states.

    Grid coordinates are ``2 * (i - res / 2) / res`` for ``i`` in
    ``range(res)`` on both axes, i.e. they start at -1 and stay below 1.

    Returns ``(mapV, mapA)``, both of shape ``(res, res)``: the maximum
    predicted value and the index of the maximizing action at each grid point.
    The first axis follows the first state component.
    """
    axis = 2 * (numpy.arange(res) - res / 2) / res

    # row i1 * res + i2 holds (axis[i1], axis[i2])
    s = numpy.column_stack((numpy.repeat(axis, res), numpy.tile(axis, res)))

    # one forward pass serves both maps (previously predict() ran twice)
    q = brain.predict(s)

    mapV = numpy.amax(q, axis=1).reshape((res, res))
    mapA = numpy.argmax(q, axis=1).reshape((res, res))

    return (mapV, mapA)
def displayBrain(brain, res=50):
    """Plot the value map (top) and the greedy-action map (bottom) of ``brain``."""
    value_map, action_map = mapBrain(brain, res)

    # two-colour scale for the action map: action 0 is blue, action 1 is red
    action_colors = colors.ListedColormap(['blue', 'red'])
    action_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], action_colors.N)

    plt.close()
    plt.show()

    figure = plt.figure(figsize=(5, 7))

    figure.add_subplot(211)
    plt.imshow(value_map)
    plt.colorbar(orientation='vertical')

    figure.add_subplot(212)
    plt.imshow(action_map, cmap=action_colors, norm=action_norm)
    plt.colorbar(orientation='vertical', ticks=[0, 1])

    plt.pause(0.001)
#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
class Brain:
    """Single Q-network (no target network) for MountainCar."""

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("MountainCar-basic.h5")

    def _createModel(self):
        """Build and compile the network: a 64-unit hidden layer and one linear output per action."""
        model = Sequential()

        # Sizes come from this instance, not from module-level globals, and the
        # layers use the Keras 2 ``units`` keyword like the sibling DQN scripts.
        model.add(Dense(units=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        """Fit the network on inputs ``x`` and targets ``y`` for ``epoch`` epochs."""
        # Keras 2 renamed ``nb_epoch`` to ``epochs``
        self.model.fit(x, y, batch_size=64, epochs=epoch, verbose=verbose)

    def predict(self, s):
        """Predict Q-values for the batch of states ``s``."""
        return self.model.predict(s)

    def predictOne(self, s):
        """Predict Q-values for a single state and return them as a flat array."""
        return self.predict(s.reshape(1, self.stateCnt)).flatten()
#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO replay buffer of transition tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list. A class-level ``samples = []`` would be one list
        # shared by every Memory object.
        self.samples = []

    def add(self, sample):
        """Append ``sample``, discarding the oldest entry once capacity is exceeded."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

    def isFull(self):
        """Return True once the buffer holds at least ``capacity`` samples."""
        return len(self.samples) >= self.capacity
#-------------------- AGENT ---------------------------
# Capacity of the replay memories created by Agent and RandomAgent.
MEMORY_CAPACITY = 100000
# Number of samples Agent.replay requests from memory per training step.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Epsilon decays exponentially from MAX_EPSILON towards MIN_EPSILON.
MAX_EPSILON = 1
MIN_EPSILON = 0.1
LAMBDA = 0.001 # speed of decay
class Agent:
    """Q-learning agent with replay memory plus periodic debug output of its Q-function."""

    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample``, emit debug output when due and decay epsilon."""
        self.memory.add(sample)

        # ----- debug
        if self.steps % 1000 == 0:
            printQ(self)

        if self.steps % 10000 == 0:
            displayBrain(self.brain)

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and train the network on its Q-targets."""
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        # stand-in for the missing next state of terminal transitions
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([o[0] for o in batch])
        states_ = numpy.array([(no_state if o[3] is None else o[3]) for o in batch])

        # predict with this agent's own brain; the global name ``agent`` need
        # not exist or refer to this instance
        p = self.brain.predict(states)
        p_ = self.brain.predict(states_)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))

        for i in range(batchLen):
            o = batch[i]
            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]

            # start from the current prediction so only the taken action gets a new target
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
class RandomAgent:
    """Agent that acts uniformly at random and only records what it observes."""

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt
        # Created per instance. As a class attribute the buffer was allocated
        # at class-definition time and shared by every RandomAgent.
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a uniformly random action index; the state is ignored."""
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample`` in the replay memory."""
        self.memory.add(sample)

    def replay(self):
        """Do nothing; present so Environment.run can treat both agent types alike."""
        pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
    """Gym wrapper that normalizes observations and maps the agent's two actions."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

        upper = self.env.observation_space.high
        lower = self.env.observation_space.low

        # centre and half-width of the observation range, used by normalize()
        self.mean = (upper + lower) / 2
        self.spread = abs(upper - lower) / 2

    def normalize(self, s):
        """Shift and scale ``s`` by the centre and half-width of the observation bounds."""
        centered = s - self.mean
        return centered / self.spread

    def run(self, agent):
        """Play one episode, letting ``agent`` act, observe and replay on every step."""
        state = self.normalize(self.env.reset())
        total_reward = 0
        done = False

        while not done:
            # self.env.render()

            action = agent.act(state)

            # map actions; 0 = left, 2 = right
            if action == 0:
                env_action = 0
            elif action == 1:
                env_action = 2

            observation, reward, done, info = self.env.step(env_action)
            next_state = self.normalize(observation)

            if done:  # terminal state
                next_state = None

            # the agent sees its own action index, not the mapped one
            agent.observe((state, action, reward, next_state))
            agent.replay()

            state = next_state
            total_reward += reward

        # print("Total reward:", total_reward)
#-------------------- MAIN ----------------------------
PROBLEM = 'MountainCar-v0'
env = Environment(PROBLEM)

stateCnt = env.env.observation_space.shape[0]
# only two actions are exposed to the agents; Environment.run maps them to 0 and 2
actionCnt = 2  # env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    # fill the replay memory with random experience before learning starts
    while not randomAgent.memory.isFull():
        env.run(randomAgent)

    # the learning agent takes over the filled memory object
    agent.memory = randomAgent.memory
    randomAgent = None

    # train until interrupted
    while True:
        env.run(agent)
finally:
    # the model is saved however the loop ends
    agent.brain.model.save("MountainCar-basic.h5")
================================================
FILE: Seaquest-DDQN-PER.py
================================================
# OpenGym Seaquest-v0
# -------------------
#
# This code demonstrates a Double DQN network with Priority Experience Replay
# in an OpenGym Seaquest-v0 environment.
#
# Made as part of blog series Let's make a DQN, available at:
# https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
#
# author: Jaromir Janisch, 2016
import random, numpy, math, gym, scipy
from SumTree import SumTree
# Size that processImage resizes every frame to.
IMAGE_WIDTH = 84
IMAGE_HEIGHT = 84
# Number of consecutive processed frames that make up one state.
IMAGE_STACK = 2
# Absolute error below which huber_loss is quadratic and at or above which it is linear.
HUBER_LOSS_DELTA = 2.0
# Learning rate passed to RMSprop in Brain._createModel.
LEARNING_RATE = 0.00025
#-------------------- UTILITIES -----------------------
def huber_loss(y_true, y_pred):
    """Return the mean Huber loss between ``y_true`` and ``y_pred``.

    Errors with magnitude below HUBER_LOSS_DELTA are penalized quadratically,
    all others linearly.
    """
    # This script never imports ``K`` or ``tf`` at module level; they were only
    # available if a star import happened to leak them. Import them explicitly.
    from keras import backend as K
    import tensorflow as tf

    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)  # Keras does not cover where function in tensorflow :-(

    return K.mean(loss)
def processImage(img, out_shape=None):
    """Convert an RGB frame to a small, normalized grayscale image.

    img: array indexed as ``[row, column, channel]`` with at least three
        channels (red, green, blue).
    out_shape: optional ``(rows, columns)`` of the result. Defaults to
        ``(IMAGE_WIDTH, IMAGE_HEIGHT)``, the order the rest of this script
        uses for its state arrays.

    Returns a float32 array of shape ``out_shape``; an input range of 0..255
    maps to roughly -1..1.
    """
    # ``scipy.misc.imresize`` no longer exists in current SciPy, and a bare
    # ``import scipy`` does not load submodules, so import ndimage explicitly.
    from scipy import ndimage

    if out_shape is None:
        out_shape = (IMAGE_WIDTH, IMAGE_HEIGHT)
    rows, cols = out_shape

    img = numpy.asarray(img)
    r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b  # extract luminance

    # bilinear resize (order=1); 'nearest' keeps edge pixels free of padding values
    factors = (float(rows) / gray.shape[0], float(cols) / gray.shape[1])
    resized = ndimage.zoom(gray, factors, order=1, mode='nearest')

    o = resized.astype('float32') / 128 - 1  # normalize
    return o
#-------------------- BRAIN ---------------------------
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
class Brain:
    """Convolutional online Q-network plus a target network for stacked frames."""

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        self.model_ = self._createModel()  # target network

    def _createModel(self):
        """Build and compile one network: three conv layers, a 512-unit layer and one output per action."""
        model = Sequential()

        # NOTE(review): only this first layer sets data_format='channels_first';
        # the later Conv2D layers use the Keras default. Confirm that is intended.
        model.add(Conv2D(32, (8, 8), strides=(4,4), activation='relu', input_shape=(self.stateCnt), data_format='channels_first'))
        model.add(Conv2D(64, (4, 4), strides=(2,2), activation='relu'))
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(Flatten())
        model.add(Dense(units=512, activation='relu'))

        # the output size comes from this instance, not from the module-level
        # global that happens to be called actionCnt
        model.add(Dense(units=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        """Fit the online network on inputs ``x`` and targets ``y``."""
        self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)

    def predict(self, s, target=False):
        """Predict Q-values for the batch ``s`` with the target (``target=True``) or online network."""
        if target:
            return self.model_.predict(s)
        return self.model.predict(s)

    def predictOne(self, s, target=False):
        """Predict Q-values for a single stacked-frame state and return them as a flat array."""
        return self.predict(s.reshape(1, IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT), target).flatten()

    def updateTargetModel(self):
        """Copy the online network's weights into the target network."""
        self.model_.set_weights(self.model.get_weights())
#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay memory backed by a SumTree."""

    e = 0.01  # added to every error before exponentiation
    a = 0.6   # exponent applied to (error + e)

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Turn an error into a priority: ``(error + e) ** a``."""
        shifted = error + self.e
        return shifted ** self.a

    def add(self, error, sample):
        """Store ``sample`` with the priority derived from ``error``."""
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """Draw ``n`` ``(tree index, sample)`` pairs, one from each equal slice of the total priority."""
        width = self.tree.total() / n
        picked = []

        for k in range(n):
            # one uniform draw inside the k-th slice
            point = random.uniform(width * k, width * (k + 1))
            idx, _priority, data = self.tree.get(point)
            picked.append((idx, data))

        return picked

    def update(self, idx, error):
        """Re-prioritize the entry at tree index ``idx`` from a new ``error``."""
        self.tree.update(idx, self._getPriority(error))
#-------------------- AGENT ---------------------------
# Capacity of the prioritized replay memory; also the number of random steps
# collected before learning starts.
MEMORY_CAPACITY = 200000
# Number of samples Agent.replay draws from memory per training step.
BATCH_SIZE = 32
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Epsilon decays exponentially from MAX_EPSILON towards MIN_EPSILON.
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EXPLORATION_STOP = 500000 # at this step the decaying part of epsilon is 1% of (MAX_EPSILON - MIN_EPSILON)
LAMBDA = - math.log(0.01) / EXPLORATION_STOP # speed of decay
# The target network is refreshed every this many observed steps.
UPDATE_TARGET_FREQUENCY = 10000
class Agent:
    """Double-DQN agent that learns from a prioritized replay memory.

    ``memory`` is not created here: the main script assigns the memory filled
    by the random agent to ``agent.memory`` before this agent is first run.
    """

    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        # self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample`` prioritized by its current error, refresh the target network when due, decay epsilon."""
        # _getTargets expects (tree index, sample) pairs; the index is unused here
        x, y, errors = self._getTargets([(0, sample)])
        self.memory.add(errors[0], sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def _getTargets(self, batch):
        """Compute training inputs, Q-targets and absolute errors for ``batch``.

        ``batch`` is a list of ``(tree index, (s, a, r, s_))`` pairs.
        Returns ``(x, y, errors)``.
        """
        # stand-in for the missing next state of terminal transitions
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([o[1][0] for o in batch])
        states_ = numpy.array([(no_state if o[1][3] is None else o[1][3]) for o in batch])

        # predict with this agent's own brain; the global name ``agent`` need
        # not exist or refer to this instance
        p = self.brain.predict(states)

        p_ = self.brain.predict(states_, target=False)
        pTarget_ = self.brain.predict(states_, target=True)

        x = numpy.zeros((len(batch), IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT))
        y = numpy.zeros((len(batch), self.actionCnt))
        errors = numpy.zeros(len(batch))

        for i in range(len(batch)):
            o = batch[i][1]
            s = o[0]; a = o[1]; r = o[2]; s_ = o[3]

            t = p[i]
            oldVal = t[a]
            if s_ is None:
                t[a] = r
            else:
                # the online network picks the action, the target network rates it
                t[a] = r + GAMMA * pTarget_[i][numpy.argmax(p_[i])]  # double DQN

            x[i] = s
            y[i] = t
            errors[i] = abs(oldVal - t[a])

        return (x, y, errors)

    def replay(self):
        """Sample a prioritized batch, refresh its priorities and train on it."""
        batch = self.memory.sample(BATCH_SIZE)
        x, y, errors = self._getTargets(batch)

        # update errors
        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        self.brain.train(x, y)
class RandomAgent:
    """Agent that acts uniformly at random and fills a prioritized memory."""

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt
        # Created per instance. As class attributes the memory was allocated at
        # class-definition time and shared by every RandomAgent.
        self.memory = Memory(MEMORY_CAPACITY)
        self.exp = 0  # number of samples observed so far

    def act(self, s):
        """Return a uniformly random action index; the state is ignored."""
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store ``sample``, using the absolute reward as its error."""
        error = abs(sample[2])  # reward
        self.memory.add(error, sample)
        self.exp += 1

    def replay(self):
        """Do nothing; present so Environment.run can treat both agent types alike."""
        pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
    """Gym wrapper that turns raw frames into stacked, preprocessed states."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode with ``agent`` and print the total clipped reward."""
        img = self.env.reset()
        w = processImage(img)
        # the first state repeats the first frame
        s = numpy.array([w, w])

        R = 0
        while True:
            # self.env.render()
            a = agent.act(s)

            img, r, done, info = self.env.step(a)
            s_ = numpy.array([s[1], processImage(img)])  # last two screens

            # use the explicitly imported ``numpy``; ``np`` is never imported
            # in this script
            r = numpy.clip(r, -1, 1)  # clip reward to [-1, 1]

            if done:  # terminal state
                s_ = None

            agent.observe((s, a, r, s_))
            agent.replay()

            s = s_
            R += r

            if done:
                break

        print("Total reward:", R)
#-------------------- MAIN ----------------------------
PROBLEM = 'Seaquest-v0'
env = Environment(PROBLEM)

# a state is a stack of processed frames
stateCnt = (IMAGE_STACK, IMAGE_WIDTH, IMAGE_HEIGHT)
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    print("Initialization with random agent...")
    # collect MEMORY_CAPACITY random samples, reporting progress after every episode
    while not (randomAgent.exp >= MEMORY_CAPACITY):
        env.run(randomAgent)
        print(randomAgent.exp, "/", MEMORY_CAPACITY)

    # the learning agent takes over the filled memory object
    agent.memory = randomAgent.memory
    randomAgent = None

    print("Starting learning")
    # train until interrupted
    while True:
        env.run(agent)
finally:
    # the model is saved however the loop ends
    agent.brain.model.save("Seaquest-DQN-PER.h5")
================================================
FILE: SumTree.py
================================================
import numpy
class SumTree:
    """Binary sum tree over ``capacity`` priorities, stored in a flat array.

    ``tree`` has ``2 * capacity - 1`` nodes. The last ``capacity`` entries are
    leaves holding priorities and every other node holds the sum of its two
    children, so ``tree[0]`` is the total. ``data[i]`` is the payload of leaf
    ``capacity - 1 + i``. Entries are overwritten in round-robin order.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = numpy.zeros(2*capacity - 1)
        self.data = numpy.zeros(capacity, dtype=object)
        self.write = 0  # data slot the next add() will overwrite

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx``."""
        # Iterative, and a no-op for the root. The recursive version computed
        # parent -1 for idx 0, which with capacity 1 double-counted the
        # priority and never terminated.
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf whose cumulative priority range contains ``s``."""
        tree = self.tree
        size = len(tree)

        while True:
            left = 2 * idx + 1
            if left >= size:
                return idx  # idx is a leaf
            right = left + 1

            # Never descend into a subtree with zero total priority: when ``s``
            # overshoots (rounding, or s above the total) that would return a
            # leaf that may never have been written.
            if s <= tree[left] or tree[right] == 0:
                idx = left
            else:
                s = s - tree[left]
                idx = right

    def total(self):
        """Return the sum of all stored priorities."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p`` in the next slot, wrapping around when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """Set the priority of tree node ``idx`` (a leaf index) to ``p`` and fix the sums above it."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return ``(tree index, priority, data)`` of the leaf selected by ``s``."""
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])
gitextract_jgc2t_gv/ ├── CartPole-A3C.py ├── CartPole-DQN.py ├── CartPole-basic.py ├── LICENSE ├── MountainCar-basic.py ├── Seaquest-DDQN-PER.py └── SumTree.py
SYMBOL INDEX (132 symbols across 6 files)
FILE: CartPole-A3C.py
class Brain (line 44) | class Brain:
method __init__ (line 48) | def __init__(self):
method _build_model (line 61) | def _build_model(self):
method _build_graph (line 74) | def _build_graph(self, model):
method optimize (line 95) | def optimize(self):
method train_push (line 121) | def train_push(self, s, a, r, s_):
method predict (line 134) | def predict(self, s):
method predict_p (line 139) | def predict_p(self, s):
method predict_v (line 144) | def predict_v(self, s):
class Agent (line 151) | class Agent:
method __init__ (line 152) | def __init__(self, eps_start, eps_end, eps_steps):
method getEpsilon (line 160) | def getEpsilon(self):
method act (line 166) | def act(self, s):
method train (line 182) | def train(self, s, a, r, s_):
class Environment (line 217) | class Environment(threading.Thread):
method __init__ (line 220) | def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP...
method runEpisode (line 227) | def runEpisode(self):
method run (line 252) | def run(self):
method stop (line 256) | def stop(self):
class Optimizer (line 260) | class Optimizer(threading.Thread):
method __init__ (line 263) | def __init__(self):
method run (line 266) | def run(self):
method stop (line 270) | def stop(self):
FILE: CartPole-DQN.py
function huber_loss (line 22) | def huber_loss(y_true, y_pred):
class Brain (line 38) | class Brain:
method __init__ (line 39) | def __init__(self, stateCnt, actionCnt):
method _createModel (line 46) | def _createModel(self):
method train (line 57) | def train(self, x, y, epochs=1, verbose=0):
method predict (line 60) | def predict(self, s, target=False):
method predictOne (line 66) | def predictOne(self, s, target=False):
method updateTargetModel (line 69) | def updateTargetModel(self):
class Memory (line 73) | class Memory: # stored as ( s, a, r, s_ )
method __init__ (line 76) | def __init__(self, capacity):
method add (line 79) | def add(self, sample):
method sample (line 85) | def sample(self, n):
method isFull (line 89) | def isFull(self):
class Agent (line 104) | class Agent:
method __init__ (line 108) | def __init__(self, stateCnt, actionCnt):
method act (line 115) | def act(self, s):
method observe (line 121) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 138) | def replay(self):
class RandomAgent (line 169) | class RandomAgent:
method __init__ (line 172) | def __init__(self, actionCnt):
method act (line 175) | def act(self, s):
method observe (line 178) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 181) | def replay(self):
class Environment (line 185) | class Environment:
method __init__ (line 186) | def __init__(self, problem):
method run (line 190) | def run(self, agent):
FILE: CartPole-basic.py
class Brain (line 24) | class Brain:
method __init__ (line 25) | def __init__(self, stateCnt, actionCnt):
method _createModel (line 32) | def _createModel(self):
method train (line 43) | def train(self, x, y, epoch=1, verbose=0):
method predict (line 46) | def predict(self, s):
method predictOne (line 49) | def predictOne(self, s):
class Memory (line 53) | class Memory: # stored as ( s, a, r, s_ )
method __init__ (line 56) | def __init__(self, capacity):
method add (line 59) | def add(self, sample):
method sample (line 65) | def sample(self, n):
class Agent (line 79) | class Agent:
method __init__ (line 83) | def __init__(self, stateCnt, actionCnt):
method act (line 90) | def act(self, s):
method observe (line 96) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 103) | def replay(self):
class Environment (line 134) | class Environment:
method __init__ (line 135) | def __init__(self, problem):
method run (line 139) | def run(self, agent):
FILE: MountainCar-basic.py
function printQ (line 24) | def printQ(agent):
function mapBrain (line 41) | def mapBrain(brain, res):
function displayBrain (line 55) | def displayBrain(brain, res=50):
class Brain (line 83) | class Brain:
method __init__ (line 84) | def __init__(self, stateCnt, actionCnt):
method _createModel (line 91) | def _createModel(self):
method train (line 102) | def train(self, x, y, epoch=1, verbose=0):
method predict (line 105) | def predict(self, s):
method predictOne (line 108) | def predictOne(self, s):
class Memory (line 112) | class Memory: # stored as ( s, a, r, s_ )
method __init__ (line 115) | def __init__(self, capacity):
method add (line 118) | def add(self, sample):
method sample (line 124) | def sample(self, n):
method isFull (line 128) | def isFull(self):
class Agent (line 141) | class Agent:
method __init__ (line 145) | def __init__(self, stateCnt, actionCnt):
method act (line 152) | def act(self, s):
method observe (line 158) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 172) | def replay(self):
class RandomAgent (line 202) | class RandomAgent:
method __init__ (line 205) | def __init__(self, actionCnt):
method act (line 208) | def act(self, s):
method observe (line 211) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 214) | def replay(self):
class Environment (line 218) | class Environment:
method __init__ (line 219) | def __init__(self, problem):
method normalize (line 229) | def normalize(self, s):
method run (line 232) | def run(self, agent):
FILE: Seaquest-DDQN-PER.py
function huber_loss (line 23) | def huber_loss(y_true, y_pred):
function processImage (line 34) | def processImage( img ):
class Brain (line 48) | class Brain:
method __init__ (line 49) | def __init__(self, stateCnt, actionCnt):
method _createModel (line 56) | def _createModel(self):
method train (line 72) | def train(self, x, y, epochs=1, verbose=0):
method predict (line 75) | def predict(self, s, target=False):
method predictOne (line 81) | def predictOne(self, s, target=False):
method updateTargetModel (line 84) | def updateTargetModel(self):
class Memory (line 88) | class Memory: # stored as ( s, a, r, s_ ) in SumTree
method __init__ (line 92) | def __init__(self, capacity):
method _getPriority (line 95) | def _getPriority(self, error):
method add (line 98) | def add(self, error, sample):
method sample (line 102) | def sample(self, n):
method update (line 116) | def update(self, idx, error):
class Agent (line 135) | class Agent:
method __init__ (line 139) | def __init__(self, stateCnt, actionCnt):
method act (line 146) | def act(self, s):
method observe (line 152) | def observe(self, sample): # in (s, a, r, s_) format
method _getTargets (line 163) | def _getTargets(self, batch):
method replay (line 195) | def replay(self):
class RandomAgent (line 206) | class RandomAgent:
method __init__ (line 210) | def __init__(self, actionCnt):
method act (line 213) | def act(self, s):
method observe (line 216) | def observe(self, sample): # in (s, a, r, s_) format
method replay (line 221) | def replay(self):
class Environment (line 225) | class Environment:
method __init__ (line 226) | def __init__(self, problem):
method run (line 230) | def run(self, agent):
FILE: SumTree.py
class SumTree (line 3) | class SumTree:
method __init__ (line 6) | def __init__(self, capacity):
method _propagate (line 11) | def _propagate(self, idx, change):
method _retrieve (line 19) | def _retrieve(self, idx, s):
method total (line 31) | def total(self):
method add (line 34) | def add(self, p, data):
method update (line 44) | def update(self, idx, p):
method get (line 50) | def get(self, s):
Condensed preview — 7 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (38K chars).
[
{
"path": "CartPole-A3C.py",
"chars": 7142,
"preview": "# OpenGym CartPole-v0 with A3C on GPU\n# -----------------------------------\n#\n# A3C implementation with GPU optimizer th"
},
{
"path": "CartPole-DQN.py",
"chars": 6147,
"preview": "# OpenGym CartPole-v0\n# -------------------\n#\n# This code demonstrates use a full DQN implementation\n# to solve OpenGym "
},
{
"path": "CartPole-basic.py",
"chars": 4634,
"preview": "# OpenGym CartPole-v0\n# -------------------\n#\n# This code demonstrates use of a basic Q-network (without target network)"
},
{
"path": "LICENSE",
"chars": 1072,
"preview": "MIT License\n\nCopyright (c) 2018 Jaromír Janisch\n\nPermission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "MountainCar-basic.py",
"chars": 7311,
"preview": "# OpenGym MountainCar-v0\n# -------------------\n#\n# This code demonstrates debugging of a basic Q-network (without target"
},
{
"path": "Seaquest-DDQN-PER.py",
"chars": 7973,
"preview": "# OpenGym Seaquest-v0\n# -------------------\n#\n# This code demonstrates a Double DQN network with Priority Experience Rep"
},
{
"path": "SumTree.py",
"chars": 1264,
"preview": "import numpy\n\nclass SumTree:\n write = 0\n\n def __init__(self, capacity):\n self.capacity = capacity\n s"
}
]
About this extraction
This page contains the full source code of the jaromiru/AI-blog GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 7 files (34.7 KB), approximately 9.7k tokens, and a symbol index with 132 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.