Repository: jaromiru/AI-blog Branch: master Commit: 348628b10505 Files: 7 Total size: 34.7 KB Directory structure: gitextract_jgc2t_gv/ ├── CartPole-A3C.py ├── CartPole-DQN.py ├── CartPole-basic.py ├── LICENSE ├── MountainCar-basic.py ├── Seaquest-DDQN-PER.py └── SumTree.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: CartPole-A3C.py ================================================ # OpenGym CartPole-v0 with A3C on GPU # ----------------------------------- # # A3C implementation with GPU optimizer threads. # # Made as part of blog series Let's make an A3C, available at # https://jaromiru.com/2017/02/16/lets-make-an-a3c-theory/ # # author: Jaromir Janisch, 2017 import numpy as np import tensorflow as tf import gym, time, random, threading from keras.models import * from keras.layers import * from keras import backend as K #-- constants ENV = 'CartPole-v0' RUN_TIME = 30 THREADS = 8 OPTIMIZERS = 2 THREAD_DELAY = 0.001 GAMMA = 0.99 N_STEP_RETURN = 8 GAMMA_N = GAMMA ** N_STEP_RETURN EPS_START = 0.4 EPS_STOP = .15 EPS_STEPS = 75000 MIN_BATCH = 32 LEARNING_RATE = 5e-3 LOSS_V = .5 # v loss coefficient LOSS_ENTROPY = .01 # entropy coefficient #--------- class Brain: train_queue = [ [], [], [], [], [] ] # s, a, r, s', s' terminal mask lock_queue = threading.Lock() def __init__(self): self.session = tf.Session() K.set_session(self.session) K.manual_variable_initialization(True) self.model = self._build_model() self.graph = self._build_graph(self.model) self.session.run(tf.global_variables_initializer()) self.default_graph = tf.get_default_graph() self.default_graph.finalize() # avoid modifications def _build_model(self): l_input = Input( batch_shape=(None, NUM_STATE) ) l_dense = Dense(16, activation='relu')(l_input) out_actions = Dense(NUM_ACTIONS, activation='softmax')(l_dense) out_value = Dense(1, activation='linear')(l_dense) model = 
Model(inputs=[l_input], outputs=[out_actions, out_value]) model._make_predict_function() # have to initialize before threading return model def _build_graph(self, model): s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE)) a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS)) r_t = tf.placeholder(tf.float32, shape=(None, 1)) # not immediate, but discounted n step reward p, v = model(s_t) log_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keep_dims=True) + 1e-10) advantage = r_t - v loss_policy = - log_prob * tf.stop_gradient(advantage) # maximize policy loss_value = LOSS_V * tf.square(advantage) # minimize value error entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keep_dims=True) # maximize entropy (regularization) loss_total = tf.reduce_mean(loss_policy + loss_value + entropy) optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99) minimize = optimizer.minimize(loss_total) return s_t, a_t, r_t, minimize def optimize(self): if len(self.train_queue[0]) < MIN_BATCH: time.sleep(0) # yield return with self.lock_queue: if len(self.train_queue[0]) < MIN_BATCH: # more thread could have passed without lock return # we can't yield inside lock s, a, r, s_, s_mask = self.train_queue self.train_queue = [ [], [], [], [], [] ] s = np.vstack(s) a = np.vstack(a) r = np.vstack(r) s_ = np.vstack(s_) s_mask = np.vstack(s_mask) if len(s) > 5*MIN_BATCH: print("Optimizer alert! Minimizing batch of %d" % len(s)) v = self.predict_v(s_) r = r + GAMMA_N * v * s_mask # set v to 0 where s_ is terminal state s_t, a_t, r_t, minimize = self.graph self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r}) def train_push(self, s, a, r, s_): with self.lock_queue: self.train_queue[0].append(s) self.train_queue[1].append(a) self.train_queue[2].append(r) if s_ is None: self.train_queue[3].append(NONE_STATE) self.train_queue[4].append(0.) else: self.train_queue[3].append(s_) self.train_queue[4].append(1.) 
def getEpsilon(self):
    """Return the exploration rate for the current global frame count.

    Interpolates linearly from ``eps_start`` to ``eps_end`` while the
    module-level ``frames`` counter is below ``eps_steps``; from then on the
    rate stays at ``eps_end``.
    """
    if frames < self.eps_steps:
        delta = self.eps_end - self.eps_start
        return self.eps_start + frames * delta / self.eps_steps
    return self.eps_end
class Agent:
    """Epsilon-greedy DQN agent with replay memory and a target network.

    ``steps`` counts observed transitions; ``epsilon`` decays exponentially
    from MAX_EPSILON towards MIN_EPSILON as ``steps`` grows.
    """
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Pick a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store a transition, refresh the target network and decay epsilon."""
        self.memory.add(sample)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # debug the Q function in point S
        if self.steps % 100 == 0:
            S = numpy.array([-0.01335408, -0.04600273, -0.00677248, 0.01517507])
            # use this agent's own brain rather than the module-level `agent`
            pred = self.brain.predictOne(S)
            print(pred[0])
            sys.stdout.flush()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and fit the network to its Q-targets."""
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        # placeholder next state for terminal transitions (prediction unused)
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([o[0] for o in batch])
        states_ = numpy.array([(no_state if o[3] is None else o[3]) for o in batch])

        p = self.brain.predict(states)
        p_ = self.brain.predict(states_, target=True)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))

        for i, (s, a, r, s_) in enumerate(batch):
            # start from the current prediction and overwrite only the taken action
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
run(self, agent): s = self.env.reset() R = 0 while True: # self.env.render() a = agent.act(s) s_, r, done, info = self.env.step(a) if done: # terminal state s_ = None agent.observe( (s, a, r, s_) ) agent.replay() s = s_ R += r if done: break # print("Total reward:", R) #-------------------- MAIN ---------------------------- PROBLEM = 'CartPole-v0' env = Environment(PROBLEM) stateCnt = env.env.observation_space.shape[0] actionCnt = env.env.action_space.n agent = Agent(stateCnt, actionCnt) randomAgent = RandomAgent(actionCnt) try: while randomAgent.memory.isFull() == False: env.run(randomAgent) agent.memory.samples = randomAgent.memory.samples randomAgent = None while True: env.run(agent) finally: agent.brain.model.save("cartpole-dqn.h5") ================================================ FILE: CartPole-basic.py ================================================ # OpenGym CartPole-v0 # ------------------- # # This code demonstrates use of a basic Q-network (without target network) # to solve OpenGym CartPole-v0 problem. 
class Memory:   # stored as ( s, a, r, s_ )
    """Bounded FIFO replay buffer of transitions.

    Once more than ``capacity`` samples are stored, the oldest one is dropped.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        # Per-instance list: a class-level list would be shared by every
        # Memory object, so separate buffers would silently alias each other.
        self.samples = []

    def add(self, sample):
        """Append ``sample`` and evict the oldest entry when over capacity."""
        self.samples.append(sample)

        if len(self.samples) > self.capacity:
            self.samples.pop(0)

    def sample(self, n):
        """Return up to ``n`` distinct stored samples chosen at random."""
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)
class Environment:
    """Runs episodes of a gym problem, feeding every transition to an agent."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one rendered episode and print the reward it collected.

        Each step asks the agent for an action, hands it the transition
        ``(s, a, r, s_)`` with ``s_`` set to None on the terminal step, and
        then lets it replay.
        """
        state = self.env.reset()
        total = 0
        finished = False

        while not finished:
            self.env.render()

            action = agent.act(state)
            nxt, reward, finished, _info = self.env.step(action)

            if finished:  # terminal state
                nxt = None

            agent.observe((state, action, reward, nxt))
            agent.replay()

            state = nxt
            total += reward

        print("Total reward:", total)
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MountainCar-basic.py ================================================ # OpenGym MountainCar-v0 # ------------------- # # This code demonstrates debugging of a basic Q-network (without target network) # in an OpenGym MountainCar-v0 environment. # # Made as part of blog series Let's make a DQN, available at: # https://jaromiru.com/2016/10/12/lets-make-a-dqn-debugging/ # # author: Jaromir Janisch, 2016 #--- enable this to run on GPU # import os # os.environ['THEANO_FLAGS'] = "device=gpu,floatX=float32" import random, numpy, math, gym #-------------------- UTILITIES ----------------------- import matplotlib.pyplot as plt from matplotlib import colors import sys def printQ(agent): P = [ [-0.15955113, 0. 
class Brain:
    """Single-hidden-layer Q-network mapping a state to one value per action."""

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.model = self._createModel()
        # self.model.load_weights("MountainCar-basic.h5")

    def _createModel(self):
        """Build and compile the network from this instance's dimensions."""
        model = Sequential()

        # Use the instance attributes; reading module-level stateCnt/actionCnt
        # only works when globals with exactly those names happen to exist.
        model.add(Dense(output_dim=64, activation='relu', input_dim=self.stateCnt))
        model.add(Dense(output_dim=self.actionCnt, activation='linear'))

        opt = RMSprop(lr=0.00025)
        model.compile(loss='mse', optimizer=opt)

        return model

    def train(self, x, y, epoch=1, verbose=0):
        """Fit the model on inputs ``x`` and targets ``y``."""
        self.model.fit(x, y, batch_size=64, nb_epoch=epoch, verbose=verbose)

    def predict(self, s):
        """Return the model output for a batch of states."""
        return self.model.predict(s)

    def predictOne(self, s):
        """Return the flattened model output for a single state."""
        return self.predict(s.reshape(1, self.stateCnt)).flatten()
class Agent:
    """Epsilon-greedy Q-learning agent with replay memory and debug output.

    ``steps`` counts observed transitions; ``epsilon`` decays exponentially
    from MAX_EPSILON towards MIN_EPSILON as ``steps`` grows.
    """
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """Pick a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """Store a transition, emit periodic debug output and decay epsilon."""
        self.memory.add(sample)

        # ----- debug
        if self.steps % 1000 == 0:
            printQ(self)

        if self.steps % 10000 == 0:
            displayBrain(self.brain)

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def replay(self):
        """Sample a batch from memory and fit the network to its Q-targets."""
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        # placeholder next state for terminal transitions (prediction unused)
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([o[0] for o in batch])
        states_ = numpy.array([(no_state if o[3] is None else o[3]) for o in batch])

        # predict with this agent's own brain, not the module-level `agent`
        p = self.brain.predict(states)
        p_ = self.brain.predict(states_)

        x = numpy.zeros((batchLen, self.stateCnt))
        y = numpy.zeros((batchLen, self.actionCnt))

        for i, (s, a, r, s_) in enumerate(batch):
            # start from the current prediction and overwrite only the taken action
            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[i])

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
class Environment:
    """Gym problem wrapper that normalizes observations and remaps actions."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

        high = self.env.observation_space.high
        low = self.env.observation_space.low

        # centre and half-range of the observation box, used by normalize()
        self.mean = (high + low) / 2
        self.spread = abs(high - low) / 2

    def normalize(self, s):
        """Shift and scale ``s`` so the observation bounds map to [-1, 1]."""
        return (s - self.mean) / self.spread

    def run(self, agent):
        """Play one episode, feeding normalized transitions to ``agent``.

        The agent sees two actions; they are mapped onto the environment's
        actions 0 (left) and 2 (right).

        Raises:
            ValueError: if the agent returns an action other than 0 or 1.
        """
        s = self.env.reset()
        s = self.normalize(s)
        R = 0

        while True:
            # self.env.render()

            a = agent.act(s)

            # map actions; 0 = left, 2 = right
            if a == 0:
                a_ = 0
            elif a == 1:
                a_ = 2
            else:
                # previously a_ stayed unbound (or kept the last step's value)
                raise ValueError("unsupported agent action: %r" % (a,))

            s_, r, done, info = self.env.step(a_)
            s_ = self.normalize(s_)

            if done:  # terminal state
                s_ = None

            agent.observe((s, a, r, s_))
            agent.replay()

            s = s_
            R += r

            if done:
                break

        # print("Total reward:", R)
def huber_loss(y_true, y_pred):
    """Return the mean Huber loss between ``y_true`` and ``y_pred``.

    Errors smaller in magnitude than HUBER_LOSS_DELTA are penalized
    quadratically, larger ones linearly.
    """
    # Import explicitly: this file has no direct import of the Keras backend
    # or TensorFlow, so `K` and `tf` were only available if one of the
    # `from keras... import *` lines happened to re-export them
    # (NOTE(review): that depends on the installed Keras version).
    import tensorflow as tf
    from keras import backend as K

    err = y_true - y_pred
    abs_err = K.abs(err)

    quadratic = 0.5 * K.square(err)
    linear = HUBER_LOSS_DELTA * (abs_err - 0.5 * HUBER_LOSS_DELTA)

    # element-wise choice between the two branches
    loss = tf.where(abs_err < HUBER_LOSS_DELTA, quadratic, linear)

    return K.mean(loss)
class Memory:   # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay memory that keeps its samples in a SumTree."""
    e = 0.01  # added to every error, so a zero error still gets a positive priority
    a = 0.6   # exponent applied to the offset error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Convert an error into a tree priority: ``(error + e) ** a``."""
        return (error + self.e) ** self.a

    def add(self, error, sample):
        """Insert ``sample`` with the priority derived from ``error``."""
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """Draw ``n`` ``(tree index, sample)`` pairs.

        The total priority is cut into ``n`` equal segments and one point is
        drawn uniformly from each segment.
        """
        width = self.tree.total() / n
        picked = []

        for k in range(n):
            point = random.uniform(width * k, width * (k + 1))
            idx, _priority, data = self.tree.get(point)
            picked.append((idx, data))

        return picked

    def update(self, idx, error):
        """Re-prioritize the entry at tree index ``idx`` from a new ``error``."""
        self.tree.update(idx, self._getPriority(error))
class Environment:
    """Runs episodes of a gym problem on stacks of two processed frames."""

    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)

    def run(self, agent):
        """Play one episode and print the sum of its clipped rewards.

        The state handed to the agent is the last two processed screens; the
        first state repeats the initial screen twice. The next state is None
        on the terminal step.
        """
        img = self.env.reset()
        w = processImage(img)
        s = numpy.array([w, w])

        R = 0
        while True:
            # self.env.render()
            a = agent.act(s)

            img, r, done, info = self.env.step(a)
            s_ = numpy.array([s[1], processImage(img)])  # last two screens

            # This file imports numpy only under its full name; `np` was never
            # bound explicitly here.
            r = numpy.clip(r, -1, 1)  # clip reward to [-1, 1]

            if done:  # terminal state
                s_ = None

            agent.observe((s, a, r, s_))
            agent.replay()

            s = s_
            R += r

            if done:
                break

        print("Total reward:", R)
class SumTree:
    """Binary sum tree over ``capacity`` prioritized data slots.

    ``tree`` is a flat array of ``2 * capacity - 1`` nodes: the last
    ``capacity`` entries are leaf priorities and every internal node holds the
    sum of its two children, so ``tree[0]`` is the total priority. ``data``
    holds the payload for each leaf and is overwritten in ring order.
    """

    def __init__(self, capacity):
        """Create an empty tree.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        if capacity < 1:
            raise ValueError("capacity must be at least 1, got %r" % (capacity,))

        self.capacity = capacity
        self.write = 0  # next data slot to overwrite
        self.tree = numpy.zeros(2 * capacity - 1)
        self.data = numpy.zeros(capacity, dtype=object)

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx`` up to the root."""
        # Looping on idx != 0 also covers the root itself (capacity == 1),
        # where computing a parent index would give -1.
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf selected by cumulative sum ``s``."""
        while True:
            left = 2 * idx + 1

            if left >= len(self.tree):  # no children: idx is a leaf
                return idx

            if s <= self.tree[left]:
                idx = left
            else:
                # skip the left subtree's mass and continue on the right
                s -= self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all leaf priorities."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p``, overwriting the oldest slot when full."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """Set the priority of tree node ``idx`` to ``p`` and fix ancestor sums."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return ``(tree index, priority, data)`` of the leaf selected by ``s``."""
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])