Repository: MorvanZhou/Reinforcement-learning-with-tensorflow Branch: master Commit: 93e333484be0 Files: 60 Total size: 325.0 KB Directory structure: gitextract_8twtflmr/ ├── .gitignore ├── LICENCE ├── README.md ├── contents/ │ ├── 10_A3C/ │ │ ├── A3C_RNN.py │ │ ├── A3C_continuous_action.py │ │ ├── A3C_discrete_action.py │ │ └── A3C_distributed_tf.py │ ├── 11_Dyna_Q/ │ │ ├── RL_brain.py │ │ ├── maze_env.py │ │ └── run_this.py │ ├── 12_Proximal_Policy_Optimization/ │ │ ├── DPPO.py │ │ ├── discrete_DPPO.py │ │ └── simply_PPO.py │ ├── 1_command_line_reinforcement_learning/ │ │ └── treasure_on_right.py │ ├── 2_Q_Learning_maze/ │ │ ├── RL_brain.py │ │ ├── maze_env.py │ │ └── run_this.py │ ├── 3_Sarsa_maze/ │ │ ├── RL_brain.py │ │ ├── maze_env.py │ │ └── run_this.py │ ├── 4_Sarsa_lambda_maze/ │ │ ├── RL_brain.py │ │ ├── maze_env.py │ │ └── run_this.py │ ├── 5.1_Double_DQN/ │ │ ├── RL_brain.py │ │ └── run_Pendulum.py │ ├── 5.2_Prioritized_Replay_DQN/ │ │ ├── RL_brain.py │ │ └── run_MountainCar.py │ ├── 5.3_Dueling_DQN/ │ │ ├── RL_brain.py │ │ └── run_Pendulum.py │ ├── 5_Deep_Q_Network/ │ │ ├── DQN_modified.py │ │ ├── RL_brain.py │ │ ├── maze_env.py │ │ └── run_this.py │ ├── 6_OpenAI_gym/ │ │ ├── RL_brain.py │ │ ├── run_CartPole.py │ │ └── run_MountainCar.py │ ├── 7_Policy_gradient_softmax/ │ │ ├── RL_brain.py │ │ ├── run_CartPole.py │ │ └── run_MountainCar.py │ ├── 8_Actor_Critic_Advantage/ │ │ ├── AC_CartPole.py │ │ └── AC_continue_Pendulum.py │ ├── 9_Deep_Deterministic_Policy_Gradient_DDPG/ │ │ ├── DDPG.py │ │ ├── DDPG_update.py │ │ └── DDPG_update2.py │ └── Curiosity_Model/ │ ├── Curiosity.py │ └── Random_Network_Distillation.py └── experiments/ ├── 2D_car/ │ ├── DDPG.py │ ├── car_env.py │ └── collision.py ├── Robot_arm/ │ ├── A3C.py │ ├── DDPG.py │ ├── DPPO.py │ └── arm_env.py ├── Solve_BipedalWalker/ │ ├── A3C.py │ ├── A3C_rnn.py │ ├── DDPG.py │ └── log/ │ └── events.out.tfevents.1490801027.Morvan └── Solve_LunarLander/ ├── A3C.py ├── DuelingDQNPrioritizedReplay.py └── run_LunarLander.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea __pycache__ ================================================ FILE: LICENCE ================================================ MIT License Copyright (c) 2017 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================
### [Double DQN](contents/5.1_Double_DQN)
### [Dueling DQN](contents/5.3_Dueling_DQN)
### [Actor Critic](contents/8_Actor_Critic_Advantage)
### [Deep Deterministic Policy Gradient](contents/9_Deep_Deterministic_Policy_Gradient_DDPG)
### [A3C](contents/10_A3C)
### [Proximal Policy Optimization (PPO)](contents/12_Proximal_Policy_Optimization)
### [Curiosity Model](/contents/Curiosity_Model)
# Donation
*If this does help you, please consider donating to support me for better tutorials. Any contribution is greatly appreciated!*
================================================
FILE: contents/10_A3C/A3C_RNN.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C) + RNN with continuous action space, Reinforcement Learning.
The Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.8.0
gym 0.10.5
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt
GAME = 'Pendulum-v0'
OUTPUT_GRAPH = True
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_EP_STEP = 200
MAX_GLOBAL_EP = 1500
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 5
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_params, self.c_params = self._build_net(scope)[-2:]
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
mu, sigma, self.v, self.a_params, self.c_params = self._build_net(scope)
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('wrap_a_out'):
mu, sigma = mu * A_BOUND[1], sigma + 1e-4
normal_dist = tf.distributions.Normal(mu, sigma)
with tf.name_scope('a_loss'):
log_prob = normal_dist.log_prob(self.a_his)
exp_v = log_prob * tf.stop_gradient(td)
entropy = normal_dist.entropy() # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('choose_a'): # use local params to choose action
self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=[0, 1]), A_BOUND[0], A_BOUND[1])
with tf.name_scope('local_grad'):
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self, scope):
w_init = tf.random_normal_initializer(0., .1)
with tf.variable_scope('critic'): # only critic controls the rnn update
cell_size = 64
s = tf.expand_dims(self.s, axis=1,
name='timely_input') # [time_step, feature] => [time_step, batch, feature]
rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
outputs, self.final_state = tf.nn.dynamic_rnn(
cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
l_c = tf.layers.dense(cell_out, 50, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
with tf.variable_scope('actor'): # state representation is based on critic
l_a = tf.layers.dense(cell_out, 80, tf.nn.relu6, kernel_initializer=w_init, name='la')
mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
return mu, sigma, v, a_params, c_params
def update_global(self, feed_dict): # run by a local
SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s, cell_state): # run by a local
s = s[np.newaxis, :]
a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state})
return a, cell_state
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME).unwrapped
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
keep_state = rnn_state.copy() # keep rnn state for updating global net
for ep_t in range(MAX_EP_STEP):
if self.name == 'W_0':
self.env.render()
a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
s_, r, done, info = self.env.step(a)
done = True if ep_t == MAX_EP_STEP - 1 else False
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append((r+8)/8) # normalize
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
self.AC.init_state: keep_state,
}
self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
s = s_
rnn_state = rnn_state_ # renew rnn state
total_step += 1
if done:
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
"| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
if os.path.exists(LOG_DIR):
shutil.rmtree(LOG_DIR)
tf.summary.FileWriter(LOG_DIR, SESS.graph)
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
plt.show()
================================================
FILE: contents/10_A3C/A3C_continuous_action.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning.
The Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.8.0
gym 0.10.5
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt
GAME = 'Pendulum-v0'
OUTPUT_GRAPH = True
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_EP_STEP = 200
MAX_GLOBAL_EP = 2000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_params, self.c_params = self._build_net(scope)[-2:]
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
mu, sigma, self.v, self.a_params, self.c_params = self._build_net(scope)
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('wrap_a_out'):
mu, sigma = mu * A_BOUND[1], sigma + 1e-4
normal_dist = tf.distributions.Normal(mu, sigma)
with tf.name_scope('a_loss'):
log_prob = normal_dist.log_prob(self.a_his)
exp_v = log_prob * tf.stop_gradient(td)
entropy = normal_dist.entropy() # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('choose_a'): # use local params to choose action
self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=[0, 1]), A_BOUND[0], A_BOUND[1])
with tf.name_scope('local_grad'):
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self, scope):
w_init = tf.random_normal_initializer(0., .1)
with tf.variable_scope('actor'):
l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
with tf.variable_scope('critic'):
l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
return mu, sigma, v, a_params, c_params
def update_global(self, feed_dict): # run by a local
SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s): # run by a local
s = s[np.newaxis, :]
return SESS.run(self.A, {self.s: s})
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME).unwrapped
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
for ep_t in range(MAX_EP_STEP):
# if self.name == 'W_0':
# self.env.render()
a = self.AC.choose_action(s)
s_, r, done, info = self.env.step(a)
done = True if ep_t == MAX_EP_STEP - 1 else False
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append((r+8)/8) # normalize
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
}
self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
s = s_
total_step += 1
if done:
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
"| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
if os.path.exists(LOG_DIR):
shutil.rmtree(LOG_DIR)
tf.summary.FileWriter(LOG_DIR, SESS.graph)
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
plt.show()
================================================
FILE: contents/10_A3C/A3C_discrete_action.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning.
The Cartpole example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.8.0
gym 0.10.5
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt
GAME = 'CartPole-v0'
OUTPUT_GRAPH = True
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 1000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.001
LR_A = 0.001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_params, self.c_params = self._build_net(scope)[-2:]
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('a_loss'):
log_prob = tf.reduce_sum(tf.log(self.a_prob + 1e-5) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
exp_v = log_prob * tf.stop_gradient(td)
entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
axis=1, keep_dims=True) # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('local_grad'):
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self, scope):
w_init = tf.random_normal_initializer(0., .1)
with tf.variable_scope('actor'):
l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap')
with tf.variable_scope('critic'):
l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
return a_prob, v, a_params, c_params
def update_global(self, feed_dict): # run by a local
SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s): # run by a local
prob_weights = SESS.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]})
action = np.random.choice(range(prob_weights.shape[1]),
p=prob_weights.ravel()) # select action w.r.t the actions prob
return action
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME).unwrapped
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
while True:
# if self.name == 'W_0':
# self.env.render()
a = self.AC.choose_action(s)
s_, r, done, info = self.env.step(a)
if done: r = -5
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
}
self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
s = s_
total_step += 1
if done:
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
"| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
if os.path.exists(LOG_DIR):
shutil.rmtree(LOG_DIR)
tf.summary.FileWriter(LOG_DIR, SESS.graph)
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
plt.show()
================================================
FILE: contents/10_A3C/A3C_distributed_tf.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C) with discrete action space, Reinforcement Learning.
The Cartpole example using distributed tensorflow + multiprocessing.
View more on my tutorial page: https://morvanzhou.github.io/
"""
import multiprocessing as mp
import tensorflow as tf
import numpy as np
import gym, time
import matplotlib.pyplot as plt
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.001
LR_A = 0.001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
env = gym.make('CartPole-v0')
N_S = env.observation_space.shape[0]
N_A = env.action_space.n
class ACNet(object):
sess = None
def __init__(self, scope, opt_a=None, opt_c=None, global_net=None):
if scope == 'global_net': # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_params, self.c_params = self._build_net(scope)[-2:]
else:
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('a_loss'):
log_prob = tf.reduce_sum(
tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32),
axis=1, keep_dims=True)
exp_v = log_prob * tf.stop_gradient(td)
entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
axis=1, keep_dims=True) # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('local_grad'):
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
self.global_step = tf.train.get_or_create_global_step()
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, global_net.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, global_net.c_params)]
with tf.name_scope('push'):
self.update_a_op = opt_a.apply_gradients(zip(self.a_grads, global_net.a_params), global_step=self.global_step)
self.update_c_op = opt_c.apply_gradients(zip(self.c_grads, global_net.c_params))
def _build_net(self, scope):
w_init = tf.random_normal_initializer(0., .1)
with tf.variable_scope('actor'):
l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la')
a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap')
with tf.variable_scope('critic'):
l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
return a_prob, v, a_params, c_params
def choose_action(self, s): # run by a local
prob_weights = self.sess.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]})
action = np.random.choice(range(prob_weights.shape[1]),
p=prob_weights.ravel()) # select action w.r.t the actions prob
return action
def update_global(self, feed_dict): # run by a local
self.sess.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net
def pull_global(self): # run by a local
self.sess.run([self.pull_a_params_op, self.pull_c_params_op])
def work(job_name, task_index, global_ep, lock, r_queue, global_running_r):
# set work's ip:port
cluster = tf.train.ClusterSpec({
"ps": ['localhost:2220', 'localhost:2221',],
"worker": ['localhost:2222', 'localhost:2223', 'localhost:2224', 'localhost:2225',]
})
server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
if job_name == 'ps':
print('Start Parameter Sever: ', task_index)
server.join()
else:
t1 = time.time()
env = gym.make('CartPole-v0').unwrapped
print('Start Worker: ', task_index)
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % task_index,
cluster=cluster)):
opt_a = tf.train.RMSPropOptimizer(LR_A, name='opt_a')
opt_c = tf.train.RMSPropOptimizer(LR_C, name='opt_c')
global_net = ACNet('global_net')
local_net = ACNet('local_ac%d' % task_index, opt_a, opt_c, global_net)
# set training steps
hooks = [tf.train.StopAtStepHook(last_step=100000)]
with tf.train.MonitoredTrainingSession(master=server.target,
is_chief=True,
hooks=hooks,) as sess:
print('Start Worker Session: ', task_index)
local_net.sess = sess
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while (not sess.should_stop()) and (global_ep.value < 1000):
s = env.reset()
ep_r = 0
while True:
# if task_index:
# env.render()
a = local_net.choose_action(s)
s_, r, done, info = env.step(a)
if done: r = -5.
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = sess.run(local_net.v, {local_net.s: s_[np.newaxis, :]})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(
buffer_v_target)
feed_dict = {
local_net.s: buffer_s,
local_net.a_his: buffer_a,
local_net.v_target: buffer_v_target,
}
local_net.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
local_net.pull_global()
s = s_
total_step += 1
if done:
if r_queue.empty(): # record running episode reward
global_running_r.value = ep_r
else:
global_running_r.value = .99 * global_running_r.value + 0.01 * ep_r
r_queue.put(global_running_r.value)
print(
"Task: %i" % task_index,
"| Ep: %i" % global_ep.value,
"| Ep_r: %i" % global_running_r.value,
"| Global_step: %i" % sess.run(local_net.global_step),
)
with lock:
global_ep.value += 1
break
print('Worker Done: ', task_index, time.time()-t1)
if __name__ == "__main__":
# use multiprocessing to create a local cluster with 2 parameter servers and 2 workers
global_ep = mp.Value('i', 0)
lock = mp.Lock()
r_queue = mp.Queue()
global_running_r = mp.Value('d', 0)
jobs = [
('ps', 0), ('ps', 1),
('worker', 0), ('worker', 1), ('worker', 2), ('worker', 3)
]
ps = [mp.Process(target=work, args=(j, i, global_ep, lock, r_queue, global_running_r), ) for j, i in jobs]
[p.start() for p in ps]
[p.join() for p in ps[2:]]
ep_r = []
while not r_queue.empty():
ep_r.append(r_queue.get())
plt.plot(np.arange(len(ep_r)), ep_r)
plt.title('Distributed training')
plt.xlabel('Step')
plt.ylabel('Total moving reward')
plt.show()
================================================
FILE: contents/11_Dyna_Q/RL_brain.py
================================================
"""
This part of code is the Dyna-Q learning brain, which is a brain of the agent.
All decisions and learning processes are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
class QLearningTable:
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
self.actions = actions # a list
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon = e_greedy
## argmax type error
self.q_table = pd.DataFrame(columns=self.actions).astype('float32')
def choose_action(self, observation):
self.check_state_exist(observation)
# action selection
if np.random.uniform() < self.epsilon:
# choose best action
# state_action = self.q_table.ix[observation, :]
state_action = self.q_table.loc[observation, :] # for label indexing
state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value
action = state_action.argmax()
else:
# choose random action
action = np.random.choice(self.actions)
return action
def learn(self, s, a, r, s_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[0]*len(self.actions),
index=self.q_table.columns,
name=state,
)
)
class EnvModel:
"""Similar to the memory buffer in DQN, you can store past experiences in here.
Alternatively, the model can generate next state and reward signal accurately."""
def __init__(self, actions):
# the simplest case is to think about the model is a memory which has all past transition information
self.actions = actions
self.database = pd.DataFrame(columns=actions, dtype=np.object)
def store_transition(self, s, a, r, s_):
if s not in self.database.index:
self.database = self.database.append(
pd.Series(
[None] * len(self.actions),
index=self.database.columns,
name=s,
))
self.database.set_value(s, a, (r, s_))
def sample_s_a(self):
s = np.random.choice(self.database.index)
a = np.random.choice(self.database.loc[s].dropna().index) # filter out the None value
return s, a
def get_r_s_(self, s, a):
r, s_ = self.database.loc[s, a]
return r, s_
================================================
FILE: contents/11_Dyna_Q/maze_env.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
np.random.seed(1)
import tkinter as tk
import time
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
else:
reward = 0
done = False
return s_, reward, done
def render(self):
# time.sleep(0.1)
self.update()
================================================
FILE: contents/11_Dyna_Q/run_this.py
================================================
"""
Simplest model-based RL, Dyna-Q.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the main part which controls the update method of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
from maze_env import Maze
from RL_brain import QLearningTable, EnvModel
def update():
for episode in range(40):
s = env.reset()
while True:
env.render()
a = RL.choose_action(str(s))
s_, r, done = env.step(a)
RL.learn(str(s), a, r, str(s_))
# use a model to output (r, s_) by inputting (s, a)
# the model in dyna Q version is just like a memory replay buffer
env_model.store_transition(str(s), a, r, s_)
for n in range(10): # learn 10 more times using the env_model
ms, ma = env_model.sample_s_a() # ms in here is a str
mr, ms_ = env_model.get_r_s_(ms, ma)
RL.learn(ms, ma, mr, str(ms_))
s = s_
if done:
break
# end of game
print('game over')
env.destroy()
if __name__ == "__main__":
env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))
env_model = EnvModel(actions=list(range(env.n_actions)))
env.after(0, update)
env.mainloop()
================================================
FILE: contents/12_Proximal_Policy_Optimization/DPPO.py
================================================
"""
A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
Restart workers once PPO is updated.
The global PPO updating rule is adopted from DeepMind's paper (DPPO):
Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
View more on my tutorial website: https://morvanzhou.github.io/tutorials
Dependencies:
tensorflow r1.3
gym 0.9.2
"""
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym, threading, queue
EP_MAX = 1000
EP_LEN = 200
N_WORKER = 4 # parallel workers
GAMMA = 0.9 # reward discount factor
A_LR = 0.0001 # learning rate for actor
C_LR = 0.0002 # learning rate for critic
MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
UPDATE_STEP = 10 # loop update operation n-steps
EPSILON = 0.2 # for clipping surrogate objective
GAME = 'Pendulum-v0'
S_DIM, A_DIM = 3, 1 # state and action dimension
class PPO(object):
def __init__(self):
self.sess = tf.Session()
self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
# critic
l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
self.v = tf.layers.dense(l1, 1)
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
# actor
pi, pi_params = self._build_anet('pi', trainable=True)
oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
self.sample_op = tf.squeeze(pi.sample(1), axis=0) # operation of choosing action
self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
# ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
surr = ratio * self.tfadv # surrogate loss
self.aloss = -tf.reduce_mean(tf.minimum( # clipped surrogate objective
surr,
tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
self.sess.run(tf.global_variables_initializer())
def update(self):
global GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
if GLOBAL_EP < EP_MAX:
UPDATE_EVENT.wait() # wait until get batch of data
self.sess.run(self.update_oldpi_op) # copy pi to old pi
data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers
data = np.vstack(data)
s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
# update actor and critic in a update loop
[self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
[self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
UPDATE_EVENT.clear() # updating finished
GLOBAL_UPDATE_COUNTER = 0 # reset counter
ROLLING_EVENT.set() # set roll-out available
def _build_anet(self, name, trainable):
with tf.variable_scope(name):
l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
return norm_dist, params
def choose_action(self, s):
s = s[np.newaxis, :]
a = self.sess.run(self.sample_op, {self.tfs: s})[0]
return np.clip(a, -2, 2)
def get_v(self, s):
if s.ndim < 2: s = s[np.newaxis, :]
return self.sess.run(self.v, {self.tfs: s})[0, 0]
class Worker(object):
def __init__(self, wid):
self.wid = wid
self.env = gym.make(GAME).unwrapped
self.ppo = GLOBAL_PPO
def work(self):
global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
s = self.env.reset()
ep_r = 0
buffer_s, buffer_a, buffer_r = [], [], []
for t in range(EP_LEN):
if not ROLLING_EVENT.is_set(): # while global PPO is updating
ROLLING_EVENT.wait() # wait until PPO is updated
buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data
a = self.ppo.choose_action(s)
s_, r, done, _ = self.env.step(a)
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append((r + 8) / 8) # normalize reward, find to be useful
s = s_
ep_r += r
GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers
if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
v_s_ = self.ppo.get_v(s_)
discounted_r = [] # compute discounted reward
for r in buffer_r[::-1]:
v_s_ = r + GAMMA * v_s_
discounted_r.append(v_s_)
discounted_r.reverse()
bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
buffer_s, buffer_a, buffer_r = [], [], []
QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue
if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
ROLLING_EVENT.clear() # stop collecting data
UPDATE_EVENT.set() # globalPPO update
if GLOBAL_EP >= EP_MAX: # stop training
COORD.request_stop()
break
# record reward changes, plot later
if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
GLOBAL_EP += 1
print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
if __name__ == '__main__':
GLOBAL_PPO = PPO()
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
UPDATE_EVENT.clear() # not update now
ROLLING_EVENT.set() # start to roll out
workers = [Worker(wid=i) for i in range(N_WORKER)]
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue() # workers putting data in this queue
threads = []
for worker in workers: # worker threads
t = threading.Thread(target=worker.work, args=())
t.start() # training
threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update,))
threads[-1].start()
COORD.join(threads)
# plot reward change and test
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
env = gym.make('Pendulum-v0')
while True:
s = env.reset()
for t in range(300):
env.render()
s = env.step(GLOBAL_PPO.choose_action(s))[0]
================================================
FILE: contents/12_Proximal_Policy_Optimization/discrete_DPPO.py
================================================
"""
A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]
Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
Restart workers once PPO is updated.
The global PPO updating rule is adopted from DeepMind's paper (DPPO):
Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
View more on my tutorial website: https://morvanzhou.github.io/tutorials
Dependencies:
tensorflow 1.8.0
gym 0.9.2
"""
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym, threading, queue
EP_MAX = 1000
EP_LEN = 500
N_WORKER = 4 # parallel workers
GAMMA = 0.9 # reward discount factor
A_LR = 0.0001 # learning rate for actor
C_LR = 0.0001 # learning rate for critic
MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
UPDATE_STEP = 15 # loop update operation n-steps
EPSILON = 0.2 # for clipping surrogate objective
GAME = 'CartPole-v0'
env = gym.make(GAME)
S_DIM = env.observation_space.shape[0]
A_DIM = env.action_space.n
class PPONet(object):
def __init__(self):
self.sess = tf.Session()
self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
# critic
w_init = tf.random_normal_initializer(0., .1)
lc = tf.layers.dense(self.tfs, 200, tf.nn.relu, kernel_initializer=w_init, name='lc')
self.v = tf.layers.dense(lc, 1)
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
# actor
self.pi, pi_params = self._build_anet('pi', trainable=True)
oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
self.tfa = tf.placeholder(tf.int32, [None, ], 'action')
self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
a_indices = tf.stack([tf.range(tf.shape(self.tfa)[0], dtype=tf.int32), self.tfa], axis=1)
pi_prob = tf.gather_nd(params=self.pi, indices=a_indices) # shape=(None, )
oldpi_prob = tf.gather_nd(params=oldpi, indices=a_indices) # shape=(None, )
ratio = pi_prob/(oldpi_prob + 1e-5)
surr = ratio * self.tfadv # surrogate loss
self.aloss = -tf.reduce_mean(tf.minimum( # clipped surrogate objective
surr,
tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
self.sess.run(tf.global_variables_initializer())
def update(self):
global GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
if GLOBAL_EP < EP_MAX:
UPDATE_EVENT.wait() # wait until get batch of data
self.sess.run(self.update_oldpi_op) # copy pi to old pi
data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers
data = np.vstack(data)
s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + 1].ravel(), data[:, -1:]
adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
# update actor and critic in a update loop
[self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
[self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
UPDATE_EVENT.clear() # updating finished
GLOBAL_UPDATE_COUNTER = 0 # reset counter
ROLLING_EVENT.set() # set roll-out available
def _build_anet(self, name, trainable):
with tf.variable_scope(name):
l_a = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
a_prob = tf.layers.dense(l_a, A_DIM, tf.nn.softmax, trainable=trainable)
params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
return a_prob, params
def choose_action(self, s): # run by a local
prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[None, :]})
action = np.random.choice(range(prob_weights.shape[1]),
p=prob_weights.ravel()) # select action w.r.t the actions prob
return action
def get_v(self, s):
if s.ndim < 2: s = s[np.newaxis, :]
return self.sess.run(self.v, {self.tfs: s})[0, 0]
class Worker(object):
def __init__(self, wid):
self.wid = wid
self.env = gym.make(GAME).unwrapped
self.ppo = GLOBAL_PPO
def work(self):
global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
s = self.env.reset()
ep_r = 0
buffer_s, buffer_a, buffer_r = [], [], []
for t in range(EP_LEN):
if not ROLLING_EVENT.is_set(): # while global PPO is updating
ROLLING_EVENT.wait() # wait until PPO is updated
buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data
a = self.ppo.choose_action(s)
s_, r, done, _ = self.env.step(a)
if done: r = -10
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r-1) # 0 for not down, -11 for down. Reward engineering
s = s_
ep_r += r
GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers
if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
if done:
v_s_ = 0 # end of episode
else:
v_s_ = self.ppo.get_v(s_)
discounted_r = [] # compute discounted reward
for r in buffer_r[::-1]:
v_s_ = r + GAMMA * v_s_
discounted_r.append(v_s_)
discounted_r.reverse()
bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, None]
buffer_s, buffer_a, buffer_r = [], [], []
QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue
if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
ROLLING_EVENT.clear() # stop collecting data
UPDATE_EVENT.set() # globalPPO update
if GLOBAL_EP >= EP_MAX: # stop training
COORD.request_stop()
break
if done: break
# record reward changes, plot later
if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
GLOBAL_EP += 1
print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
if __name__ == '__main__':
GLOBAL_PPO = PPONet()
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
UPDATE_EVENT.clear() # not update now
ROLLING_EVENT.set() # start to roll out
workers = [Worker(wid=i) for i in range(N_WORKER)]
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue() # workers putting data in this queue
threads = []
for worker in workers: # worker threads
t = threading.Thread(target=worker.work, args=())
t.start() # training
threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update,))
threads[-1].start()
COORD.join(threads)
# plot reward change and test
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
env = gym.make('CartPole-v0')
while True:
s = env.reset()
for t in range(1000):
env.render()
s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))
if done:
break
================================================
FILE: contents/12_Proximal_Policy_Optimization/simply_PPO.py
================================================
"""
A simple version of Proximal Policy Optimization (PPO) using single thread.
Based on:
1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347]
View more on my tutorial website: https://morvanzhou.github.io/tutorials
Dependencies:
tensorflow r1.2
gym 0.9.2
"""
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym
EP_MAX = 1000
EP_LEN = 200
GAMMA = 0.9
A_LR = 0.0001
C_LR = 0.0002
BATCH = 32
A_UPDATE_STEPS = 10
C_UPDATE_STEPS = 10
S_DIM, A_DIM = 3, 1
METHOD = [
dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
][1] # choose the method for optimization
class PPO(object):
def __init__(self):
self.sess = tf.Session()
self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
# critic
with tf.variable_scope('critic'):
l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
self.v = tf.layers.dense(l1, 1)
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
# actor
pi, pi_params = self._build_anet('pi', trainable=True)
oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
with tf.variable_scope('sample_action'):
self.sample_op = tf.squeeze(pi.sample(1), axis=0) # choosing action
with tf.variable_scope('update_oldpi'):
self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
with tf.variable_scope('loss'):
with tf.variable_scope('surrogate'):
# ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
surr = ratio * self.tfadv
if METHOD['name'] == 'kl_pen':
self.tflam = tf.placeholder(tf.float32, None, 'lambda')
kl = tf.distributions.kl_divergence(oldpi, pi)
self.kl_mean = tf.reduce_mean(kl)
self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
else: # clipping method, find this is better
self.aloss = -tf.reduce_mean(tf.minimum(
surr,
tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
with tf.variable_scope('atrain'):
self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
tf.summary.FileWriter("log/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
def update(self, s, a, r):
self.sess.run(self.update_oldpi_op)
adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
# adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
# update actor
if METHOD['name'] == 'kl_pen':
for _ in range(A_UPDATE_STEPS):
_, kl = self.sess.run(
[self.atrain_op, self.kl_mean],
{self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
if kl > 4*METHOD['kl_target']: # this in in google's paper
break
if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
METHOD['lam'] /= 2
elif kl > METHOD['kl_target'] * 1.5:
METHOD['lam'] *= 2
METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is my solution
else: # clipping method, find this is better (OpenAI's paper)
[self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]
# update critic
[self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]
def _build_anet(self, name, trainable):
with tf.variable_scope(name):
l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
return norm_dist, params
def choose_action(self, s):
s = s[np.newaxis, :]
a = self.sess.run(self.sample_op, {self.tfs: s})[0]
return np.clip(a, -2, 2)
def get_v(self, s):
if s.ndim < 2: s = s[np.newaxis, :]
return self.sess.run(self.v, {self.tfs: s})[0, 0]
env = gym.make('Pendulum-v0').unwrapped
ppo = PPO()
all_ep_r = []
for ep in range(EP_MAX):
s = env.reset()
buffer_s, buffer_a, buffer_r = [], [], []
ep_r = 0
for t in range(EP_LEN): # in one episode
env.render()
a = ppo.choose_action(s)
s_, r, done, _ = env.step(a)
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append((r+8)/8) # normalize reward, find to be useful
s = s_
ep_r += r
# update ppo
if (t+1) % BATCH == 0 or t == EP_LEN-1:
v_s_ = ppo.get_v(s_)
discounted_r = []
for r in buffer_r[::-1]:
v_s_ = r + GAMMA * v_s_
discounted_r.append(v_s_)
discounted_r.reverse()
bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
buffer_s, buffer_a, buffer_r = [], [], []
ppo.update(bs, ba, br)
if ep == 0: all_ep_r.append(ep_r)
else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
print(
'Ep: %i' % ep,
"|Ep_r: %i" % ep_r,
("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
)
plt.plot(np.arange(len(all_ep_r)), all_ep_r)
plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
================================================
FILE: contents/1_command_line_reinforcement_learning/treasure_on_right.py
================================================
"""
A simple example for Reinforcement Learning using table lookup Q-learning method.
An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
Run this program and to see how the agent will improve its strategy of finding the treasure.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
import time
np.random.seed(2) # reproducible
N_STATES = 6 # the length of the 1 dimensional world
ACTIONS = ['left', 'right'] # available actions
EPSILON = 0.9 # greedy police
ALPHA = 0.1 # learning rate
GAMMA = 0.9 # discount factor
MAX_EPISODES = 13 # maximum episodes
FRESH_TIME = 0.3 # fresh time for one move
def build_q_table(n_states, actions):
table = pd.DataFrame(
np.zeros((n_states, len(actions))), # q_table initial values
columns=actions, # actions's name
)
# print(table) # show table
return table
def choose_action(state, q_table):
# This is how to choose an action
state_actions = q_table.iloc[state, :]
if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value
action_name = np.random.choice(ACTIONS)
else: # act greedy
action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas
return action_name
def get_env_feedback(S, A):
# This is how agent will interact with the environment
if A == 'right': # move right
if S == N_STATES - 2: # terminate
S_ = 'terminal'
R = 1
else:
S_ = S + 1
R = 0
else: # move left
R = 0
if S == 0:
S_ = S # reach the wall
else:
S_ = S - 1
return S_, R
def update_env(S, episode, step_counter):
# This is how environment be updated
env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment
if S == 'terminal':
interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
print('\r{}'.format(interaction), end='')
time.sleep(2)
print('\r ', end='')
else:
env_list[S] = 'o'
interaction = ''.join(env_list)
print('\r{}'.format(interaction), end='')
time.sleep(FRESH_TIME)
def rl():
# main part of RL loop
q_table = build_q_table(N_STATES, ACTIONS)
for episode in range(MAX_EPISODES):
step_counter = 0
S = 0
is_terminated = False
update_env(S, episode, step_counter)
while not is_terminated:
A = choose_action(S, q_table)
S_, R = get_env_feedback(S, A) # take action & get next state and reward
q_predict = q_table.loc[S, A]
if S_ != 'terminal':
q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal
else:
q_target = R # next state is terminal
is_terminated = True # terminate this episode
q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update
S = S_ # move to next state
update_env(S, episode, step_counter+1)
step_counter += 1
return q_table
if __name__ == "__main__":
q_table = rl()
print('\r\nQ-table:\n')
print(q_table)
================================================
FILE: contents/2_Q_Learning_maze/RL_brain.py
================================================
"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
class QLearningTable:
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
self.actions = actions # a list
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon = e_greedy
self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
def choose_action(self, observation):
self.check_state_exist(observation)
# action selection
if np.random.uniform() < self.epsilon:
# choose best action
state_action = self.q_table.loc[observation, :]
# some actions may have the same value, randomly choose on in these actions
action = np.random.choice(state_action[state_action == np.max(state_action)].index)
else:
# choose random action
action = np.random.choice(self.actions)
return action
def learn(self, s, a, r, s_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[0]*len(self.actions),
index=self.q_table.columns,
name=state,
)
)
================================================
FILE: contents/2_Q_Learning_maze/maze_env.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
s_ = 'terminal'
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
s_ = 'terminal'
else:
reward = 0
done = False
return s_, reward, done
def render(self):
time.sleep(0.1)
self.update()
def update():
for t in range(10):
s = env.reset()
while True:
env.render()
a = 1
s, r, done = env.step(a)
if done:
break
if __name__ == '__main__':
env = Maze()
env.after(100, update)
env.mainloop()
================================================
FILE: contents/2_Q_Learning_maze/run_this.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the main part which controls the update method of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
from maze_env import Maze
from RL_brain import QLearningTable
def update():
for episode in range(100):
# initial observation
observation = env.reset()
while True:
# fresh env
env.render()
# RL choose action based on observation
action = RL.choose_action(str(observation))
# RL take action and get next observation and reward
observation_, reward, done = env.step(action)
# RL learn from this transition
RL.learn(str(observation), action, reward, str(observation_))
# swap observation
observation = observation_
# break while loop when end of this episode
if done:
break
# end of game
print('game over')
env.destroy()
if __name__ == "__main__":
env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
================================================
FILE: contents/3_Sarsa_maze/RL_brain.py
================================================
"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
class RL(object):
def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
self.actions = action_space # a list
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon = e_greedy
self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[0]*len(self.actions),
index=self.q_table.columns,
name=state,
)
)
def choose_action(self, observation):
self.check_state_exist(observation)
# action selection
if np.random.rand() < self.epsilon:
# choose best action
state_action = self.q_table.loc[observation, :]
# some actions may have the same value, randomly choose on in these actions
action = np.random.choice(state_action[state_action == np.max(state_action)].index)
else:
# choose random action
action = np.random.choice(self.actions)
return action
def learn(self, *args):
pass
# off-policy
class QLearningTable(RL):
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
def learn(self, s, a, r, s_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
# on-policy
class SarsaTable(RL):
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
def learn(self, s, a, r, s_, a_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, a_] # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
================================================
FILE: contents/3_Sarsa_maze/maze_env.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
s_ = 'terminal'
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
s_ = 'terminal'
else:
reward = 0
done = False
return s_, reward, done
def render(self):
time.sleep(0.1)
self.update()
================================================
FILE: contents/3_Sarsa_maze/run_this.py
================================================
"""
Sarsa is a online updating method for Reinforcement learning.
Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory.
You will see the sarsa is more coward when punishment is close because it cares about all behaviours,
while q learning is more brave because it only cares about maximum behaviour.
"""
from maze_env import Maze
from RL_brain import SarsaTable
def update():
for episode in range(100):
# initial observation
observation = env.reset()
# RL choose action based on observation
action = RL.choose_action(str(observation))
while True:
# fresh env
env.render()
# RL take action and get next observation and reward
observation_, reward, done = env.step(action)
# RL choose action based on next observation
action_ = RL.choose_action(str(observation_))
# RL learn from this transition (s, a, r, s, a) ==> Sarsa
RL.learn(str(observation), action, reward, str(observation_), action_)
# swap observation and action
observation = observation_
action = action_
# break while loop when end of this episode
if done:
break
# end of game
print('game over')
env.destroy()
if __name__ == "__main__":
env = Maze()
RL = SarsaTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
================================================
FILE: contents/4_Sarsa_lambda_maze/RL_brain.py
================================================
"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
class RL(object):
def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
self.actions = action_space # a list
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon = e_greedy
self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[0]*len(self.actions),
index=self.q_table.columns,
name=state,
)
)
def choose_action(self, observation):
self.check_state_exist(observation)
# action selection
if np.random.rand() < self.epsilon:
# choose best action
state_action = self.q_table.loc[observation, :]
# some actions may have the same value, randomly choose on in these actions
action = np.random.choice(state_action[state_action == np.max(state_action)].index)
else:
# choose random action
action = np.random.choice(self.actions)
return action
def learn(self, *args):
pass
# backward eligibility traces
class SarsaLambdaTable(RL):
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9):
super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
# backward view, eligibility trace.
self.lambda_ = trace_decay
self.eligibility_trace = self.q_table.copy()
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
to_be_append = pd.Series(
[0] * len(self.actions),
index=self.q_table.columns,
name=state,
)
self.q_table = self.q_table.append(to_be_append)
# also update eligibility trace
self.eligibility_trace = self.eligibility_trace.append(to_be_append)
def learn(self, s, a, r, s_, a_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, a_] # next state is not terminal
else:
q_target = r # next state is terminal
error = q_target - q_predict
# increase trace amount for visited state-action pair
# Method 1:
# self.eligibility_trace.loc[s, a] += 1
# Method 2:
self.eligibility_trace.loc[s, :] *= 0
self.eligibility_trace.loc[s, a] = 1
# Q update
self.q_table += self.lr * error * self.eligibility_trace
# decay eligibility trace after update
self.eligibility_trace *= self.gamma*self.lambda_
================================================
FILE: contents/4_Sarsa_lambda_maze/maze_env.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
s_ = 'terminal'
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
s_ = 'terminal'
else:
reward = 0
done = False
return s_, reward, done
def render(self):
time.sleep(0.05)
self.update()
================================================
FILE: contents/4_Sarsa_lambda_maze/run_this.py
================================================
"""
Sarsa is a online updating method for Reinforcement learning.
Unlike Q learning which is a offline updating method, Sarsa is updating while in the current trajectory.
You will see the sarsa is more coward when punishment is close because it cares about all behaviours,
while q learning is more brave because it only cares about maximum behaviour.
"""
from maze_env import Maze
from RL_brain import SarsaLambdaTable
def update():
for episode in range(100):
# initial observation
observation = env.reset()
# RL choose action based on observation
action = RL.choose_action(str(observation))
# initial all zero eligibility trace
RL.eligibility_trace *= 0
while True:
# fresh env
env.render()
# RL take action and get next observation and reward
observation_, reward, done = env.step(action)
# RL choose action based on next observation
action_ = RL.choose_action(str(observation_))
# RL learn from this transition (s, a, r, s, a) ==> Sarsa
RL.learn(str(observation), action, reward, str(observation_), action_)
# swap observation and action
observation = observation_
action = action_
# break while loop when end of this episode
if done:
break
# end of game
print('game over')
env.destroy()
if __name__ == "__main__":
env = Maze()
RL = SarsaLambdaTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
================================================
FILE: contents/5.1_Double_DQN/RL_brain.py
================================================
"""
The double DQN based on this paper: https://arxiv.org/abs/1509.06461
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class DoubleDQN:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.005,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=200,
memory_size=3000,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
double_q=True,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
self.double_q = double_q # decide to use double q or not
self.learn_step_counter = 0
self.memory = np.zeros((self.memory_size, n_features*2+2))
self._build_net()
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
out = tf.matmul(l1, w2) + b2
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
with tf.variable_scope('eval_net'):
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
with tf.variable_scope('loss'):
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
def store_transition(self, s, a, r, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
observation = observation[np.newaxis, :]
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
if not hasattr(self, 'q'): # record action value it gets
self.q = []
self.running_q = 0
self.running_q = self.running_q*0.99 + 0.01 * np.max(actions_value)
self.q.append(self.running_q)
if np.random.uniform() > self.epsilon: # choosing action
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('\ntarget_params_replaced\n')
if self.memory_counter > self.memory_size:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_next, q_eval4next = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation
self.s: batch_memory[:, -self.n_features:]}) # next observation
q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
if self.double_q:
max_act4next = np.argmax(q_eval4next, axis=1) # the action that brings the highest value is evaluated by q_eval
selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions
else:
selected_q_next = np.max(q_next, axis=1) # the natural DQN
q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target})
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
================================================
FILE: contents/5.1_Double_DQN/run_Pendulum.py
================================================
"""
Double DQN & Natural DQN comparison,
The Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import DoubleDQN
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)
MEMORY_SIZE = 3000
ACTION_SPACE = 11
sess = tf.Session()
with tf.variable_scope('Natural_DQN'):
natural_DQN = DoubleDQN(
n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
e_greedy_increment=0.001, double_q=False, sess=sess
)
with tf.variable_scope('Double_DQN'):
double_DQN = DoubleDQN(
n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True)
sess.run(tf.global_variables_initializer())
def train(RL):
total_steps = 0
observation = env.reset()
while True:
# if total_steps - MEMORY_SIZE > 8000: env.render()
action = RL.choose_action(observation)
f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions
observation_, reward, done, info = env.step(np.array([f_action]))
reward /= 10 # normalize to a range of (-1, 0). r = 0 when get upright
# the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0
# so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result.
RL.store_transition(observation, action, reward, observation_)
if total_steps > MEMORY_SIZE: # learning
RL.learn()
if total_steps - MEMORY_SIZE > 20000: # stop game
break
observation = observation_
total_steps += 1
return RL.q
q_natural = train(natural_DQN)
q_double = train(double_DQN)
plt.plot(np.array(q_natural), c='r', label='natural')
plt.plot(np.array(q_double), c='b', label='double')
plt.legend(loc='best')
plt.ylabel('Q eval')
plt.xlabel('training steps')
plt.grid()
plt.show()
================================================
FILE: contents/5.2_Prioritized_Replay_DQN/RL_brain.py
================================================
"""
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class SumTree(object):
"""
This SumTree code is a modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Story data with its priority in the tree.
"""
data_pointer = 0
def __init__(self, capacity):
self.capacity = capacity # for all priority values
self.tree = np.zeros(2 * capacity - 1)
# [--------------Parent nodes-------------][-------leaves to recode priority-------]
# size: capacity - 1 size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add(self, p, data):
tree_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(tree_idx, p) # update tree_frame
self.data_pointer += 1
if self.data_pointer >= self.capacity: # replace when exceed the capacity
self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
# then propagate the change through tree
while tree_idx != 0: # this method is faster than the recursive loop in the reference code
tree_idx = (tree_idx - 1) // 2
self.tree[tree_idx] += change
def get_leaf(self, v):
"""
Tree structure and array storage:
Tree index:
0 -> storing priority sum
/ \
1 2
/ \ / \
3 4 5 6 -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
parent_idx = 0
while True: # the while loop is faster than the method in the reference code
cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids
cr_idx = cl_idx + 1
if cl_idx >= len(self.tree): # reach bottom, end search
leaf_idx = parent_idx
break
else: # downward search, always search for a higher priority node
if v <= self.tree[cl_idx]:
parent_idx = cl_idx
else:
v -= self.tree[cl_idx]
parent_idx = cr_idx
data_idx = leaf_idx - self.capacity + 1
return leaf_idx, self.tree[leaf_idx], self.data[data_idx]
@property
def total_p(self):
return self.tree[0] # the root
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This Memory class is modified based on the original code from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
epsilon = 0.01 # small amount to avoid zero priority
alpha = 0.6 # [0~1] convert the importance of TD error to priority
beta = 0.4 # importance-sampling, from initial value increasing to 1
beta_increment_per_sampling = 0.001
abs_err_upper = 1. # clipped abs error
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, transition):
max_p = np.max(self.tree.tree[-self.tree.capacity:])
if max_p == 0:
max_p = self.abs_err_upper
self.tree.add(max_p, transition) # set the max p for new p
def sample(self, n):
b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1))
pri_seg = self.tree.total_p / n # priority segment
self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight
for i in range(n):
a, b = pri_seg * i, pri_seg * (i + 1)
v = np.random.uniform(a, b)
idx, p, data = self.tree.get_leaf(v)
prob = p / self.tree.total_p
ISWeights[i, 0] = np.power(prob/min_prob, -self.beta)
b_idx[i], b_memory[i, :] = idx, data
return b_idx, b_memory, ISWeights
def batch_update(self, tree_idx, abs_errors):
abs_errors += self.epsilon # convert to abs and avoid 0
clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
ps = np.power(clipped_errors, self.alpha)
for ti, p in zip(tree_idx, ps):
self.tree.update(ti, p)
class DQNPrioritizedReplay:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.005,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=500,
memory_size=10000,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
prioritized=True,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
self.prioritized = prioritized # decide to use double q or not
self.learn_step_counter = 0
self._build_net()
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
if self.prioritized:
self.memory = Memory(capacity=memory_size)
else:
self.memory = np.zeros((self.memory_size, n_features*2+2))
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, n_l1, w_initializer, b_initializer, trainable):
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names, trainable=trainable)
l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names, trainable=trainable)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names, trainable=trainable)
out = tf.matmul(l1, w2) + b2
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
if self.prioritized:
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('eval_net'):
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer, True)
with tf.variable_scope('loss'):
if self.prioritized:
self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1) # for updating Sumtree
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
else:
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer, False)
def store_transition(self, s, a, r, s_):
if self.prioritized: # prioritized replay
transition = np.hstack((s, [a, r], s_))
self.memory.store(transition) # have high priority for newly arrived transition
else: # random replay
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('\ntarget_params_replaced\n')
if self.prioritized:
tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
else:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_next, q_eval = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={self.s_: batch_memory[:, -self.n_features:],
self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
if self.prioritized:
_, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target,
self.ISWeights: ISWeights})
self.memory.batch_update(tree_idx, abs_errors) # update priority
else:
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target})
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
================================================
FILE: contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py
================================================
"""
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import DQNPrioritizedReplay
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
env = gym.make('MountainCar-v0')
env = env.unwrapped
env.seed(21)
MEMORY_SIZE = 10000
sess = tf.Session()
with tf.variable_scope('natural_DQN'):
RL_natural = DQNPrioritizedReplay(
n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
e_greedy_increment=0.00005, sess=sess, prioritized=False,
)
with tf.variable_scope('DQN_with_prioritized_replay'):
RL_prio = DQNPrioritizedReplay(
n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
e_greedy_increment=0.00005, sess=sess, prioritized=True, output_graph=True,
)
sess.run(tf.global_variables_initializer())
def train(RL):
total_steps = 0
steps = []
episodes = []
for i_episode in range(20):
observation = env.reset()
while True:
# env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
if done: reward = 10
RL.store_transition(observation, action, reward, observation_)
if total_steps > MEMORY_SIZE:
RL.learn()
if done:
print('episode ', i_episode, ' finished')
steps.append(total_steps)
episodes.append(i_episode)
break
observation = observation_
total_steps += 1
return np.vstack((episodes, steps))
his_natural = train(RL_natural)
his_prio = train(RL_prio)
# compare based on first success
plt.plot(his_natural[0, :], his_natural[1, :] - his_natural[1, 0], c='b', label='natural DQN')
plt.plot(his_prio[0, :], his_prio[1, :] - his_prio[1, 0], c='r', label='DQN with prioritized replay')
plt.legend(loc='best')
plt.ylabel('total training time')
plt.xlabel('episode')
plt.grid()
plt.show()
================================================
FILE: contents/5.3_Dueling_DQN/RL_brain.py
================================================
"""
The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class DuelingDQN:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.001,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=200,
memory_size=500,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
dueling=True,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
self.dueling = dueling # decide to use dueling DQN or not
self.learn_step_counter = 0
self.memory = np.zeros((self.memory_size, n_features*2+2))
self._build_net()
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
if self.dueling:
# Dueling DQN
with tf.variable_scope('Value'):
w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
self.V = tf.matmul(l1, w2) + b2
with tf.variable_scope('Advantage'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.A = tf.matmul(l1, w2) + b2
with tf.variable_scope('Q'):
out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
else:
with tf.variable_scope('Q'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
out = tf.matmul(l1, w2) + b2
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
with tf.variable_scope('eval_net'):
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
with tf.variable_scope('loss'):
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
def store_transition(self, s, a, r, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon: # choosing action
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('\ntarget_params_replaced\n')
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_next = self.sess.run(self.q_next, feed_dict={self.s_: batch_memory[:, -self.n_features:]}) # next observation
q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target})
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
================================================
FILE: contents/5.3_Dueling_DQN/run_Pendulum.py
================================================
"""
Dueling DQN & Natural DQN comparison
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import DuelingDQN
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)
MEMORY_SIZE = 3000
ACTION_SPACE = 25
sess = tf.Session()
with tf.variable_scope('natural'):
natural_DQN = DuelingDQN(
n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
e_greedy_increment=0.001, sess=sess, dueling=False)
with tf.variable_scope('dueling'):
dueling_DQN = DuelingDQN(
n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
e_greedy_increment=0.001, sess=sess, dueling=True, output_graph=True)
sess.run(tf.global_variables_initializer())
def train(RL):
acc_r = [0]
total_steps = 0
observation = env.reset()
while True:
# if total_steps-MEMORY_SIZE > 9000: env.render()
action = RL.choose_action(observation)
f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions
observation_, reward, done, info = env.step(np.array([f_action]))
reward /= 10 # normalize to a range of (-1, 0)
acc_r.append(reward + acc_r[-1]) # accumulated reward
RL.store_transition(observation, action, reward, observation_)
if total_steps > MEMORY_SIZE:
RL.learn()
if total_steps-MEMORY_SIZE > 15000:
break
observation = observation_
total_steps += 1
return RL.cost_his, acc_r
c_natural, r_natural = train(natural_DQN)
c_dueling, r_dueling = train(dueling_DQN)
plt.figure(1)
plt.plot(np.array(c_natural), c='r', label='natural')
plt.plot(np.array(c_dueling), c='b', label='dueling')
plt.legend(loc='best')
plt.ylabel('cost')
plt.xlabel('training steps')
plt.grid()
plt.figure(2)
plt.plot(np.array(r_natural), c='r', label='natural')
plt.plot(np.array(r_dueling), c='b', label='dueling')
plt.legend(loc='best')
plt.ylabel('accumulated reward')
plt.xlabel('training steps')
plt.grid()
plt.show()
================================================
FILE: contents/5_Deep_Q_Network/DQN_modified.py
================================================
"""
This part of code is the Deep Q Network (DQN) brain.
view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: r1.2
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
# Deep Q Network off-policy
class DeepQNetwork:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=300,
memory_size=500,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
# total learning step
self.learn_step_counter = 0
# initialize zero memory [s, a, r, s_]
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
# consist of [target_net, evaluate_net]
self._build_net()
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
self.sess = tf.Session()
if output_graph:
# $ tensorboard --logdir=logs
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
self.cost_his = []
def _build_net(self):
# ------------------ all inputs ------------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input State
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input Next State
self.r = tf.placeholder(tf.float32, [None, ], name='r') # input Reward
self.a = tf.placeholder(tf.int32, [None, ], name='a') # input Action
w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
# ------------------ build evaluate_net ------------------
with tf.variable_scope('eval_net'):
e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
bias_initializer=b_initializer, name='e1')
self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
bias_initializer=b_initializer, name='q')
# ------------------ build target_net ------------------
with tf.variable_scope('target_net'):
t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
bias_initializer=b_initializer, name='t1')
self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
bias_initializer=b_initializer, name='t2')
with tf.variable_scope('q_target'):
q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_') # shape=(None, )
self.q_target = tf.stop_gradient(q_target)
with tf.variable_scope('q_eval'):
a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices) # shape=(None, )
with tf.variable_scope('loss'):
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
def store_transition(self, s, a, r, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
# replace the old memory with new memory
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
# to have batch dimension when feed into tf placeholder
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
# forward feed the observation and get q value for every actions
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
# check to replace target parameters
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.target_replace_op)
print('\ntarget_params_replaced\n')
# sample batch memory from all memory
if self.memory_counter > self.memory_size:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
_, cost = self.sess.run(
[self._train_op, self.loss],
feed_dict={
self.s: batch_memory[:, :self.n_features],
self.a: batch_memory[:, self.n_features],
self.r: batch_memory[:, self.n_features + 1],
self.s_: batch_memory[:, -self.n_features:],
})
self.cost_his.append(cost)
# increasing epsilon
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
def plot_cost(self):
import matplotlib.pyplot as plt
plt.plot(np.arange(len(self.cost_his)), self.cost_his)
plt.ylabel('Cost')
plt.xlabel('training steps')
plt.show()
if __name__ == '__main__':
DQN = DeepQNetwork(3,4, output_graph=True)
================================================
FILE: contents/5_Deep_Q_Network/RL_brain.py
================================================
"""
This part of code is the DQN brain, which is a brain of the agent.
All decisions are made in here.
Using Tensorflow to build the neural network.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.7.3
"""
import numpy as np
import pandas as pd
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
# Deep Q Network off-policy
class DeepQNetwork:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=300,
memory_size=500,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
# total learning step
self.learn_step_counter = 0
# initialize zero memory [s, a, r, s_]
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
# consist of [target_net, evaluate_net]
self._build_net()
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
self.sess = tf.Session()
if output_graph:
# $ tensorboard --logdir=logs
# tf.train.SummaryWriter soon be deprecated, use following
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
self.cost_his = []
def _build_net(self):
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
with tf.variable_scope('eval_net'):
# c_names(collections_names) are the collections to store variables
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
# first layer. collections is used later when assign to target net
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
# second layer. collections is used later when assign to target net
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.q_eval = tf.matmul(l1, w2) + b2
with tf.variable_scope('loss'):
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
# c_names(collections_names) are the collections to store variables
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
# first layer. collections is used later when assign to target net
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
# second layer. collections is used later when assign to target net
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.q_next = tf.matmul(l1, w2) + b2
def store_transition(self, s, a, r, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
# replace the old memory with new memory
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
# to have batch dimension when feed into tf placeholder
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
# forward feed the observation and get q value for every actions
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
# check to replace target parameters
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('\ntarget_params_replaced\n')
# sample batch memory from all memory
if self.memory_counter > self.memory_size:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_next, q_eval = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={
self.s_: batch_memory[:, -self.n_features:], # fixed params
self.s: batch_memory[:, :self.n_features], # newest params
})
# change q_target w.r.t q_eval's action
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
"""
For example in this batch I have 2 samples and 3 actions:
q_eval =
[[1, 2, 3],
[4, 5, 6]]
q_target = q_eval =
[[1, 2, 3],
[4, 5, 6]]
Then change q_target with the real q_target value w.r.t the q_eval's action.
For example in:
sample 0, I took action 0, and the max q_target value is -1;
sample 1, I took action 2, and the max q_target value is -2:
q_target =
[[-1, 2, 3],
[4, 5, -2]]
So the (q_target - q_eval) becomes:
[[(-1)-(1), 0, 0],
[0, 0, (-2)-(6)]]
We then backpropagate this error w.r.t the corresponding action to network,
leave other action as error=0 cause we didn't choose it.
"""
# train eval network
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target})
self.cost_his.append(self.cost)
# increasing epsilon
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
def plot_cost(self):
import matplotlib.pyplot as plt
plt.plot(np.arange(len(self.cost_his)), self.cost_his)
plt.ylabel('Cost')
plt.xlabel('training steps')
plt.show()
================================================
FILE: contents/5_Deep_Q_Network/maze_env.py
================================================
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.n_features = 2
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
# hell2_center = origin + np.array([UNIT, UNIT * 2])
# self.hell2 = self.canvas.create_rectangle(
# hell2_center[0] - 15, hell2_center[1] - 15,
# hell2_center[0] + 15, hell2_center[1] + 15,
# fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.1)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
next_coords = self.canvas.coords(self.rect) # next state
# reward function
if next_coords == self.canvas.coords(self.oval):
reward = 1
done = True
elif next_coords in [self.canvas.coords(self.hell1)]:
reward = -1
done = True
else:
reward = 0
done = False
s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
return s_, reward, done
def render(self):
# time.sleep(0.01)
self.update()
================================================
FILE: contents/5_Deep_Q_Network/run_this.py
================================================
from maze_env import Maze
from RL_brain import DeepQNetwork
def run_maze():
step = 0
for episode in range(300):
# initial observation
observation = env.reset()
while True:
# fresh env
env.render()
# RL choose action based on observation
action = RL.choose_action(observation)
# RL take action and get next observation and reward
observation_, reward, done = env.step(action)
RL.store_transition(observation, action, reward, observation_)
if (step > 200) and (step % 5 == 0):
RL.learn()
# swap observation
observation = observation_
# break while loop when end of this episode
if done:
break
step += 1
# end of game
print('game over')
env.destroy()
if __name__ == "__main__":
# maze game
env = Maze()
RL = DeepQNetwork(env.n_actions, env.n_features,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=200,
memory_size=2000,
# output_graph=True
)
env.after(100, run_maze)
env.mainloop()
RL.plot_cost()
================================================
FILE: contents/6_OpenAI_gym/RL_brain.py
================================================
"""
This part of code is the DQN brain, which is a brain of the agent.
All decisions are made in here.
Using Tensorflow to build the neural network.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import pandas as pd
import tensorflow as tf
# Deep Q Network off-policy
class DeepQNetwork:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=300,
memory_size=500,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
# total learning step
self.learn_step_counter = 0
# initialize zero memory [s, a, r, s_]
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
# consist of [target_net, evaluate_net]
self._build_net()
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
self.sess = tf.Session()
if output_graph:
# $ tensorboard --logdir=logs
# tf.train.SummaryWriter soon be deprecated, use following
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
self.cost_his = []
def _build_net(self):
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
with tf.variable_scope('eval_net'):
# c_names(collections_names) are the collections to store variables
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) # config of layers
# first layer. collections is used later when assign to target net
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
# second layer. collections is used later when assign to target net
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.q_eval = tf.matmul(l1, w2) + b2
with tf.variable_scope('loss'):
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
# c_names(collections_names) are the collections to store variables
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
# first layer. collections is used later when assign to target net
with tf.variable_scope('l1'):
w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
# second layer. collections is used later when assign to target net
with tf.variable_scope('l2'):
w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.q_next = tf.matmul(l1, w2) + b2
def store_transition(self, s, a, r, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r], s_))
# replace the old memory with new memory
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
# to have batch dimension when feed into tf placeholder
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
# forward feed the observation and get q value for every actions
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def learn(self):
# check to replace target parameters
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('\ntarget_params_replaced\n')
# sample batch memory from all memory
if self.memory_counter > self.memory_size:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_next, q_eval = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={
self.s_: batch_memory[:, -self.n_features:], # fixed params
self.s: batch_memory[:, :self.n_features], # newest params
})
# change q_target w.r.t q_eval's action
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
"""
For example in this batch I have 2 samples and 3 actions:
q_eval =
[[1, 2, 3],
[4, 5, 6]]
q_target = q_eval =
[[1, 2, 3],
[4, 5, 6]]
Then change q_target with the real q_target value w.r.t the q_eval's action.
For example in:
sample 0, I took action 0, and the max q_target value is -1;
sample 1, I took action 2, and the max q_target value is -2:
q_target =
[[-1, 2, 3],
[4, 5, -2]]
So the (q_target - q_eval) becomes:
[[(-1)-(1), 0, 0],
[0, 0, (-2)-(6)]]
We then backpropagate this error w.r.t the corresponding action to network,
leave other action as error=0 cause we didn't choose it.
"""
# train eval network
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target})
self.cost_his.append(self.cost)
# increasing epsilon
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
def plot_cost(self):
import matplotlib.pyplot as plt
plt.plot(np.arange(len(self.cost_his)), self.cost_his)
plt.ylabel('Cost')
plt.xlabel('training steps')
plt.show()
================================================
FILE: contents/6_OpenAI_gym/run_CartPole.py
================================================
"""
Deep Q network,
Using:
Tensorflow: 1.0
gym: 0.7.3
"""
import gym
from RL_brain import DeepQNetwork
env = gym.make('CartPole-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = DeepQNetwork(n_actions=env.action_space.n,
n_features=env.observation_space.shape[0],
learning_rate=0.01, e_greedy=0.9,
replace_target_iter=100, memory_size=2000,
e_greedy_increment=0.001,)
total_steps = 0
for i_episode in range(100):
observation = env.reset()
ep_r = 0
while True:
env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
# the smaller theta and closer to center the better
x, x_dot, theta, theta_dot = observation_
r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
reward = r1 + r2
RL.store_transition(observation, action, reward, observation_)
ep_r += reward
if total_steps > 1000:
RL.learn()
if done:
print('episode: ', i_episode,
'ep_r: ', round(ep_r, 2),
' epsilon: ', round(RL.epsilon, 2))
break
observation = observation_
total_steps += 1
RL.plot_cost()
================================================
FILE: contents/6_OpenAI_gym/run_MountainCar.py
================================================
"""
Deep Q network,
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import DeepQNetwork
env = gym.make('MountainCar-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, e_greedy=0.9,
replace_target_iter=300, memory_size=3000,
e_greedy_increment=0.0002,)
total_steps = 0
for i_episode in range(10):
observation = env.reset()
ep_r = 0
while True:
env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
position, velocity = observation_
# the higher the better
reward = abs(position - (-0.5)) # r in [0, 1]
RL.store_transition(observation, action, reward, observation_)
if total_steps > 1000:
RL.learn()
ep_r += reward
if done:
get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----'
print('Epi: ', i_episode,
get,
'| Ep_r: ', round(ep_r, 4),
'| Epsilon: ', round(RL.epsilon, 2))
break
observation = observation_
total_steps += 1
RL.plot_cost()
================================================
FILE: contents/7_Policy_gradient_softmax/RL_brain.py
================================================
"""
This part of code is the reinforcement learning brain, which is a brain of the agent.
All decisions are made in here.
Policy Gradient, Reinforcement Learning.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import tensorflow as tf
# reproducible
np.random.seed(1)
tf.set_random_seed(1)
class PolicyGradient:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.01,
reward_decay=0.95,
output_graph=False,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.ep_obs, self.ep_as, self.ep_rs = [], [], []
self._build_net()
self.sess = tf.Session()
if output_graph:
# $ tensorboard --logdir=logs
# http://0.0.0.0:6006/
# tf.train.SummaryWriter soon be deprecated, use following
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
def _build_net(self):
with tf.name_scope('inputs'):
self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
# fc1
layer = tf.layers.dense(
inputs=self.tf_obs,
units=10,
activation=tf.nn.tanh, # tanh activation
kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
bias_initializer=tf.constant_initializer(0.1),
name='fc1'
)
# fc2
all_act = tf.layers.dense(
inputs=layer,
units=self.n_actions,
activation=None,
kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
bias_initializer=tf.constant_initializer(0.1),
name='fc2'
)
self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability
with tf.name_scope('loss'):
# to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss)
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is negative log of chosen action
# or in this way:
# neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss
with tf.name_scope('train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
def choose_action(self, observation):
prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # select action w.r.t the actions prob
return action
def store_transition(self, s, a, r):
self.ep_obs.append(s)
self.ep_as.append(a)
self.ep_rs.append(r)
def learn(self):
# discount and normalize episode reward
discounted_ep_rs_norm = self._discount_and_norm_rewards()
# train on episode
self.sess.run(self.train_op, feed_dict={
self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs]
self.tf_acts: np.array(self.ep_as), # shape=[None, ]
self.tf_vt: discounted_ep_rs_norm, # shape=[None, ]
})
self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data
return discounted_ep_rs_norm
def _discount_and_norm_rewards(self):
# discount episode rewards
discounted_ep_rs = np.zeros_like(self.ep_rs)
running_add = 0
for t in reversed(range(0, len(self.ep_rs))):
running_add = running_add * self.gamma + self.ep_rs[t]
discounted_ep_rs[t] = running_add
# normalize episode rewards
discounted_ep_rs -= np.mean(discounted_ep_rs)
discounted_ep_rs /= np.std(discounted_ep_rs)
return discounted_ep_rs
================================================
FILE: contents/7_Policy_gradient_softmax/run_CartPole.py
================================================
"""
Policy Gradient, Reinforcement Learning.
The cart pole example
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt
DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold
RENDER = False # rendering wastes time
env = gym.make('CartPole-v0')
env.seed(1) # reproducible, general Policy gradient has high variance
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = PolicyGradient(
n_actions=env.action_space.n,
n_features=env.observation_space.shape[0],
learning_rate=0.02,
reward_decay=0.99,
# output_graph=True,
)
for i_episode in range(3000):
observation = env.reset()
while True:
if RENDER: env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
RL.store_transition(observation, action, reward)
if done:
ep_rs_sum = sum(RL.ep_rs)
if 'running_reward' not in globals():
running_reward = ep_rs_sum
else:
running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
print("episode:", i_episode, " reward:", int(running_reward))
vt = RL.learn()
if i_episode == 0:
plt.plot(vt) # plot the episode vt
plt.xlabel('episode steps')
plt.ylabel('normalized state-action value')
plt.show()
break
observation = observation_
================================================
FILE: contents/7_Policy_gradient_softmax/run_MountainCar.py
================================================
"""
Policy Gradient, Reinforcement Learning.
The cart pole example
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt
DISPLAY_REWARD_THRESHOLD = -2000 # renders environment if total episode reward is greater then this threshold
# episode: 154 reward: -10667
# episode: 387 reward: -2009
# episode: 489 reward: -1006
# episode: 628 reward: -502
RENDER = False # rendering wastes time
env = gym.make('MountainCar-v0')
env.seed(1) # reproducible, general Policy gradient has high variance
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = PolicyGradient(
n_actions=env.action_space.n,
n_features=env.observation_space.shape[0],
learning_rate=0.02,
reward_decay=0.995,
# output_graph=True,
)
for i_episode in range(1000):
observation = env.reset()
while True:
if RENDER: env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action) # reward = -1 in all cases
RL.store_transition(observation, action, reward)
if done:
# calculate running reward
ep_rs_sum = sum(RL.ep_rs)
if 'running_reward' not in globals():
running_reward = ep_rs_sum
else:
running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
print("episode:", i_episode, " reward:", int(running_reward))
vt = RL.learn() # train
if i_episode == 30:
plt.plot(vt) # plot the episode vt
plt.xlabel('episode steps')
plt.ylabel('normalized state-action value')
plt.show()
break
observation = observation_
================================================
FILE: contents/8_Actor_Critic_Advantage/AC_CartPole.py
================================================
"""
Actor-Critic using TD-error as the Advantage, Reinforcement Learning.
The cart pole example. Policy is oscillated.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
import numpy as np
import tensorflow as tf
import gym
np.random.seed(2)
tf.set_random_seed(2) # reproducible
# Superparameters
OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold
MAX_EP_STEPS = 1000 # maximum time step in one episode
RENDER = False # rendering wastes time
GAMMA = 0.9 # reward discount in TD error
LR_A = 0.001 # learning rate for actor
LR_C = 0.01 # learning rate for critic
env = gym.make('CartPole-v0')
env.seed(1) # reproducible
env = env.unwrapped
N_F = env.observation_space.shape[0]
N_A = env.action_space.n
class Actor(object):
def __init__(self, sess, n_features, n_actions, lr=0.001):
self.sess = sess
self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.a = tf.placeholder(tf.int32, None, "act")
self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error
with tf.variable_scope('Actor'):
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)
self.acts_prob = tf.layers.dense(
inputs=l1,
units=n_actions, # output units
activation=tf.nn.softmax, # get action probabilities
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='acts_prob'
)
with tf.variable_scope('exp_v'):
log_prob = tf.log(self.acts_prob[0, self.a])
self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss
with tf.variable_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v)
def learn(self, s, a, td):
s = s[np.newaxis, :]
feed_dict = {self.s: s, self.a: a, self.td_error: td}
_, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
return exp_v
def choose_action(self, s):
s = s[np.newaxis, :]
probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int
class Critic(object):
def __init__(self, sess, n_features, lr=0.01):
self.sess = sess
self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
self.r = tf.placeholder(tf.float32, None, 'r')
with tf.variable_scope('Critic'):
l1 = tf.layers.dense(
inputs=self.s,
units=20, # number of hidden units
activation=tf.nn.relu, # None
# have to be linear to make sure the convergence of actor.
# But linear approximator seems hardly learns the correct Q.
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)
self.v = tf.layers.dense(
inputs=l1,
units=1, # output units
activation=None,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='V'
)
with tf.variable_scope('squared_TD_error'):
self.td_error = self.r + GAMMA * self.v_ - self.v
self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
with tf.variable_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
def learn(self, s, r, s_):
s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
v_ = self.sess.run(self.v, {self.s: s_})
td_error, _ = self.sess.run([self.td_error, self.train_op],
{self.s: s, self.v_: v_, self.r: r})
return td_error
sess = tf.Session()
actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor
sess.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph)
for i_episode in range(MAX_EPISODE):
s = env.reset()
t = 0
track_r = []
while True:
if RENDER: env.render()
a = actor.choose_action(s)
s_, r, done, info = env.step(a)
if done: r = -20
track_r.append(r)
td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
s = s_
t += 1
if done or t >= MAX_EP_STEPS:
ep_rs_sum = sum(track_r)
if 'running_reward' not in globals():
running_reward = ep_rs_sum
else:
running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
print("episode:", i_episode, " reward:", int(running_reward))
break
================================================
FILE: contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py
================================================
"""
Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
Cannot converge!!! oscillate!!!
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow r1.3
gym 0.8.0
"""
import tensorflow as tf
import numpy as np
import gym
np.random.seed(2)
tf.set_random_seed(2) # reproducible
class Actor(object):
def __init__(self, sess, n_features, action_bound, lr=0.0001):
self.sess = sess
self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.a = tf.placeholder(tf.float32, None, name="act")
self.td_error = tf.placeholder(tf.float32, None, name="td_error") # TD_error
l1 = tf.layers.dense(
inputs=self.s,
units=30, # number of hidden units
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)
mu = tf.layers.dense(
inputs=l1,
units=1, # number of hidden units
activation=tf.nn.tanh,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='mu'
)
sigma = tf.layers.dense(
inputs=l1,
units=1, # output units
activation=tf.nn.softplus, # get action probabilities
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(1.), # biases
name='sigma'
)
global_step = tf.Variable(0, trainable=False)
# self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])
with tf.name_scope('exp_v'):
log_prob = self.normal_dist.log_prob(self.a) # loss without advantage
self.exp_v = log_prob * self.td_error # advantage (TD_error) guided loss
# Add cross entropy cost to encourage exploration
self.exp_v += 0.01*self.normal_dist.entropy()
with tf.name_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step) # min(v) = max(-v)
def learn(self, s, a, td):
s = s[np.newaxis, :]
feed_dict = {self.s: s, self.a: a, self.td_error: td}
_, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
return exp_v
def choose_action(self, s):
s = s[np.newaxis, :]
return self.sess.run(self.action, {self.s: s}) # get probabilities for all actions
class Critic(object):
def __init__(self, sess, n_features, lr=0.01):
self.sess = sess
with tf.name_scope('inputs'):
self.s = tf.placeholder(tf.float32, [1, n_features], "state")
self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
self.r = tf.placeholder(tf.float32, name='r')
with tf.variable_scope('Critic'):
l1 = tf.layers.dense(
inputs=self.s,
units=30, # number of hidden units
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='l1'
)
self.v = tf.layers.dense(
inputs=l1,
units=1, # output units
activation=None,
kernel_initializer=tf.random_normal_initializer(0., .1), # weights
bias_initializer=tf.constant_initializer(0.1), # biases
name='V'
)
with tf.variable_scope('squared_TD_error'):
self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
with tf.variable_scope('train'):
self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
def learn(self, s, r, s_):
s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
v_ = self.sess.run(self.v, {self.s: s_})
td_error, _ = self.sess.run([self.td_error, self.train_op],
{self.s: s, self.v_: v_, self.r: r})
return td_error
OUTPUT_GRAPH = False
MAX_EPISODE = 1000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -100 # renders environment if total episode reward is greater then this threshold
RENDER = False # rendering wastes time
GAMMA = 0.9
LR_A = 0.001 # learning rate for actor
LR_C = 0.01 # learning rate for critic
env = gym.make('Pendulum-v0')
env.seed(1) # reproducible
env = env.unwrapped
N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high
sess = tf.Session()
actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)
sess.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph)
for i_episode in range(MAX_EPISODE):
s = env.reset()
t = 0
ep_rs = []
while True:
# if RENDER:
env.render()
a = actor.choose_action(s)
s_, r, done, info = env.step(a)
r /= 10
td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
s = s_
t += 1
ep_rs.append(r)
if t > MAX_EP_STEPS:
ep_rs_sum = sum(ep_rs)
if 'running_reward' not in globals():
running_reward = ep_rs_sum
else:
running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
print("episode:", i_episode, " reward:", int(running_reward))
break
================================================
FILE: contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py
================================================
"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
import tensorflow as tf
import numpy as np
import gym
import time
np.random.seed(1)
tf.set_random_seed(1)
##################### hyper parameters ####################
MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GAMMA = 0.9 # reward discount
REPLACEMENT = [
dict(name='soft', tau=0.01),
dict(name='hard', rep_iter_a=600, rep_iter_c=500)
][0] # you can try different target replacement strategies
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = False
OUTPUT_GRAPH = True
ENV_NAME = 'Pendulum-v0'
############################### Actor ####################################
class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, replacement):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.replacement = replacement
self.t_replace_counter = 0
with tf.variable_scope('Actor'):
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True)
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False)
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replace = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)]
def _build_net(self, s, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.3)
init_b = tf.constant_initializer(0.1)
net = tf.layers.dense(s, 30, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a
def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s})
if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replace)
else:
if self.t_replace_counter % self.replacement['rep_iter_a'] == 0:
self.sess.run(self.hard_replace)
self.t_replace_counter += 1
def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action
def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
# ys = policy;
# xs = policy's parameters;
# a_grads = the gradients of the policy to get more Q
# tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
with tf.variable_scope('A_train'):
opt = tf.train.AdamOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
############################### Critic ####################################
class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, replacement, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.replacement = replacement
with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = tf.stop_gradient(a) # stop critic update flows to actor
self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_
with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, self.a)[0] # tensor of gradients of each sample (None, a_dim)
if self.replacement['name'] == 'hard':
self.t_replace_counter = 0
self.hard_replacement = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
else:
self.soft_replacement = [tf.assign(t, (1 - self.replacement['tau']) * t + self.replacement['tau'] * e)
for t, e in zip(self.t_params, self.e_params)]
def _build_net(self, s, a, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.1)
init_b = tf.constant_initializer(0.1)
with tf.variable_scope('l1'):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q
def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.replacement['name'] == 'soft':
self.sess.run(self.soft_replacement)
else:
if self.t_replace_counter % self.replacement['rep_iter_c'] == 0:
self.sess.run(self.hard_replacement)
self.t_replace_counter += 1
##################### Memory ####################
class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1
def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :]
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high
# all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, state_dim], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, state_dim], name='s_')
sess = tf.Session()
# Create actor and critic.
# They are actually connected to each other, details can be seen in tensorboard or in this picture:
actor = Actor(sess, action_dim, action_bound, LR_A, REPLACEMENT)
critic = Critic(sess, state_dim, action_dim, LR_C, GAMMA, REPLACEMENT, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)
sess.run(tf.global_variables_initializer())
M = Memory(MEMORY_CAPACITY, dims=2 * state_dim + action_dim + 1)
if OUTPUT_GRAPH:
tf.summary.FileWriter("logs/", sess.graph)
var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
for j in range(MAX_EP_STEPS):
if RENDER:
env.render()
# Add exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a)
M.store_transition(s, a, r / 10, s_)
if M.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :state_dim]
b_a = b_M[:, state_dim: state_dim + action_dim]
b_r = b_M[:, -state_dim - 1: -state_dim]
b_s_ = b_M[:, -state_dim:]
critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s)
s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:
RENDER = True
break
print('Running time: ', time.time()-t1)
================================================
FILE: contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update.py
================================================
"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
import tensorflow as tf
import numpy as np
import gym
import time
##################### hyper parameters ####################
MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001 # learning rate for actor
LR_C = 0.002 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = False
ENV_NAME = 'Pendulum-v0'
############################### DDPG ####################################
class DDPG(object):
def __init__(self, a_dim, s_dim, a_bound,):
self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
self.pointer = 0
self.sess = tf.Session()
self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
self.R = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self._build_a(self.S, scope='eval', trainable=True)
a_ = self._build_a(self.S_, scope='target', trainable=False)
with tf.variable_scope('Critic'):
# assign self.a = a in memory when calculating q for td_error,
# otherwise the self.a is from Actor when updating Actor
q = self._build_c(self.S, self.a, scope='eval', trainable=True)
q_ = self._build_c(self.S_, a_, scope='target', trainable=False)
# networks parameters
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# target net replacement
self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]
q_target = self.R + GAMMA * q_
# in the feed_dic for the td_error, the self.a should change to actions in memory
td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)
a_loss = - tf.reduce_mean(q) # maximize the q
self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
self.sess.run(tf.global_variables_initializer())
def choose_action(self, s):
return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
bt = self.memory[indices, :]
bs = bt[:, :self.s_dim]
ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
br = bt[:, -self.s_dim - 1: -self.s_dim]
bs_ = bt[:, -self.s_dim:]
self.sess.run(self.atrain, {self.S: bs})
self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
self.memory[index, :] = transition
self.pointer += 1
def _build_a(self, s, scope, trainable):
with tf.variable_scope(scope):
net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name='scaled_a')
def _build_c(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)
############################### training ####################################
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(a_dim, s_dim, a_bound)
var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
for j in range(MAX_EP_STEPS):
if RENDER:
env.render()
# Add exploration noise
a = ddpg.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a)
ddpg.store_transition(s, a, r / 10, s_)
if ddpg.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
ddpg.learn()
s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
# if ep_reward > -300:RENDER = True
break
print('Running time: ', time.time() - t1)
================================================
FILE: contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update2.py
================================================
"""
Note: This is a updated version from my previous code,
for the target network, I use moving average to soft replace target parameters instead using assign function.
By doing this, it has 20% speed up on my machine (CPU).
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
import tensorflow as tf
import numpy as np
import gym
import time
##################### hyper parameters ####################
MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001 # learning rate for actor
LR_C = 0.002 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = False
ENV_NAME = 'Pendulum-v0'
############################### DDPG ####################################
class DDPG(object):
def __init__(self, a_dim, s_dim, a_bound,):
self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
self.pointer = 0
self.sess = tf.Session()
self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
self.R = tf.placeholder(tf.float32, [None, 1], 'r')
self.a = self._build_a(self.S,)
q = self._build_c(self.S, self.a, )
a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement
def ema_getter(getter, name, *args, **kwargs):
return ema.average(getter(name, *args, **kwargs))
target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation
a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter) # replaced target parameters
q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)
a_loss = - tf.reduce_mean(q) # maximize the q
self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
with tf.control_dependencies(target_update): # soft replacement happened at here
q_target = self.R + GAMMA * q_
td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
self.sess.run(tf.global_variables_initializer())
def choose_action(self, s):
return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
def learn(self):
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
bt = self.memory[indices, :]
bs = bt[:, :self.s_dim]
ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
br = bt[:, -self.s_dim - 1: -self.s_dim]
bs_ = bt[:, -self.s_dim:]
self.sess.run(self.atrain, {self.S: bs})
self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
self.memory[index, :] = transition
self.pointer += 1
def _build_a(self, s, reuse=None, custom_getter=None):
trainable = True if reuse is None else False
with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name='scaled_a')
def _build_c(self, s, a, reuse=None, custom_getter=None):
trainable = True if reuse is None else False
with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)
############################### training ####################################
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(a_dim, s_dim, a_bound)
var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
for j in range(MAX_EP_STEPS):
if RENDER:
env.render()
# Add exploration noise
a = ddpg.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a)
ddpg.store_transition(s, a, r / 10, s_)
if ddpg.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
ddpg.learn()
s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
# if ep_reward > -300:RENDER = True
break
print('Running time: ', time.time() - t1)
================================================
FILE: contents/Curiosity_Model/Curiosity.py
================================================
"""This is a simple implementation of [Large-Scale Study of Curiosity-Driven Learning](https://arxiv.org/abs/1808.04355)"""
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
class CuriosityNet:
def __init__(
self,
n_a,
n_s,
lr=0.01,
gamma=0.98,
epsilon=0.95,
replace_target_iter=300,
memory_size=10000,
batch_size=128,
output_graph=False,
):
self.n_a = n_a
self.n_s = n_s
self.lr = lr
self.gamma = gamma
self.epsilon = epsilon
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
# total learning step
self.learn_step_counter = 0
self.memory_counter = 0
# initialize zero memory [s, a, r, s_]
self.memory = np.zeros((self.memory_size, n_s * 2 + 2))
self.tfs, self.tfa, self.tfr, self.tfs_, self.dyn_train, self.dqn_train, self.q, self.int_r = \
self._build_nets()
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
self.sess = tf.Session()
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
def _build_nets(self):
tfs = tf.placeholder(tf.float32, [None, self.n_s], name="s") # input State
tfa = tf.placeholder(tf.int32, [None, ], name="a") # input Action
tfr = tf.placeholder(tf.float32, [None, ], name="ext_r") # extrinsic reward
tfs_ = tf.placeholder(tf.float32, [None, self.n_s], name="s_") # input Next State
# dynamics net
dyn_s_, curiosity, dyn_train = self._build_dynamics_net(tfs, tfa, tfs_)
# normal RL model
total_reward = tf.add(curiosity, tfr, name="total_r")
q, dqn_loss, dqn_train = self._build_dqn(tfs, tfa, total_reward, tfs_)
return tfs, tfa, tfr, tfs_, dyn_train, dqn_train, q, curiosity
def _build_dynamics_net(self, s, a, s_):
with tf.variable_scope("dyn_net"):
float_a = tf.expand_dims(tf.cast(a, dtype=tf.float32, name="float_a"), axis=1, name="2d_a")
sa = tf.concat((s, float_a), axis=1, name="sa")
encoded_s_ = s_ # here we use s_ as the encoded s_
dyn_l = tf.layers.dense(sa, 32, activation=tf.nn.relu)
dyn_s_ = tf.layers.dense(dyn_l, self.n_s) # predicted s_
with tf.name_scope("int_r"):
squared_diff = tf.reduce_sum(tf.square(encoded_s_ - dyn_s_), axis=1) # intrinsic reward
# It is better to reduce the learning rate in order to stay curious
train_op = tf.train.RMSPropOptimizer(self.lr, name="dyn_opt").minimize(tf.reduce_mean(squared_diff))
return dyn_s_, squared_diff, train_op
def _build_dqn(self, s, a, r, s_):
with tf.variable_scope('eval_net'):
e1 = tf.layers.dense(s, 128, tf.nn.relu)
q = tf.layers.dense(e1, self.n_a, name="q")
with tf.variable_scope('target_net'):
t1 = tf.layers.dense(s_, 128, tf.nn.relu)
q_ = tf.layers.dense(t1, self.n_a, name="q_")
with tf.variable_scope('q_target'):
q_target = r + self.gamma * tf.reduce_max(q_, axis=1, name="Qmax_s_")
with tf.variable_scope('q_wrt_a'):
a_indices = tf.stack([tf.range(tf.shape(a)[0], dtype=tf.int32), a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices)
loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_wrt_a) # TD error
train_op = tf.train.RMSPropOptimizer(self.lr, name="dqn_opt").minimize(
loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "eval_net"))
return q, loss, train_op
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, [a, r], s_))
# replace the old memory with new memory
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
# to have batch dimension when feed into tf placeholder
s = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
# forward feed the observation and get q value for every actions
actions_value = self.sess.run(self.q, feed_dict={self.tfs: s})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_a)
return action
def learn(self):
# check to replace target parameters
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.target_replace_op)
# sample batch memory from all memory
top = self.memory_size if self.memory_counter > self.memory_size else self.memory_counter
sample_index = np.random.choice(top, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
bs, ba, br, bs_ = batch_memory[:, :self.n_s], batch_memory[:, self.n_s], \
batch_memory[:, self.n_s + 1], batch_memory[:, -self.n_s:]
self.sess.run(self.dqn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfr: br, self.tfs_: bs_})
if self.learn_step_counter % 1000 == 0: # delay training in order to stay curious
self.sess.run(self.dyn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfs_: bs_})
self.learn_step_counter += 1
env = gym.make('MountainCar-v0')
env = env.unwrapped
dqn = CuriosityNet(n_a=3, n_s=2, lr=0.01, output_graph=False)
ep_steps = []
for epi in range(200):
s = env.reset()
steps = 0
while True:
env.render()
a = dqn.choose_action(s)
s_, r, done, info = env.step(a)
dqn.store_transition(s, a, r, s_)
dqn.learn()
if done:
print('Epi: ', epi, "| steps: ", steps)
ep_steps.append(steps)
break
s = s_
steps += 1
plt.plot(ep_steps)
plt.ylabel("steps")
plt.xlabel("episode")
plt.show()
================================================
FILE: contents/Curiosity_Model/Random_Network_Distillation.py
================================================
"""This is a simple implementation of [Exploration by Random Network Distillation](https://arxiv.org/abs/1810.12894)"""
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
class CuriosityNet:
def __init__(
self,
n_a,
n_s,
lr=0.01,
gamma=0.95,
epsilon=1.,
replace_target_iter=300,
memory_size=10000,
batch_size=128,
output_graph=False,
):
self.n_a = n_a
self.n_s = n_s
self.lr = lr
self.gamma = gamma
self.epsilon = epsilon
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.s_encode_size = 1000 # give a hard job for predictor to learn
# total learning step
self.learn_step_counter = 0
self.memory_counter = 0
# initialize zero memory [s, a, r, s_]
self.memory = np.zeros((self.memory_size, n_s * 2 + 2))
self.tfs, self.tfa, self.tfr, self.tfs_, self.pred_train, self.dqn_train, self.q = \
self._build_nets()
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
with tf.variable_scope('hard_replacement'):
self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
self.sess = tf.Session()
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.sess.run(tf.global_variables_initializer())
def _build_nets(self):
tfs = tf.placeholder(tf.float32, [None, self.n_s], name="s") # input State
tfa = tf.placeholder(tf.int32, [None, ], name="a") # input Action
tfr = tf.placeholder(tf.float32, [None, ], name="ext_r") # extrinsic reward
tfs_ = tf.placeholder(tf.float32, [None, self.n_s], name="s_") # input Next State
# fixed random net
with tf.variable_scope("random_net"):
rand_encode_s_ = tf.layers.dense(tfs_, self.s_encode_size)
# predictor
ri, pred_train = self._build_predictor(tfs_, rand_encode_s_)
# normal RL model
q, dqn_loss, dqn_train = self._build_dqn(tfs, tfa, ri, tfr, tfs_)
return tfs, tfa, tfr, tfs_, pred_train, dqn_train, q
def _build_predictor(self, s_, rand_encode_s_):
with tf.variable_scope("predictor"):
net = tf.layers.dense(s_, 128, tf.nn.relu)
out = tf.layers.dense(net, self.s_encode_size)
with tf.name_scope("int_r"):
ri = tf.reduce_sum(tf.square(rand_encode_s_ - out), axis=1) # intrinsic reward
train_op = tf.train.RMSPropOptimizer(self.lr, name="predictor_opt").minimize(
tf.reduce_mean(ri), var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predictor"))
return ri, train_op
def _build_dqn(self, s, a, ri, re, s_):
with tf.variable_scope('eval_net'):
e1 = tf.layers.dense(s, 128, tf.nn.relu)
q = tf.layers.dense(e1, self.n_a, name="q")
with tf.variable_scope('target_net'):
t1 = tf.layers.dense(s_, 128, tf.nn.relu)
q_ = tf.layers.dense(t1, self.n_a, name="q_")
with tf.variable_scope('q_target'):
q_target = re + ri + self.gamma * tf.reduce_max(q_, axis=1, name="Qmax_s_")
with tf.variable_scope('q_wrt_a'):
a_indices = tf.stack([tf.range(tf.shape(a)[0], dtype=tf.int32), a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices)
loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_wrt_a) # TD error
train_op = tf.train.RMSPropOptimizer(self.lr, name="dqn_opt").minimize(
loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "eval_net"))
return q, loss, train_op
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, [a, r], s_))
# replace the old memory with new memory
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
# to have batch dimension when feed into tf placeholder
s = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
# forward feed the observation and get q value for every actions
actions_value = self.sess.run(self.q, feed_dict={self.tfs: s})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_a)
return action
def learn(self):
# check to replace target parameters
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.target_replace_op)
# sample batch memory from all memory
top = self.memory_size if self.memory_counter > self.memory_size else self.memory_counter
sample_index = np.random.choice(top, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
bs, ba, br, bs_ = batch_memory[:, :self.n_s], batch_memory[:, self.n_s], \
batch_memory[:, self.n_s + 1], batch_memory[:, -self.n_s:]
self.sess.run(self.dqn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfr: br, self.tfs_: bs_})
if self.learn_step_counter % 100 == 0: # delay training in order to stay curious
self.sess.run(self.pred_train, feed_dict={self.tfs_: bs_})
self.learn_step_counter += 1
env = gym.make('MountainCar-v0')
env = env.unwrapped
dqn = CuriosityNet(n_a=3, n_s=2, lr=0.01, output_graph=False)
ep_steps = []
for epi in range(200):
s = env.reset()
steps = 0
while True:
# env.render()
a = dqn.choose_action(s)
s_, r, done, info = env.step(a)
dqn.store_transition(s, a, r, s_)
dqn.learn()
if done:
print('Epi: ', epi, "| steps: ", steps)
ep_steps.append(steps)
break
s = s_
steps += 1
plt.plot(ep_steps)
plt.ylabel("steps")
plt.xlabel("episode")
plt.show()
================================================
FILE: experiments/2D_car/DDPG.py
================================================
"""
Environment is a 2D car.
Car has 5 sensors to obtain distance information.
Car collision => reward = -1, otherwise => reward = 0.
You can train this RL by using LOAD = False, after training, this model will be store in the a local folder.
Using LOAD = True to reload the trained model for playing.
You can customize this script in a way you want.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Requirement:
pyglet >= 1.2.4
numpy >= 1.12.1
tensorflow >= 1.0.1
"""
import tensorflow as tf
import numpy as np
import os
import shutil
from car_env import CarEnv
np.random.seed(1)
tf.set_random_seed(1)
MAX_EPISODES = 500
MAX_EP_STEPS = 600
LR_A = 1e-4 # learning rate for actor
LR_C = 1e-4 # learning rate for critic
GAMMA = 0.9 # reward discount
REPLACE_ITER_A = 800
REPLACE_ITER_C = 700
MEMORY_CAPACITY = 2000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = False
DISCRETE_ACTION = False
env = CarEnv(discrete_action=DISCRETE_ACTION)
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound
# all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Actor'):
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True)
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False)
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
def _build_net(self, s, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.contrib.layers.xavier_initializer()
init_b = tf.constant_initializer(0.001)
net = tf.layers.dense(s, 100, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
net = tf.layers.dense(net, 20, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l2',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a
def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
self.t_replace_counter += 1
def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action
def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
with tf.variable_scope('A_train'):
opt = tf.train.RMSPropOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = a
self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_
with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
with tf.variable_scope('C_train'):
self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim)
def _build_net(self, s, a, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.contrib.layers.xavier_initializer()
init_b = tf.constant_initializer(0.01)
with tf.variable_scope('l1'):
n_l1 = 100
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
net = tf.layers.dense(net, 20, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l2',
trainable=trainable)
with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q
def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
self.t_replace_counter += 1
class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1
def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :]
sess = tf.Session()
# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)
M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
saver = tf.train.Saver()
path = './discrete' if DISCRETE_ACTION else './continuous'
if LOAD:
saver.restore(sess, tf.train.latest_checkpoint(path))
else:
sess.run(tf.global_variables_initializer())
def train():
var = 2. # control exploration
for ep in range(MAX_EPISODES):
s = env.reset()
ep_step = 0
for t in range(MAX_EP_STEPS):
# while True:
if RENDER:
env.render()
# Added exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration
s_, r, done = env.step(a)
M.store_transition(s, a, r, s_)
if M.pointer > MEMORY_CAPACITY:
var = max([var*.9995, VAR_MIN]) # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :STATE_DIM]
b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
b_s_ = b_M[:, -STATE_DIM:]
critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s)
s = s_
ep_step += 1
if done or t == MAX_EP_STEPS - 1:
# if done:
print('Ep:', ep,
'| Steps: %i' % int(ep_step),
'| Explore: %.2f' % var,
)
break
if os.path.isdir(path): shutil.rmtree(path)
os.mkdir(path)
ckpt_path = os.path.join(path, 'DDPG.ckpt')
save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
print("\nSave Model %s\n" % save_path)
def eval():
env.set_fps(30)
while True:
s = env.reset()
while True:
env.render()
a = actor.choose_action(s)
s_, r, done = env.step(a)
s = s_
if done:
break
if __name__ == '__main__':
if LOAD:
eval()
else:
train()
================================================
FILE: experiments/2D_car/car_env.py
================================================
"""
Environment for 2D car driving.
You can customize this script in a way you want.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Requirement:
pyglet >= 1.2.4
numpy >= 1.12.1
"""
import numpy as np
import pyglet
pyglet.clock.set_fps_limit(10000)
class CarEnv(object):
n_sensor = 5
action_dim = 1
state_dim = n_sensor
viewer = None
viewer_xy = (500, 500)
sensor_max = 150.
start_point = [450, 300]
speed = 50.
dt = 0.1
def __init__(self, discrete_action=False):
self.is_discrete_action = discrete_action
if discrete_action:
self.actions = [-1, 0, 1]
else:
self.action_bound = [-1, 1]
self.terminal = False
# node1 (x, y, r, w, l),
self.car_info = np.array([0, 0, 0, 20, 40], dtype=np.float64) # car coordination
self.obstacle_coords = np.array([
[120, 120],
[380, 120],
[380, 380],
[120, 380],
])
self.sensor_info = self.sensor_max + np.zeros((self.n_sensor, 3)) # n sensors, (distance, end_x, end_y)
def step(self, action):
if self.is_discrete_action:
action = self.actions[action]
else:
action = np.clip(action, *self.action_bound)[0]
self.car_info[2] += action * np.pi/30 # max r = 6 degree
self.car_info[:2] = self.car_info[:2] + \
self.speed * self.dt * np.array([np.cos(self.car_info[2]), np.sin(self.car_info[2])])
self._update_sensor()
s = self._get_state()
r = -1 if self.terminal else 0
return s, r, self.terminal
def reset(self):
self.terminal = False
self.car_info[:3] = np.array([*self.start_point, -np.pi/2])
self._update_sensor()
return self._get_state()
def render(self):
if self.viewer is None:
self.viewer = Viewer(*self.viewer_xy, self.car_info, self.sensor_info, self.obstacle_coords)
self.viewer.render()
def sample_action(self):
if self.is_discrete_action:
a = np.random.choice(list(range(3)))
else:
a = np.random.uniform(*self.action_bound, size=self.action_dim)
return a
def set_fps(self, fps=30):
pyglet.clock.set_fps_limit(fps)
def _get_state(self):
s = self.sensor_info[:, 0].flatten()/self.sensor_max
return s
def _update_sensor(self):
cx, cy, rotation = self.car_info[:3]
n_sensors = len(self.sensor_info)
sensor_theta = np.linspace(-np.pi / 2, np.pi / 2, n_sensors)
xs = cx + (np.zeros((n_sensors, ))+self.sensor_max) * np.cos(sensor_theta)
ys = cy + (np.zeros((n_sensors, ))+self.sensor_max) * np.sin(sensor_theta)
xys = np.array([[x, y] for x, y in zip(xs, ys)]) # shape (5 sensors, 2)
# sensors
tmp_x = xys[:, 0] - cx
tmp_y = xys[:, 1] - cy
# apply rotation
rotated_x = tmp_x * np.cos(rotation) - tmp_y * np.sin(rotation)
rotated_y = tmp_x * np.sin(rotation) + tmp_y * np.cos(rotation)
# rotated x y
self.sensor_info[:, -2:] = np.vstack([rotated_x+cx, rotated_y+cy]).T
q = np.array([cx, cy])
for si in range(len(self.sensor_info)):
s = self.sensor_info[si, -2:] - q
possible_sensor_distance = [self.sensor_max]
possible_intersections = [self.sensor_info[si, -2:]]
# obstacle collision
for oi in range(len(self.obstacle_coords)):
p = self.obstacle_coords[oi]
r = self.obstacle_coords[(oi + 1) % len(self.obstacle_coords)] - self.obstacle_coords[oi]
if np.cross(r, s) != 0: # may collision
t = np.cross((q - p), s) / np.cross(r, s)
u = np.cross((q - p), r) / np.cross(r, s)
if 0 <= t <= 1 and 0 <= u <= 1:
intersection = q + u * s
possible_intersections.append(intersection)
possible_sensor_distance.append(np.linalg.norm(u*s))
# window collision
win_coord = np.array([
[0, 0],
[self.viewer_xy[0], 0],
[*self.viewer_xy],
[0, self.viewer_xy[1]],
[0, 0],
])
for oi in range(4):
p = win_coord[oi]
r = win_coord[(oi + 1) % len(win_coord)] - win_coord[oi]
if np.cross(r, s) != 0: # may collision
t = np.cross((q - p), s) / np.cross(r, s)
u = np.cross((q - p), r) / np.cross(r, s)
if 0 <= t <= 1 and 0 <= u <= 1:
intersection = p + t * r
possible_intersections.append(intersection)
possible_sensor_distance.append(np.linalg.norm(intersection - q))
distance = np.min(possible_sensor_distance)
distance_index = np.argmin(possible_sensor_distance)
self.sensor_info[si, 0] = distance
self.sensor_info[si, -2:] = possible_intersections[distance_index]
if distance < self.car_info[-1]/2:
self.terminal = True
class Viewer(pyglet.window.Window):
color = {
'background': [1]*3 + [1]
}
fps_display = pyglet.clock.ClockDisplay()
bar_thc = 5
def __init__(self, width, height, car_info, sensor_info, obstacle_coords):
super(Viewer, self).__init__(width, height, resizable=False, caption='2D car', vsync=False) # vsync=False to not use the monitor FPS
self.set_location(x=80, y=10)
pyglet.gl.glClearColor(*self.color['background'])
self.car_info = car_info
self.sensor_info = sensor_info
self.batch = pyglet.graphics.Batch()
background = pyglet.graphics.OrderedGroup(0)
foreground = pyglet.graphics.OrderedGroup(1)
self.sensors = []
line_coord = [0, 0] * 2
c = (73, 73, 73) * 2
for i in range(len(self.sensor_info)):
self.sensors.append(self.batch.add(2, pyglet.gl.GL_LINES, foreground, ('v2f', line_coord), ('c3B', c)))
car_box = [0, 0] * 4
c = (249, 86, 86) * 4
self.car = self.batch.add(4, pyglet.gl.GL_QUADS, foreground, ('v2f', car_box), ('c3B', c))
c = (134, 181, 244) * 4
self.obstacle = self.batch.add(4, pyglet.gl.GL_QUADS, background, ('v2f', obstacle_coords.flatten()), ('c3B', c))
def render(self):
pyglet.clock.tick()
self._update()
self.switch_to()
self.dispatch_events()
self.dispatch_event('on_draw')
self.flip()
def on_draw(self):
self.clear()
self.batch.draw()
# self.fps_display.draw()
def _update(self):
cx, cy, r, w, l = self.car_info
# sensors
for i, sensor in enumerate(self.sensors):
sensor.vertices = [cx, cy, *self.sensor_info[i, -2:]]
# car
xys = [
[cx + l / 2, cy + w / 2],
[cx - l / 2, cy + w / 2],
[cx - l / 2, cy - w / 2],
[cx + l / 2, cy - w / 2],
]
r_xys = []
for x, y in xys:
tempX = x - cx
tempY = y - cy
# apply rotation
rotatedX = tempX * np.cos(r) - tempY * np.sin(r)
rotatedY = tempX * np.sin(r) + tempY * np.cos(r)
# rotated x y
x = rotatedX + cx
y = rotatedY + cy
r_xys += [x, y]
self.car.vertices = r_xys
if __name__ == '__main__':
np.random.seed(1)
env = CarEnv()
env.set_fps(30)
for ep in range(20):
s = env.reset()
# for t in range(100):
while True:
env.render()
s, r, done = env.step(env.sample_action())
if done:
break
================================================
FILE: experiments/2D_car/collision.py
================================================
import numpy as np
def intersection():
p = np.array([0, 0])
r = np.array([1, 1])
q = np.array([0.1, 0.1])
s = np.array([.1, .1])
if np.cross(r, s) == 0 and np.cross((q-p), r) == 0: # collinear
# t0 = (q − p) · r / (r · r)
# t1 = (q + s − p) · r / (r · r) = t0 + s · r / (r · r)
t0 = np.dot(q-p, r)/np.dot(r, r)
t1 = t0 + np.dot(s, r)/np.dot(r, r)
print(t1, t0)
if ((np.dot(s, r) > 0) and (0 <= t1 - t0 <= 1)) or ((np.dot(s, r) <= 0) and (0 <= t0 - t1 <= 1)):
print('collinear and overlapping, q_s in p_r')
else:
print('collinear and disjoint')
elif np.cross(r, s) == 0 and np.cross((q-p), r) != 0: # parallel r × s = 0 and (q − p) × r ≠ 0,
print('parallel')
else:
t = np.cross((q - p), s) / np.cross(r, s)
u = np.cross((q - p), r) / np.cross(r, s)
if 0 <= t <= 1 and 0 <= u <= 1:
# If r × s ≠ 0 and 0 ≤ t ≤ 1 and 0 ≤ u ≤ 1, the two line segments meet at the point p + t r = q + u s
print('intersection: ', p + t*r)
else:
print('not parallel and not intersect')
def point2segment():
p = np.array([-1, 1]) # coordination of point
a = np.array([0, 1]) # coordination of line segment end 1
b = np.array([1, 0]) # coordination of line segment end 2
ab = b-a # line ab
ap = p-a
distance = np.abs(np.cross(ab, ap)/np.linalg.norm(ab)) # d = (AB x AC)/|AB|
print(distance)
# angle Cos(θ) = A dot B /(|A||B|)
bp = p-b
cosTheta1 = np.dot(ap, ab) / (np.linalg.norm(ap) * np.linalg.norm(ab))
theta1 = np.arccos(cosTheta1)
cosTheta2 = np.dot(bp, ab) / (np.linalg.norm(bp) * np.linalg.norm(ab))
theta2 = np.arccos(cosTheta2)
if np.pi/2 <= (theta1 % (np.pi*2)) <= 3/2 * np.pi:
print('out of a')
elif -np.pi/2 <= (theta2 % (np.pi*2)) <= np.pi/2:
print('out of b')
else:
print('between a and b')
if __name__ == '__main__':
point2segment()
# intersection()
================================================
FILE: experiments/Robot_arm/A3C.py
================================================
"""
Environment is a Robot Arm. The arm tries to get to the blue point.
The environment will return a geographic (distance) information for the arm to learn.
The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10.
You can train this RL by using LOAD = False, after training, this model will be store in the a local folder.
Using LOAD = True to reload the trained model for playing.
You can customize this script in a way you want.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Requirement:
pyglet >= 1.2.4
numpy >= 1.12.1
tensorflow >= 1.0.1
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
from arm_env import ArmEnv
# np.random.seed(1)
# tf.set_random_seed(1)
MAX_GLOBAL_EP = 2000
MAX_EP_STEP = 300
UPDATE_GLOBAL_ITER = 5
N_WORKERS = multiprocessing.cpu_count()
LR_A = 1e-4 # learning rate for actor
LR_C = 2e-4 # learning rate for critic
GAMMA = 0.9 # reward discount
MODE = ['easy', 'hard']
n_model = 1
GLOBAL_NET_SCOPE = 'Global_Net'
ENTROPY_BETA = 0.01
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = ArmEnv(mode=MODE[n_model])
N_S = env.state_dim
N_A = env.action_dim
A_BOUND = env.action_bound
del env
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self._build_net()
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
mu, sigma, self.v = self._build_net()
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('wrap_a_out'):
self.test = sigma[0]
mu, sigma = mu * A_BOUND[1], sigma + 1e-5
normal_dist = tf.contrib.distributions.Normal(mu, sigma)
with tf.name_scope('a_loss'):
log_prob = normal_dist.log_prob(self.a_his)
exp_v = log_prob * td
entropy = normal_dist.entropy() # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('choose_a'): # use local params to choose action
self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND)
with tf.name_scope('local_grad'):
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self):
w_init = tf.contrib.layers.xavier_initializer()
with tf.variable_scope('actor'):
l_a = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='la')
l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2')
mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
with tf.variable_scope('critic'):
l_c = tf.layers.dense(self.s, 400, tf.nn.relu6, kernel_initializer=w_init, name='lc')
l_c = tf.layers.dense(l_c, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc2')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
return mu, sigma, v
def update_global(self, feed_dict): # run by a local
_, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net
return t
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s): # run by a local
s = s[np.newaxis, :]
return SESS.run(self.A, {self.s: s})[0]
class Worker(object):
def __init__(self, name, globalAC):
self.env = ArmEnv(mode=MODE[n_model])
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
for ep_t in range(MAX_EP_STEP):
if self.name == 'W_0':
self.env.render()
a = self.AC.choose_action(s)
s_, r, done = self.env.step(a)
if ep_t == MAX_EP_STEP - 1: done = True
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
}
test = self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
s = s_
total_step += 1
if done:
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
"| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
'| Var:', test,
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
================================================
FILE: experiments/Robot_arm/DDPG.py
================================================
"""
Environment is a Robot Arm. The arm tries to get to the blue point.
The environment will return a geographic (distance) information for the arm to learn.
The far away from blue point the less reward; touch blue r+=1; stop at blue for a while then get r=+10.
You can train this RL by using LOAD = False, after training, this model will be store in the a local folder.
Using LOAD = True to reload the trained model for playing.
You can customize this script in a way you want.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Requirement:
pyglet >= 1.2.4
numpy >= 1.12.1
tensorflow >= 1.0.1
"""
import tensorflow as tf
import numpy as np
import os
import shutil
from arm_env import ArmEnv
np.random.seed(1)
tf.set_random_seed(1)
MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4 # learning rate for actor
LR_C = 1e-4 # learning rate for critic
GAMMA = 0.9 # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1
env = ArmEnv(mode=MODE[n_model])
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound
# all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Actor'):
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True)
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False)
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
self.replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
def _build_net(self, s, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.contrib.layers.xavier_initializer()
init_b = tf.constant_initializer(0.001)
net = tf.layers.dense(s, 200, activation=tf.nn.relu6,
kernel_initializer=init_w, bias_initializer=init_b, name='l1',
trainable=trainable)
net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
kernel_initializer=init_w, bias_initializer=init_b, name='l2',
trainable=trainable)
net = tf.layers.dense(net, 10, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l3',
trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a
def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run(self.replace)
self.t_replace_counter += 1
def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action
def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
with tf.variable_scope('A_train'):
opt = tf.train.RMSPropOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))
class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = a
self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_
with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(tf.squared_difference(self.target_q, self.q))
with tf.variable_scope('C_train'):
self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim)
self.replace = [tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)]
def _build_net(self, s, a, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.contrib.layers.xavier_initializer()
init_b = tf.constant_initializer(0.01)
with tf.variable_scope('l1'):
n_l1 = 200
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu6(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
net = tf.layers.dense(net, 200, activation=tf.nn.relu6,
kernel_initializer=init_w, bias_initializer=init_b, name='l2',
trainable=trainable)
net = tf.layers.dense(net, 10, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l3',
trainable=trainable)
with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q
def learn(self, s, a, r, s_):
self.sess.run(self.train_op, feed_dict={S: s, self.a: a, R: r, S_: s_})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run(self.replace)
self.t_replace_counter += 1
class Memory(object):
def __init__(self, capacity, dims):
self.capacity = capacity
self.data = np.zeros((capacity, dims))
self.pointer = 0
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % self.capacity # replace the old memory with new memory
self.data[index, :] = transition
self.pointer += 1
def sample(self, n):
assert self.pointer >= self.capacity, 'Memory has not been fulfilled'
indices = np.random.choice(self.capacity, size=n)
return self.data[indices, :]
sess = tf.Session()
# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)
M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)
saver = tf.train.Saver()
path = './'+MODE[n_model]
if LOAD:
saver.restore(sess, tf.train.latest_checkpoint(path))
else:
sess.run(tf.global_variables_initializer())
def train():
var = 2. # control exploration
for ep in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
for t in range(MAX_EP_STEPS):
# while True:
if RENDER:
env.render()
# Added exploration noise
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), *ACTION_BOUND) # add randomness to action selection for exploration
s_, r, done = env.step(a)
M.store_transition(s, a, r, s_)
if M.pointer > MEMORY_CAPACITY:
var = max([var*.9999, VAR_MIN]) # decay the action randomness
b_M = M.sample(BATCH_SIZE)
b_s = b_M[:, :STATE_DIM]
b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
b_s_ = b_M[:, -STATE_DIM:]
critic.learn(b_s, b_a, b_r, b_s_)
actor.learn(b_s)
s = s_
ep_reward += r
if t == MAX_EP_STEPS-1 or done:
# if done:
result = '| done' if done else '| ----'
print('Ep:', ep,
result,
'| R: %i' % int(ep_reward),
'| Explore: %.2f' % var,
)
break
if os.path.isdir(path): shutil.rmtree(path)
os.mkdir(path)
ckpt_path = os.path.join('./'+MODE[n_model], 'DDPG.ckpt')
save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
print("\nSave Model %s\n" % save_path)
def eval():
env.set_fps(30)
s = env.reset()
while True:
if RENDER:
env.render()
a = actor.choose_action(s)
s_, r, done = env.step(a)
s = s_
if __name__ == '__main__':
if LOAD:
eval()
else:
train()
================================================
FILE: experiments/Robot_arm/DPPO.py
================================================
"""
A simple version of OpenAI's Proximal Policy Optimization (PPO). [http://adsabs.harvard.edu/abs/2017arXiv170706347S]
Distributing workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
Restart workers once PPO is updated.
The global PPO updating rule is adopted from DeepMind's paper (DPPO):
Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [http://adsabs.harvard.edu/abs/2017arXiv170702286H]
View more on my tutorial website: https://morvanzhou.github.io/tutorials
Dependencies:
tensorflow r1.2
gym 0.9.2
"""
import tensorflow as tf
from tensorflow.contrib.distributions import Normal
import numpy as np
import matplotlib.pyplot as plt
import threading, queue
from arm_env import ArmEnv
EP_MAX = 2000
EP_LEN = 300
N_WORKER = 4 # parallel workers
GAMMA = 0.9 # reward discount factor
A_LR = 0.0001 # learning rate for actor
C_LR = 0.0005 # learning rate for critic
MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
UPDATE_STEP = 5 # loop update operation n-steps
EPSILON = 0.2 # Clipped surrogate objective
MODE = ['easy', 'hard']
n_model = 1
env = ArmEnv(mode=MODE[n_model])
S_DIM = env.state_dim
A_DIM = env.action_dim
A_BOUND = env.action_bound[1]
class PPO(object):
def __init__(self):
self.sess = tf.Session()
self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
# critic
l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
self.v = tf.layers.dense(l1, 1)
self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
self.advantage = self.tfdc_r - self.v
self.closs = tf.reduce_mean(tf.square(self.advantage))
self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
# actor
pi, pi_params = self._build_anet('pi', trainable=True)
oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
self.sample_op = tf.squeeze(pi.sample(1), axis=0) # choosing action
self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
# ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
surr = ratio * self.tfadv # surrogate loss
self.aloss = -tf.reduce_mean(tf.minimum(
surr,
tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))
self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
self.sess.run(tf.global_variables_initializer())
def update(self):
global GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
if GLOBAL_EP < EP_MAX:
UPDATE_EVENT.wait() # wait until get batch of data
self.sess.run(self.update_oldpi_op) # old pi to pi
data = [QUEUE.get() for _ in range(QUEUE.qsize())]
data = np.vstack(data)
s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
[self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
[self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
UPDATE_EVENT.clear() # updating finished
GLOBAL_UPDATE_COUNTER = 0 # reset counter
ROLLING_EVENT.set() # set roll-out available
def _build_anet(self, name, trainable):
with tf.variable_scope(name):
l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
mu = A_BOUND * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
norm_dist = Normal(loc=mu, scale=sigma)
params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
return norm_dist, params
def choose_action(self, s):
s = s[np.newaxis, :]
a = self.sess.run(self.sample_op, {self.tfs: s})[0]
return np.clip(a, -2, 2)
def get_v(self, s):
if s.ndim < 2: s = s[np.newaxis, :]
return self.sess.run(self.v, {self.tfs: s})[0, 0]
class Worker(object):
def __init__(self, wid):
self.wid = wid
self.env = ArmEnv(mode=MODE[n_model])
self.ppo = GLOBAL_PPO
def work(self):
global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
while not COORD.should_stop():
s = self.env.reset()
ep_r = 0
buffer_s, buffer_a, buffer_r = [], [], []
for t in range(EP_LEN):
if not ROLLING_EVENT.is_set(): # while global PPO is updating
ROLLING_EVENT.wait() # wait until PPO is updated
buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer
a = self.ppo.choose_action(s)
s_, r, done = self.env.step(a)
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r) # normalize reward, find to be useful
s = s_
ep_r += r
GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size
if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
v_s_ = self.ppo.get_v(s_)
discounted_r = [] # compute discounted reward
for r in buffer_r[::-1]:
v_s_ = r + GAMMA * v_s_
discounted_r.append(v_s_)
discounted_r.reverse()
bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
buffer_s, buffer_a, buffer_r = [], [], []
QUEUE.put(np.hstack((bs, ba, br)))
if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
ROLLING_EVENT.clear() # stop collecting data
UPDATE_EVENT.set() # globalPPO update
if GLOBAL_EP >= EP_MAX: # stop training
COORD.request_stop()
break
# record reward changes, plot later
if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
GLOBAL_EP += 1
print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)
if __name__ == '__main__':
GLOBAL_PPO = PPO()
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
UPDATE_EVENT.clear() # no update now
ROLLING_EVENT.set() # start to roll out
workers = [Worker(wid=i) for i in range(N_WORKER)]
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()
threads = []
for worker in workers: # worker threads
t = threading.Thread(target=worker.work, args=())
t.start()
threads.append(t)
# add a PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update,))
threads[-1].start()
COORD.join(threads)
# plot reward change and testing
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
env.set_fps(30)
while True:
s = env.reset()
for t in range(400):
env.render()
s = env.step(GLOBAL_PPO.choose_action(s))[0]
================================================
FILE: experiments/Robot_arm/arm_env.py
================================================
"""
Environment for Robot Arm.
You can customize this script in a way you want.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Requirement:
pyglet >= 1.2.4
numpy >= 1.12.1
"""
import numpy as np
import pyglet
pyglet.clock.set_fps_limit(10000)
class ArmEnv(object):
action_bound = [-1, 1]
action_dim = 2
state_dim = 7
dt = .1 # refresh rate
arm1l = 100
arm2l = 100
viewer = None
viewer_xy = (400, 400)
get_point = False
mouse_in = np.array([False])
point_l = 15
grab_counter = 0
def __init__(self, mode='easy'):
# node1 (l, d_rad, x, y),
# node2 (l, d_rad, x, y)
self.mode = mode
self.arm_info = np.zeros((2, 4))
self.arm_info[0, 0] = self.arm1l
self.arm_info[1, 0] = self.arm2l
self.point_info = np.array([250, 303])
self.point_info_init = self.point_info.copy()
self.center_coord = np.array(self.viewer_xy)/2
def step(self, action):
# action = (node1 angular v, node2 angular v)
action = np.clip(action, *self.action_bound)
self.arm_info[:, 1] += action * self.dt
self.arm_info[:, 1] %= np.pi * 2
arm1rad = self.arm_info[0, 1]
arm2rad = self.arm_info[1, 1]
arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1)
self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2)
s, arm2_distance = self._get_state()
r = self._r_func(arm2_distance)
return s, r, self.get_point
def reset(self):
self.get_point = False
self.grab_counter = 0
if self.mode == 'hard':
pxy = np.clip(np.random.rand(2) * self.viewer_xy[0], 100, 300)
self.point_info[:] = pxy
else:
arm1rad, arm2rad = np.random.rand(2) * np.pi * 2
self.arm_info[0, 1] = arm1rad
self.arm_info[1, 1] = arm2rad
arm1dx_dy = np.array([self.arm_info[0, 0] * np.cos(arm1rad), self.arm_info[0, 0] * np.sin(arm1rad)])
arm2dx_dy = np.array([self.arm_info[1, 0] * np.cos(arm2rad), self.arm_info[1, 0] * np.sin(arm2rad)])
self.arm_info[0, 2:4] = self.center_coord + arm1dx_dy # (x1, y1)
self.arm_info[1, 2:4] = self.arm_info[0, 2:4] + arm2dx_dy # (x2, y2)
self.point_info[:] = self.point_info_init
return self._get_state()[0]
def render(self):
if self.viewer is None:
self.viewer = Viewer(*self.viewer_xy, self.arm_info, self.point_info, self.point_l, self.mouse_in)
self.viewer.render()
def sample_action(self):
return np.random.uniform(*self.action_bound, size=self.action_dim)
def set_fps(self, fps=30):
pyglet.clock.set_fps_limit(fps)
def _get_state(self):
# return the distance (dx, dy) between arm finger point with blue point
arm_end = self.arm_info[:, 2:4]
t_arms = np.ravel(arm_end - self.point_info)
center_dis = (self.center_coord - self.point_info)/200
in_point = 1 if self.grab_counter > 0 else 0
return np.hstack([in_point, t_arms/200, center_dis,
# arm1_distance_p, arm1_distance_b,
]), t_arms[-2:]
def _r_func(self, distance):
t = 50
abs_distance = np.sqrt(np.sum(np.square(distance)))
r = -abs_distance/200
if abs_distance < self.point_l and (not self.get_point):
r += 1.
self.grab_counter += 1
if self.grab_counter > t:
r += 10.
self.get_point = True
elif abs_distance > self.point_l:
self.grab_counter = 0
self.get_point = False
return r
class Viewer(pyglet.window.Window):
color = {
'background': [1]*3 + [1]
}
fps_display = pyglet.clock.ClockDisplay()
bar_thc = 5
def __init__(self, width, height, arm_info, point_info, point_l, mouse_in):
super(Viewer, self).__init__(width, height, resizable=False, caption='Arm', vsync=False) # vsync=False to not use the monitor FPS
self.set_location(x=80, y=10)
pyglet.gl.glClearColor(*self.color['background'])
self.arm_info = arm_info
self.point_info = point_info
self.mouse_in = mouse_in
self.point_l = point_l
self.center_coord = np.array((min(width, height)/2, ) * 2)
self.batch = pyglet.graphics.Batch()
arm1_box, arm2_box, point_box = [0]*8, [0]*8, [0]*8
c1, c2, c3 = (249, 86, 86)*4, (86, 109, 249)*4, (249, 39, 65)*4
self.point = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', point_box), ('c3B', c2))
self.arm1 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm1_box), ('c3B', c1))
self.arm2 = self.batch.add(4, pyglet.gl.GL_QUADS, None, ('v2f', arm2_box), ('c3B', c1))
def render(self):
pyglet.clock.tick()
self._update_arm()
self.switch_to()
self.dispatch_events()
self.dispatch_event('on_draw')
self.flip()
def on_draw(self):
self.clear()
self.batch.draw()
# self.fps_display.draw()
def _update_arm(self):
point_l = self.point_l
point_box = (self.point_info[0] - point_l, self.point_info[1] - point_l,
self.point_info[0] + point_l, self.point_info[1] - point_l,
self.point_info[0] + point_l, self.point_info[1] + point_l,
self.point_info[0] - point_l, self.point_info[1] + point_l)
self.point.vertices = point_box
arm1_coord = (*self.center_coord, *(self.arm_info[0, 2:4])) # (x0, y0, x1, y1)
arm2_coord = (*(self.arm_info[0, 2:4]), *(self.arm_info[1, 2:4])) # (x1, y1, x2, y2)
arm1_thick_rad = np.pi / 2 - self.arm_info[0, 1]
x01, y01 = arm1_coord[0] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] + np.sin(
arm1_thick_rad) * self.bar_thc
x02, y02 = arm1_coord[0] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[1] - np.sin(
arm1_thick_rad) * self.bar_thc
x11, y11 = arm1_coord[2] + np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] - np.sin(
arm1_thick_rad) * self.bar_thc
x12, y12 = arm1_coord[2] - np.cos(arm1_thick_rad) * self.bar_thc, arm1_coord[3] + np.sin(
arm1_thick_rad) * self.bar_thc
arm1_box = (x01, y01, x02, y02, x11, y11, x12, y12)
arm2_thick_rad = np.pi / 2 - self.arm_info[1, 1]
x11_, y11_ = arm2_coord[0] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] - np.sin(
arm2_thick_rad) * self.bar_thc
x12_, y12_ = arm2_coord[0] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[1] + np.sin(
arm2_thick_rad) * self.bar_thc
x21, y21 = arm2_coord[2] - np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] + np.sin(
arm2_thick_rad) * self.bar_thc
x22, y22 = arm2_coord[2] + np.cos(arm2_thick_rad) * self.bar_thc, arm2_coord[3] - np.sin(
arm2_thick_rad) * self.bar_thc
arm2_box = (x11_, y11_, x12_, y12_, x21, y21, x22, y22)
self.arm1.vertices = arm1_box
self.arm2.vertices = arm2_box
def on_key_press(self, symbol, modifiers):
if symbol == pyglet.window.key.UP:
self.arm_info[0, 1] += .1
print(self.arm_info[:, 2:4] - self.point_info)
elif symbol == pyglet.window.key.DOWN:
self.arm_info[0, 1] -= .1
print(self.arm_info[:, 2:4] - self.point_info)
elif symbol == pyglet.window.key.LEFT:
self.arm_info[1, 1] += .1
print(self.arm_info[:, 2:4] - self.point_info)
elif symbol == pyglet.window.key.RIGHT:
self.arm_info[1, 1] -= .1
print(self.arm_info[:, 2:4] - self.point_info)
elif symbol == pyglet.window.key.Q:
pyglet.clock.set_fps_limit(1000)
elif symbol == pyglet.window.key.A:
pyglet.clock.set_fps_limit(30)
def on_mouse_motion(self, x, y, dx, dy):
self.point_info[:] = [x, y]
def on_mouse_enter(self, x, y):
self.mouse_in[0] = True
def on_mouse_leave(self, x, y):
self.mouse_in[0] = False
================================================
FILE: experiments/Solve_BipedalWalker/A3C.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning.
The BipedalWalker example.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.8.0
gym 0.10.5
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
GAME = 'BipedalWalker-v2'
OUTPUT_GRAPH = False
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 8000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.99
ENTROPY_BETA = 0.005
LR_A = 0.00005 # learning rate for actor
LR_C = 0.0001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
del env
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self._build_net()
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
mu, sigma, self.v = self._build_net()
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('wrap_a_out'):
self.test = sigma[0]
mu, sigma = mu * A_BOUND[1], sigma + 1e-5
normal_dist = tf.contrib.distributions.Normal(mu, sigma)
with tf.name_scope('a_loss'):
log_prob = normal_dist.log_prob(self.a_his)
exp_v = log_prob * td
entropy = normal_dist.entropy() # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('choose_a'): # use local params to choose action
self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1)), *A_BOUND)
with tf.name_scope('local_grad'):
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self):
w_init = tf.contrib.layers.xavier_initializer()
with tf.variable_scope('actor'):
l_a = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='la')
l_a = tf.layers.dense(l_a, 300, tf.nn.relu6, kernel_initializer=w_init, name='la2')
mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma')
with tf.variable_scope('critic'):
l_c = tf.layers.dense(self.s, 500, tf.nn.relu6, kernel_initializer=w_init, name='lc')
l_c = tf.layers.dense(l_c, 300, tf.nn.relu6, kernel_initializer=w_init, name='lc2')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
return mu, sigma, v
def update_global(self, feed_dict): # run by a local
_, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net
return t
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s): # run by a local
s = s[np.newaxis, :]
return SESS.run(self.A, {self.s: s})
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME)
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
while True:
if self.name == 'W_0' and total_step % 30 == 0:
self.env.render()
a = self.AC.choose_action(s)
s_, r, done, info = self.env.step(a)
if r == -100: r = -2
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
}
test = self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
s = s_
total_step += 1
if done:
achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------'
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
achieve,
"| Pos: %i" % self.env.unwrapped.hull.position[0],
"| RR: %.1f" % GLOBAL_RUNNING_R[-1],
'| EpR: %.1f' % ep_r,
'| var:', test,
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
import matplotlib.pyplot as plt
plt.plot(GLOBAL_RUNNING_R)
plt.xlabel('episode')
plt.ylabel('global running reward')
plt.show()
================================================
FILE: experiments/Solve_BipedalWalker/A3C_rnn.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C), Reinforcement Learning.
The BipedalWalker example.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.8.0
gym 0.10.5
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
GAME = 'BipedalWalker-v2'
OUTPUT_GRAPH = False
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 8000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 10
GAMMA = 0.9
ENTROPY_BETA = 0.001
LR_A = 0.00002 # learning rate for actor
LR_C = 0.0001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.shape[0]
A_BOUND = [env.action_space.low, env.action_space.high]
del env
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self._build_net()
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
mu, sigma, self.v = self._build_net()
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('wrap_a_out'):
self.test = sigma[0]
mu, sigma = mu * A_BOUND[1], sigma + 1e-5
normal_dist = tf.contrib.distributions.Normal(mu, sigma)
with tf.name_scope('a_loss'):
log_prob = normal_dist.log_prob(self.a_his)
exp_v = log_prob * td
entropy = normal_dist.entropy() # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('choose_a'): # use local params to choose action
self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1)), A_BOUND[0], A_BOUND[1])
with tf.name_scope('local_grad'):
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in
zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in
zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self):
w_init = tf.random_normal_initializer(0., .1)
with tf.variable_scope('critic'): # only critic controls the rnn update
cell_size = 126
s = tf.expand_dims(self.s, axis=1,
name='timely_input') # [time_step, feature] => [time_step, batch, feature]
rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
outputs, self.final_state = tf.nn.dynamic_rnn(
cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
l_c = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
with tf.variable_scope('actor'): # state representation is based on critic
cell_out = tf.stop_gradient(cell_out, name='c_cell_out') # from what critic think it is
l_a = tf.layers.dense(cell_out, 512, tf.nn.relu6, kernel_initializer=w_init, name='la')
mu = tf.layers.dense(l_a, N_A, tf.nn.tanh, kernel_initializer=w_init, name='mu')
sigma = tf.layers.dense(l_a, N_A, tf.nn.softplus, kernel_initializer=w_init, name='sigma') # restrict variance
return mu, sigma, v
def update_global(self, feed_dict): # run by a local
_, _, t = SESS.run([self.update_a_op, self.update_c_op, self.test], feed_dict) # local grads applies to global net
return t
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s, cell_state): # run by a local
s = s[np.newaxis, :]
a, cell_state = SESS.run([self.A, self.final_state], {self.s: s, self.init_state: cell_state})
return a, cell_state
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME)
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
keep_state = rnn_state.copy() # keep rnn state for updating global net
while True:
if self.name == 'W_0' and total_step % 30 == 0:
self.env.render()
a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
s_, r, done, info = self.env.step(a)
if r == -100: r = -2
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[
0, 0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(
buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
self.AC.init_state: keep_state,
}
test = self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
s = s_
rnn_state = rnn_state_ # renew rnn state
total_step += 1
if done:
achieve = '| Achieve' if self.env.unwrapped.hull.position[0] >= 88 else '| -------'
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r)
print(
self.name,
"Ep:", GLOBAL_EP,
achieve,
"| Pos: %i" % self.env.unwrapped.hull.position[0],
"| RR: %.1f" % GLOBAL_RUNNING_R[-1],
'| EpR: %.1f' % ep_r,
'| var:', test,
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA', decay=0.95)
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC', decay=0.95)
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
if os.path.exists(LOG_DIR):
shutil.rmtree(LOG_DIR)
tf.summary.FileWriter(LOG_DIR, SESS.graph)
worker_threads = []
for worker in workers:
t = threading.Thread(target=worker.work)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
import matplotlib.pyplot as plt
plt.plot(GLOBAL_RUNNING_R)
plt.xlabel('episode')
plt.ylabel('global running reward')
plt.show()
================================================
FILE: experiments/Solve_BipedalWalker/DDPG.py
================================================
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
np.random.seed(1)
tf.set_random_seed(1)
MAX_EPISODES = 2000
LR_A = 0.0005 # learning rate for actor
LR_C = 0.0005 # learning rate for critic
GAMMA = 0.999 # reward discount
REPLACE_ITER_A = 1700
REPLACE_ITER_C = 1500
MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
DISPLAY_THRESHOLD = 100 # display until the running reward > 100
DATA_PATH = './data'
LOAD_MODEL = False
SAVE_MODEL_ITER = 100000
RENDER = False
OUTPUT_GRAPH = False
ENV_NAME = 'BipedalWalker-v2'
GLOBAL_STEP = tf.Variable(0, trainable=False)
INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)
LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)
END_POINT = (200 - 10) * (14/30) # from game
env = gym.make(ENV_NAME)
env.seed(1)
STATE_DIM = env.observation_space.shape[0] # 24
ACTION_DIM = env.action_space.shape[0] # 4
ACTION_BOUND = env.action_space.high # [1, 1, 1, 1]
# all placeholder for tf
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
############################### Actor ####################################
class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Actor'):
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True)
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False)
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
def _build_net(self, s, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.01)
init_b = tf.constant_initializer(0.01)
net = tf.layers.dense(s, 500, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
net = tf.layers.dense(net, 200, activation=tf.nn.relu,
kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a') # Scale output to -action_bound to action_bound
return scaled_a
def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
self.t_replace_counter += 1
def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action
def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
# ys = policy;
# xs = policy's parameters;
# self.a_grads = the gradients of the policy to get more Q
# tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
with tf.variable_scope('A_train'):
opt = tf.train.RMSPropOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP)
############################### Critic ####################################
class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = a
self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
with tf.variable_scope('target_q'):
self.target_q = R + self.gamma * self.q_
with tf.variable_scope('abs_TD'):
self.abs_td = tf.abs(self.target_q - self.q)
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q))
with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)
with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim)
def _build_net(self, s, a, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.01)
init_b = tf.constant_initializer(0.01)
with tf.variable_scope('l1'):
n_l1 = 700
# combine the action and states together in this way
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
with tf.variable_scope('l2'):
net = tf.layers.dense(net, 20, activation=tf.nn.relu, kernel_initializer=init_w,
bias_initializer=init_b, name='l2', trainable=trainable)
with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q
def learn(self, s, a, r, s_, ISW):
_, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, self.a: a, R: r, S_: s_, self.ISWeights: ISW})
if self.t_replace_counter % self.t_replace_iter == 0:
self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
self.t_replace_counter += 1
return abs_td
class SumTree(object):
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Story the data with it priority in tree and data frameworks.
"""
data_pointer = 0
def __init__(self, capacity):
self.capacity = capacity # for all priority values
self.tree = np.zeros(2 * capacity - 1)+1e-5
# [--------------Parent nodes-------------][-------leaves to recode priority-------]
# size: capacity - 1 size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add_new_priority(self, p, data):
leaf_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(leaf_idx, p) # update tree_frame
self.data_pointer += 1
if self.data_pointer >= self.capacity: # replace when exceed the capacity
self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
self._propagate_change(tree_idx, change)
def _propagate_change(self, tree_idx, change):
"""change the sum of priority value in all parent nodes"""
parent_idx = (tree_idx - 1) // 2
self.tree[parent_idx] += change
if parent_idx != 0:
self._propagate_change(parent_idx, change)
def get_leaf(self, lower_bound):
leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound
data_idx = leaf_idx - self.capacity + 1
return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
def _retrieve(self, lower_bound, parent_idx=0):
"""
Tree structure and array storage:
Tree index:
0 -> storing priority sum
/ \
1 2
/ \ / \
3 4 5 6 -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
left_child_idx = 2 * parent_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= len(self.tree): # end search when no more child
return parent_idx
if self.tree[left_child_idx] == self.tree[right_child_idx]:
return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
return self._retrieve(lower_bound, left_child_idx)
else:
return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
@property
def root_priority(self):
return self.tree[0] # the root
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
epsilon = 0.001 # small amount to avoid zero priority
alpha = 0.6 # [0~1] convert the importance of TD error to priority
beta = 0.4 # importance-sampling, from initial value increasing to 1
beta_increment_per_sampling = 1e-5 # annealing the bias
abs_err_upper = 1 # for stability refer to paper
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, error, transition):
p = self._get_priority(error)
self.tree.add_new_priority(p, transition)
def prio_sample(self, n):
batch_idx, batch_memory, ISWeights = [], [], []
segment = self.tree.root_priority / n
self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
for i in range(n):
a = segment * i
b = segment * (i + 1)
lower_bound = np.random.uniform(a, b)
while True:
idx, p, data = self.tree.get_leaf(lower_bound)
if type(data) is int:
i -= 1
lower_bound = np.random.uniform(segment * i, segment * (i+1))
else:
break
prob = p / self.tree.root_priority
ISWeights.append(self.tree.capacity * prob)
batch_idx.append(idx)
batch_memory.append(data)
ISWeights = np.vstack(ISWeights)
ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
return batch_idx, np.vstack(batch_memory), ISWeights
def random_sample(self, n):
idx = np.random.randint(0, self.tree.capacity, size=n, dtype=np.int)
return np.vstack(self.tree.data[idx])
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def _get_priority(self, error):
error += self.epsilon # avoid 0
clipped_error = np.clip(error, 0, self.abs_err_upper)
return np.power(clipped_error, self.alpha)
sess = tf.Session()
# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)
M = Memory(MEMORY_CAPACITY)
saver = tf.train.Saver(max_to_keep=100)
if LOAD_MODEL:
all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths
saver.restore(sess, all_ckpt[-1])
else:
if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH)
os.mkdir(DATA_PATH)
sess.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
tf.summary.FileWriter('logs', graph=sess.graph)
var = 3 # control exploration
var_min = 0.01
for i_episode in range(MAX_EPISODES):
# s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.)
s = env.reset()
ep_r = 0
while True:
if RENDER:
env.render()
a = actor.choose_action(s)
a = np.clip(np.random.normal(a, var), -1, 1) # add randomness to action selection for exploration
s_, r, done, _ = env.step(a) # r = total 300+ points up to the far end. If the robot falls, it gets -100.
if r == -100: r = -2
ep_r += r
transition = np.hstack((s, a, [r], s_))
max_p = np.max(M.tree.tree[-M.tree.capacity:])
M.store(max_p, transition)
if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20:
var = max([var*0.9999, var_min]) # decay the action randomness
tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE) # for critic update
b_s = b_M[:, :STATE_DIM]
b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
b_s_ = b_M[:, -STATE_DIM:]
abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
actor.learn(b_s)
for i in range(len(tree_idx)): # update priority
idx = tree_idx[i]
M.update(idx, abs_td[i])
if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False)
print("\nSave Model %s\n" % save_path)
if done:
if "running_r" not in globals():
running_r = ep_r
else:
running_r = 0.95*running_r + 0.05*ep_r
if running_r > DISPLAY_THRESHOLD: RENDER = True
else: RENDER = False
done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----'
print('Episode:', i_episode,
done,
'| Running_r: %i' % int(running_r),
'| Epi_r: %.2f' % ep_r,
'| Exploration: %.3f' % var,
'| Pos: %.i' % int(env.unwrapped.hull.position[0]),
'| LR_A: %.6f' % sess.run(LR_A),
'| LR_C: %.6f' % sess.run(LR_C),
)
break
s = s_
sess.run(INCREASE_GS)
================================================
FILE: experiments/Solve_LunarLander/A3C.py
================================================
"""
Asynchronous Advantage Actor Critic (A3C) with continuous action space, Reinforcement Learning.
The Pendulum example. Convergence promised, but difficult environment, this code hardly converge.
View more on [莫烦Python] : https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""
import multiprocessing
import threading
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import matplotlib.pyplot as plt
GAME = 'LunarLander-v2'
OUTPUT_GRAPH = False
LOG_DIR = './log'
N_WORKERS = multiprocessing.cpu_count()
MAX_GLOBAL_EP = 5000
GLOBAL_NET_SCOPE = 'Global_Net'
UPDATE_GLOBAL_ITER = 5
GAMMA = 0.99
ENTROPY_BETA = 0.001 # not useful in this case
LR_A = 0.0005 # learning rate for actor
LR_C = 0.001 # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0
env = gym.make(GAME)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n
del env
class ACNet(object):
def __init__(self, scope, globalAC=None):
if scope == GLOBAL_NET_SCOPE: # get global network
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self._build_net(N_A)
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
else: # local net, calculate losses
with tf.variable_scope(scope):
self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget')
self.a_prob, self.v = self._build_net(N_A)
td = tf.subtract(self.v_target, self.v, name='TD_error')
with tf.name_scope('c_loss'):
self.c_loss = tf.reduce_mean(tf.square(td))
with tf.name_scope('a_loss'):
log_prob = tf.reduce_sum(tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
exp_v = log_prob * td
entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob), axis=1, keep_dims=True) # encourage exploration
self.exp_v = ENTROPY_BETA * entropy + exp_v
self.a_loss = tf.reduce_mean(-self.exp_v)
with tf.name_scope('local_grad'):
self.a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor')
self.c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic')
self.a_grads = tf.gradients(self.a_loss, self.a_params)
self.c_grads = tf.gradients(self.c_loss, self.c_params)
with tf.name_scope('sync'):
with tf.name_scope('pull'):
self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)]
self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)]
with tf.name_scope('push'):
self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params))
self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params))
def _build_net(self, n_a):
w_init = tf.random_normal_initializer(0., .01)
with tf.variable_scope('critic'):
cell_size = 64
s = tf.expand_dims(self.s, axis=1,
name='timely_input') # [time_step, feature] => [time_step, batch, feature]
rnn_cell = tf.contrib.rnn.BasicRNNCell(cell_size)
self.init_state = rnn_cell.zero_state(batch_size=1, dtype=tf.float32)
outputs, self.final_state = tf.nn.dynamic_rnn(
cell=rnn_cell, inputs=s, initial_state=self.init_state, time_major=True)
cell_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs') # joined state representation
l_c = tf.layers.dense(cell_out, 200, tf.nn.relu6, kernel_initializer=w_init, name='lc')
v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value
with tf.variable_scope('actor'):
cell_out = tf.stop_gradient(cell_out, name='c_cell_out')
l_a = tf.layers.dense(cell_out, 300, tf.nn.relu6, kernel_initializer=w_init, name='la')
a_prob = tf.layers.dense(l_a, n_a, tf.nn.softmax, kernel_initializer=w_init, name='ap')
return a_prob, v
def update_global(self, feed_dict): # run by a local
SESS.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net
def pull_global(self): # run by a local
SESS.run([self.pull_a_params_op, self.pull_c_params_op])
def choose_action(self, s, cell_state): # run by a local
prob_weights, cell_state = SESS.run([self.a_prob, self.final_state], feed_dict={self.s: s[np.newaxis, :],
self.init_state: cell_state})
action = np.random.choice(range(prob_weights.shape[1]),
p=prob_weights.ravel()) # select action w.r.t the actions prob
return action, cell_state
class Worker(object):
def __init__(self, name, globalAC):
self.env = gym.make(GAME)
self.name = name
self.AC = ACNet(name, globalAC)
def work(self):
global GLOBAL_RUNNING_R, GLOBAL_EP
total_step = 1
r_scale = 100
buffer_s, buffer_a, buffer_r = [], [], []
while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
s = self.env.reset()
ep_r = 0
ep_t = 0
rnn_state = SESS.run(self.AC.init_state) # zero rnn state at beginning
keep_state = rnn_state.copy() # keep rnn state for updating global net
while True:
# if self.name == 'W_0' and total_step % 10 == 0:
# self.env.render()
a, rnn_state_ = self.AC.choose_action(s, rnn_state) # get the action and next rnn state
s_, r, done, info = self.env.step(a)
if r == -100: r = -10
ep_r += r
buffer_s.append(s)
buffer_a.append(a)
buffer_r.append(r/r_scale)
if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net
if done:
v_s_ = 0 # terminal
else:
v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :], self.AC.init_state: rnn_state_})[0,0]
buffer_v_target = []
for r in buffer_r[::-1]: # reverse buffer r
v_s_ = r + GAMMA * v_s_
buffer_v_target.append(v_s_)
buffer_v_target.reverse()
buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target)
feed_dict = {
self.AC.s: buffer_s,
self.AC.a_his: buffer_a,
self.AC.v_target: buffer_v_target,
self.AC.init_state: keep_state,
}
self.AC.update_global(feed_dict)
buffer_s, buffer_a, buffer_r = [], [], []
self.AC.pull_global()
keep_state = rnn_state_.copy() # replace the keep_state as the new initial rnn state_
s = s_
total_step += 1
rnn_state = rnn_state_ # renew rnn state
ep_t += 1
if done:
if len(GLOBAL_RUNNING_R) == 0: # record running episode reward
GLOBAL_RUNNING_R.append(ep_r)
else:
GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
if not self.env.unwrapped.lander.awake: solve = '| Landed'
else: solve = '| ------'
print(
self.name,
"Ep:", GLOBAL_EP,
solve,
"| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
)
GLOBAL_EP += 1
break
if __name__ == "__main__":
SESS = tf.Session()
with tf.device("/cpu:0"):
OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params
workers = []
# Create worker
for i in range(N_WORKERS):
i_name = 'W_%i' % i # worker name
workers.append(Worker(i_name, GLOBAL_AC))
COORD = tf.train.Coordinator()
SESS.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
if os.path.exists(LOG_DIR):
shutil.rmtree(LOG_DIR)
tf.summary.FileWriter(LOG_DIR, SESS.graph)
worker_threads = []
for worker in workers:
job = lambda: worker.work()
t = threading.Thread(target=job)
t.start()
worker_threads.append(t)
COORD.join(worker_threads)
plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
plt.xlabel('step')
plt.ylabel('Total moving reward')
plt.show()
================================================
FILE: experiments/Solve_LunarLander/DuelingDQNPrioritizedReplay.py
================================================
"""
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
View more on 莫烦Python: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
"""
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.set_random_seed(1)
class SumTree(object):
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Story the data with it priority in tree and data frameworks.
"""
data_pointer = 0
def __init__(self, capacity):
self.capacity = capacity # for all priority values
self.tree = np.zeros(2 * capacity - 1)
# [--------------Parent nodes-------------][-------leaves to recode priority-------]
# size: capacity - 1 size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add_new_priority(self, p, data):
leaf_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(leaf_idx, p) # update tree_frame
self.data_pointer += 1
if self.data_pointer >= self.capacity: # replace when exceed the capacity
self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
self._propagate_change(tree_idx, change)
def _propagate_change(self, tree_idx, change):
"""change the sum of priority value in all parent nodes"""
parent_idx = (tree_idx - 1) // 2
self.tree[parent_idx] += change
if parent_idx != 0:
self._propagate_change(parent_idx, change)
def get_leaf(self, lower_bound):
leaf_idx = self._retrieve(lower_bound) # search the max leaf priority based on the lower_bound
data_idx = leaf_idx - self.capacity + 1
return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
def _retrieve(self, lower_bound, parent_idx=0):
"""
Tree structure and array storage:
Tree index:
0 -> storing priority sum
/ \
1 2
/ \ / \
3 4 5 6 -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
left_child_idx = 2 * parent_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= len(self.tree): # end search when no more child
return parent_idx
if self.tree[left_child_idx] == self.tree[right_child_idx]:
return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
return self._retrieve(lower_bound, left_child_idx)
else:
return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
@property
def root_priority(self):
return self.tree[0] # the root
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
epsilon = 0.001 # small amount to avoid zero priority
alpha = 0.6 # [0~1] convert the importance of TD error to priority
beta = 0.4 # importance-sampling, from initial value increasing to 1
beta_increment_per_sampling = 1e-4 # annealing the bias
abs_err_upper = 1 # for stability refer to paper
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, error, transition):
p = self._get_priority(error)
self.tree.add_new_priority(p, transition)
def sample(self, n):
batch_idx, batch_memory, ISWeights = [], [], []
segment = self.tree.root_priority / n
self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
for i in range(n):
a = segment * i
b = segment * (i + 1)
lower_bound = np.random.uniform(a, b)
idx, p, data = self.tree.get_leaf(lower_bound)
prob = p / self.tree.root_priority
ISWeights.append(self.tree.capacity * prob)
batch_idx.append(idx)
batch_memory.append(data)
ISWeights = np.vstack(ISWeights)
ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
return batch_idx, np.vstack(batch_memory), ISWeights
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def _get_priority(self, error):
error += self.epsilon # avoid 0
clipped_error = np.clip(error, 0, self.abs_err_upper)
return np.power(clipped_error, self.alpha)
class DuelingDQNPrioritizedReplay:
def __init__(
self,
n_actions,
n_features,
learning_rate=0.005,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=500,
memory_size=10000,
batch_size=32,
e_greedy_increment=None,
hidden=[100, 50],
output_graph=False,
sess=None,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.hidden = hidden
self.epsilon_increment = e_greedy_increment
self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
self.learn_step_counter = 0
self._build_net()
self.memory = Memory(capacity=memory_size)
if sess is None:
self.sess = tf.Session()
self.sess.run(tf.global_variables_initializer())
else:
self.sess = sess
if output_graph:
tf.summary.FileWriter("logs/", self.sess.graph)
self.cost_his = []
def _build_net(self):
def build_layers(s, c_names, w_initializer, b_initializer):
for i, h in enumerate(self.hidden):
if i == 0:
in_units, out_units, inputs = self.n_features, self.hidden[i], s
else:
in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
with tf.variable_scope('l%i' % i):
w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
l = tf.nn.relu(tf.matmul(inputs, w) + b)
with tf.variable_scope('Value'):
w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
self.V = tf.matmul(l, w) + b
with tf.variable_scope('Advantage'):
w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
self.A = tf.matmul(l, w) + b
with tf.variable_scope('Q'):
out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True)) # Q = V(s) + A(s,a)
# with tf.variable_scope('out'):
# w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
# b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
# out = tf.matmul(l, w) + b
return out
# ------------------ build evaluate_net ------------------
self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s') # input
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target') # for calculating loss
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('eval_net'):
c_names, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01) # config of layers
self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
with tf.variable_scope('loss'):
self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1)) # for updating Sumtree
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
# ------------------ build target_net ------------------
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_') # input
with tf.variable_scope('target_net'):
c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
def store_transition(self, s, a, r, s_):
transition = np.hstack((s, [a, r], s_))
max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])
self.memory.store(max_p, transition)
def choose_action(self, observation):
observation = observation[np.newaxis, :]
if np.random.uniform() < self.epsilon:
actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(actions_value)
else:
action = np.random.randint(0, self.n_actions)
return action
def _replace_target_params(self):
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self._replace_target_params()
tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
# double DQN
q_next, q_eval4next = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={self.s_: batch_memory[:, -self.n_features:], # next observation
self.s: batch_memory[:, -self.n_features:]}) # next observation
q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
q_target = q_eval.copy()
batch_index = np.arange(self.batch_size, dtype=np.int32)
eval_act_index = batch_memory[:, self.n_features].astype(int)
reward = batch_memory[:, self.n_features + 1]
max_act4next = np.argmax(q_eval4next,
axis=1) # the action that brings the highest value is evaluated by q_eval
selected_q_next = q_next[batch_index, max_act4next] # Double DQN, select q_next depending on above actions
q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
# q_next, q_eval = self.sess.run(
# [self.q_next, self.q_eval],
# feed_dict={self.s_: batch_memory[:, -self.n_features:],
# self.s: batch_memory[:, :self.n_features]})
#
# q_target = q_eval.copy()
# batch_index = np.arange(self.batch_size, dtype=np.int32)
# eval_act_index = batch_memory[:, self.n_features].astype(int)
# reward = batch_memory[:, self.n_features + 1]
#
# q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
_, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
feed_dict={self.s: batch_memory[:, :self.n_features],
self.q_target: q_target,
self.ISWeights: ISWeights})
for i in range(len(tree_idx)): # update priority
idx = tree_idx[i]
self.memory.update(idx, abs_errors[i])
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
================================================
FILE: experiments/Solve_LunarLander/run_LunarLander.py
================================================
"""
Deep Q network,
LunarLander-v2 example
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import gym
from gym import wrappers
from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
env = gym.make('LunarLander-v2')
# env = env.unwrapped
env.seed(1)
N_A = env.action_space.n
N_S = env.observation_space.shape[0]
MEMORY_CAPACITY = 50000
TARGET_REP_ITER = 2000
MAX_EPISODES = 900
E_GREEDY = 0.95
E_INCREMENT = 0.00001
GAMMA = 0.99
LR = 0.0001
BATCH_SIZE = 32
HIDDEN = [400, 400]
RENDER = True
RL = DuelingDQNPrioritizedReplay(
n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
total_steps = 0
running_r = 0
r_scale = 100
for i_episode in range(MAX_EPISODES):
s = env.reset() # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
ep_r = 0
while True:
if total_steps > MEMORY_CAPACITY: env.render()
a = RL.choose_action(s)
s_, r, done, _ = env.step(a)
if r == -100: r = -30
r /= r_scale
ep_r += r
RL.store_transition(s, a, r, s_)
if total_steps > MEMORY_CAPACITY:
RL.learn()
if done:
land = '| Landed' if r == 100/r_scale else '| ------'
running_r = 0.99 * running_r + 0.01 * ep_r
print('Epi: ', i_episode,
land,
'| Epi_R: ', round(ep_r, 2),
'| Running_R: ', round(running_r, 2),
'| Epsilon: ', round(RL.epsilon, 3))
break
s = s_
total_steps += 1