Repository: mohammadasghari/dqn-multi-agent-rl Branch: master Commit: 6f392154b850 Files: 19 Total size: 63.0 KB Directory structure: gitextract_yem4qnp2/ ├── LICENSE ├── README.md ├── agents_landmarks_multiagent.py ├── brain.py ├── dqn_agent.py ├── environments/ │ ├── __init__.py │ ├── agents_landmarks/ │ │ ├── __init__.py │ │ └── env.py │ └── predators_prey/ │ ├── __init__.py │ └── env.py ├── predators_prey_multiagent.py ├── prioritized_experience_replay.py ├── results_predators_prey/ │ ├── rewards_files/ │ │ └── 5e-05_RMSProp_1000000_64_10000_100000_100_0_4_256_DQN_PER_0.5_False_3_5_1_1.csv │ ├── timesteps_files/ │ │ └── 5e-05_RMSProp_1000000_64_10000_100000_100_0_4_256_DQN_PER_0.5_False_3_5_1_1.csv │ └── weights_files/ │ ├── 5e-05_RMSProp_1000000_64_10000_100000_100_0_4_256_DQN_PER_0.5_False_3_5_1_1_0.h5 │ ├── 5e-05_RMSProp_1000000_64_10000_100000_100_0_4_256_DQN_PER_0.5_False_3_5_1_1_1.h5 │ └── 5e-05_RMSProp_1000000_64_10000_100000_100_0_4_256_DQN_PER_0.5_False_3_5_1_1_2.h5 ├── sum_tree.py └── uniform_experience_replay.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 mohammadasghari Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Deep Q-learning (DQN) for Multi-agent Reinforcement Learning (RL) DQN implementation for two multi-agent environments: `agents_landmarks` and `predators_prey` (See [details.pdf](https://github.com/mohammadasghari/dqn-multi-agent-rl/blob/master/details.pdf) for a detailed description of these environments). ## Code structure - `./environments/`: folder where the two environments (`agents_landmarks` and `predators_prey`) are stored. 1) `./environments/agents_landmarks`: in this environment, there exist ***n*** agents that must cooperate through actions to reach a set of ***n*** landmarks in a two-dimensional discrete ***k***-by-***k*** grid environment. 2) `./environments/predators_prey`: in this environment, ***n*** agents (called predators) must cooperate with each other to capture one prey in a two-dimensional discrete ***k***-by-***k*** grid environment. - `./dqn_agent.py`: contains code for the implementation of DQN and its extensions (Double DQN, Dueling DQN, DQN with Prioritized Experience Replay) (See [details.pdf](https://github.com/mohammadasghari/dqn-multi-agent-rl/blob/master/details.pdf) for a detailed description of the DQN and its extensions). 
- `./brain.py`: contains code for the implementation of neural networks required for DQN (See [details.pdf](https://github.com/mohammadasghari/dqn-multi-agent-rl/blob/master/details.pdf) for a detailed description of the neural network implementation). - `./uniform_experience_replay.py`: contains code for the implementation of Uniform Experience Replay (UER) which can be used in DQN. - `./prioritized_experience_replay.py`: contains code for the implementation of Prioritized Experience Replay (PER) which can be used in DQN. - `./sum_tree.py`: contains code for the implementation of sum tree data structure which is used in Prioritized Experience Replay (PER). - `./agents_landmarks_multiagent.py`: contains code for applying DQN to the `agents_landmarks` environment. - `./predators_prey_multiagent.py`: contains code for applying DQN to the `predators_prey` environment. - `./results_agents_landmarks/`: folder where the results (neural net weights, rewards of the episodes, videos, figures, etc.) for the `agents_landmarks` environment are stored. - `./results_predators_prey/`: folder where the results (neural net weights, rewards of the episodes, videos, figures, etc.) for the `predators_prey` environment are stored. - `./details.pdf`: a PDF file including a detailed description of the DQN and its extensions, the environments, and the neural network implementation. ## Results #### Predators and Prey Environment In this environment, the prey is captured when one predator moves to the location of the prey while the other predators occupy, for support, the neighboring cells of the prey's location. ##### Fixed prey (mode 0) ##### Random prey (mode 1) ##### Random escaping prey (mode 2) #### Agents and Landmarks Environment ##### 10 agents and 10 landmarks ##### 16 agents and 16 landmarks ### Todos - Write required dependencies and installation steps - ... 
================================================ FILE: agents_landmarks_multiagent.py ================================================ """ Created on Wednesday Jan 16 2019 @author: Seyed Mohammad Asghari @github: https://github.com/s3yyy3d-m """ import numpy as np import os import random import argparse import pandas as pd from environments.agents_landmarks.env import agentslandmarks from dqn_agent import Agent import glob ARG_LIST = ['learning_rate', 'optimizer', 'memory_capacity', 'batch_size', 'target_frequency', 'maximum_exploration', 'max_timestep', 'first_step_memory', 'replay_steps', 'number_nodes', 'target_type', 'memory', 'prioritization_scale', 'dueling', 'agents_number', 'grid_size', 'game_mode', 'reward_mode'] def get_name_brain(args, idx): file_name_str = '_'.join([str(args[x]) for x in ARG_LIST]) return './results_agents_landmarks/weights_files/' + file_name_str + '_' + str(idx) + '.h5' def get_name_rewards(args): file_name_str = '_'.join([str(args[x]) for x in ARG_LIST]) return './results_agents_landmarks/rewards_files/' + file_name_str + '.csv' def get_name_timesteps(args): file_name_str = '_'.join([str(args[x]) for x in ARG_LIST]) return './results_agents_landmarks/timesteps_files/' + file_name_str + '.csv' class Environment(object): def __init__(self, arguments): current_path = os.path.dirname(__file__) # Where your .py file is located self.env = agentslandmarks(arguments, current_path) self.episodes_number = arguments['episode_number'] self.render = arguments['render'] self.recorder = arguments['recorder'] self.max_ts = arguments['max_timestep'] self.test = arguments['test'] self.filling_steps = arguments['first_step_memory'] self.steps_b_updates = arguments['replay_steps'] self.max_random_moves = arguments['max_random_moves'] self.num_agents = arguments['agents_number'] self.num_landmarks = self.num_agents self.game_mode = arguments['game_mode'] self.grid_size = arguments['grid_size'] def run(self, agents, file1, file2): total_step = 0 
rewards_list = [] timesteps_list = [] max_score = -10000 for episode_num in xrange(self.episodes_number): state = self.env.reset() if self.render: self.env.render() random_moves = random.randint(0, self.max_random_moves) # create randomness in initial state for _ in xrange(random_moves): actions = [4 for _ in xrange(len(agents))] state, _, _ = self.env.step(actions) if self.render: self.env.render() # converting list of positions to an array state = np.array(state) state = state.ravel() done = False reward_all = 0 time_step = 0 while not done and time_step < self.max_ts: # if self.render: # self.env.render() actions = [] for agent in agents: actions.append(agent.greedy_actor(state)) next_state, reward, done = self.env.step(actions) # converting list of positions to an array next_state = np.array(next_state) next_state = next_state.ravel() if not self.test: for agent in agents: agent.observe((state, actions, reward, next_state, done)) if total_step >= self.filling_steps: agent.decay_epsilon() if time_step % self.steps_b_updates == 0: agent.replay() agent.update_target_model() total_step += 1 time_step += 1 state = next_state reward_all += reward if self.render: self.env.render() rewards_list.append(reward_all) timesteps_list.append(time_step) print("Episode {p}, Score: {s}, Final Step: {t}, Goal: {g}".format(p=episode_num, s=reward_all, t=time_step, g=done)) if self.recorder: os.system("ffmpeg -r 2 -i ./results_agents_landmarks/snaps/%04d.png -b:v 40000 -minrate 40000 -maxrate 4000k -bufsize 1835k -c:v mjpeg -qscale:v 0 " + "./results_agents_landmarks/videos/{a1}_{a2}_{a3}_{a4}.avi".format(a1=self.num_agents, a2=self.num_landmarks, a3=self.game_mode, a4=self.grid_size)) files = glob.glob('./results_agents_landmarks/snaps/*') for f in files: os.remove(f) if not self.test: if episode_num % 100 == 0: df = pd.DataFrame(rewards_list, columns=['score']) df.to_csv(file1) df = pd.DataFrame(timesteps_list, columns=['steps']) df.to_csv(file2) if total_step >= 
self.filling_steps: if reward_all > max_score: for agent in agents: agent.brain.save_model() max_score = reward_all if __name__ =="__main__": parser = argparse.ArgumentParser() # DQN Parameters parser.add_argument('-e', '--episode-number', default=1000000, type=int, help='Number of episodes') parser.add_argument('-l', '--learning-rate', default=0.00005, type=float, help='Learning rate') parser.add_argument('-op', '--optimizer', choices=['Adam', 'RMSProp'], default='RMSProp', help='Optimization method') parser.add_argument('-m', '--memory-capacity', default=1000000, type=int, help='Memory capacity') parser.add_argument('-b', '--batch-size', default=64, type=int, help='Batch size') parser.add_argument('-t', '--target-frequency', default=10000, type=int, help='Number of steps between the updates of target network') parser.add_argument('-x', '--maximum-exploration', default=100000, type=int, help='Maximum exploration step') parser.add_argument('-fsm', '--first-step-memory', default=0, type=float, help='Number of initial steps for just filling the memory') parser.add_argument('-rs', '--replay-steps', default=4, type=float, help='Steps between updating the network') parser.add_argument('-nn', '--number-nodes', default=256, type=int, help='Number of nodes in each layer of NN') parser.add_argument('-tt', '--target-type', choices=['DQN', 'DDQN'], default='DDQN') parser.add_argument('-mt', '--memory', choices=['UER', 'PER'], default='PER') parser.add_argument('-pl', '--prioritization-scale', default=0.5, type=float, help='Scale for prioritization') parser.add_argument('-du', '--dueling', action='store_true', help='Enable Dueling architecture if "store_false" ') parser.add_argument('-gn', '--gpu-num', default='2', type=str, help='Number of GPU to use') parser.add_argument('-test', '--test', action='store_true', help='Enable the test phase if "store_false"') # Game Parameters parser.add_argument('-k', '--agents-number', default=5, type=int, help='The number of agents') 
parser.add_argument('-g', '--grid-size', default=10, type=int, help='Grid size') parser.add_argument('-ts', '--max-timestep', default=100, type=int, help='Maximum number of timesteps per episode') parser.add_argument('-gm', '--game-mode', choices=[0, 1], type=int, default=1, help='Mode of the game, ' '0: landmarks and agents fixed, ' '1: landmarks and agents random ') parser.add_argument('-rw', '--reward-mode', choices=[0, 1, 2], type=int, default=1, help='Mode of the reward,' '0: Only terminal rewards' '1: Partial rewards ' '(number of unoccupied landmarks' '2: Full rewards ' '(sum of dinstances of agents to landmarks)') parser.add_argument('-rm', '--max-random-moves', default=0, type=int, help='Maximum number of random initial moves for the agents') # Visualization Parameters parser.add_argument('-r', '--render', action='store_false', help='Turn on visualization if "store_false"') parser.add_argument('-re', '--recorder', action='store_true', help='Store the visualization as a movie ' 'if "store_false"') args = vars(parser.parse_args()) os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu_num'] env = Environment(args) state_size = env.env.state_size action_space = env.env.action_space() all_agents = [] for b_idx in xrange(args['agents_number']): brain_file = get_name_brain(args, b_idx) all_agents.append(Agent(state_size, action_space, b_idx, brain_file, args)) rewards_file = get_name_rewards(args) timesteps_file = get_name_timesteps(args) env.run(all_agents, rewards_file, timesteps_file) ================================================ FILE: brain.py ================================================ """ Created on Wednesday Jan 16 2019 @author: Seyed Mohammad Asghari @github: https://github.com/s3yyy3d-m """ import os from keras.models import Sequential, Model from keras.layers import Dense, Lambda, Input, Concatenate from keras.optimizers import * import tensorflow as tf from keras import backend as K HUBER_LOSS_DELTA = 1.0 def huber_loss(y_true, y_predict): err = 
y_true - y_predict cond = K.abs(err) < HUBER_LOSS_DELTA L2 = 0.5 * K.square(err) L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA) loss = tf.where(cond, L2, L1) return K.mean(loss) class Brain(object): def __init__(self, state_size, action_size, brain_name, arguments): self.state_size = state_size self.action_size = action_size self.weight_backup = brain_name self.batch_size = arguments['batch_size'] self.learning_rate = arguments['learning_rate'] self.test = arguments['test'] self.num_nodes = arguments['number_nodes'] self.dueling = arguments['dueling'] self.optimizer_model = arguments['optimizer'] self.model = self._build_model() self.model_ = self._build_model() def _build_model(self): if self.dueling: x = Input(shape=(self.state_size,)) # a series of fully connected layer for estimating V(s) y11 = Dense(self.num_nodes, activation='relu')(x) y12 = Dense(self.num_nodes, activation='relu')(y11) y13 = Dense(1, activation="linear")(y12) # a series of fully connected layer for estimating A(s,a) y21 = Dense(self.num_nodes, activation='relu')(x) y22 = Dense(self.num_nodes, activation='relu')(y21) y23 = Dense(self.action_size, activation="linear")(y22) w = Concatenate(axis=-1)([y13, y23]) # combine V(s) and A(s,a) to get Q(s,a) z = Lambda(lambda a: K.expand_dims(a[:, 0], axis=-1) + a[:, 1:] - K.mean(a[:, 1:], keepdims=True), output_shape=(self.action_size,))(w) else: x = Input(shape=(self.state_size,)) # a series of fully connected layer for estimating Q(s,a) y1 = Dense(self.num_nodes, activation='relu')(x) y2 = Dense(self.num_nodes, activation='relu')(y1) z = Dense(self.action_size, activation="linear")(y2) model = Model(inputs=x, outputs=z) if self.optimizer_model == 'Adam': optimizer = Adam(lr=self.learning_rate, clipnorm=1.) elif self.optimizer_model == 'RMSProp': optimizer = RMSprop(lr=self.learning_rate, clipnorm=1.) 
else: print('Invalid optimizer!') model.compile(loss=huber_loss, optimizer=optimizer) if self.test: if not os.path.isfile(self.weight_backup): print('Error:no file') else: model.load_weights(self.weight_backup) return model def train(self, x, y, sample_weight=None, epochs=1, verbose=0): # x is the input to the network and y is the output self.model.fit(x, y, batch_size=len(x), sample_weight=sample_weight, epochs=epochs, verbose=verbose) def predict(self, state, target=False): if target: # get prediction from target network return self.model_.predict(state) else: # get prediction from local network return self.model.predict(state) def predict_one_sample(self, state, target=False): return self.predict(state.reshape(1,self.state_size), target=target).flatten() def update_target_model(self): self.model_.set_weights(self.model.get_weights()) def save_model(self): self.model.save(self.weight_backup) ================================================ FILE: dqn_agent.py ================================================ """ Created on Wednesday Jan 16 2019 @author: Seyed Mohammad Asghari @github: https://github.com/s3yyy3d-m """ import numpy as np import random from brain import Brain from uniform_experience_replay import Memory as UER from prioritized_experience_replay import Memory as PER MAX_EPSILON = 1.0 MIN_EPSILON = 0.01 MIN_BETA = 0.4 MAX_BETA = 1.0 class Agent(object): epsilon = MAX_EPSILON beta = MIN_BETA def __init__(self, state_size, action_size, bee_index, brain_name, arguments): self.state_size = state_size self.action_size = action_size self.bee_index = bee_index self.learning_rate = arguments['learning_rate'] self.gamma = 0.95 self.brain = Brain(self.state_size, self.action_size, brain_name, arguments) self.memory_model = arguments['memory'] if self.memory_model == 'UER': self.memory = UER(arguments['memory_capacity']) elif self.memory_model == 'PER': self.memory = PER(arguments['memory_capacity'], arguments['prioritization_scale']) else: print('Invalid memory 
model!') self.target_type = arguments['target_type'] self.update_target_frequency = arguments['target_frequency'] self.max_exploration_step = arguments['maximum_exploration'] self.batch_size = arguments['batch_size'] self.step = 0 self.test = arguments['test'] if self.test: self.epsilon = MIN_EPSILON def greedy_actor(self, state): if np.random.rand() <= self.epsilon: return random.randrange(self.action_size) else: return np.argmax(self.brain.predict_one_sample(state)) def find_targets_per(self, batch): batch_len = len(batch) states = np.array([o[1][0] for o in batch]) states_ = np.array([o[1][3] for o in batch]) p = self.brain.predict(states) p_ = self.brain.predict(states_) pTarget_ = self.brain.predict(states_, target=True) x = np.zeros((batch_len, self.state_size)) y = np.zeros((batch_len, self.action_size)) errors = np.zeros(batch_len) for i in range(batch_len): o = batch[i][1] s = o[0] a = o[1][self.bee_index] r = o[2] s_ = o[3] done = o[4] t = p[i] old_value = t[a] if done: t[a] = r else: if self.target_type == 'DDQN': t[a] = r + self.gamma * pTarget_[i][np.argmax(p_[i])] elif self.target_type == 'DQN': t[a] = r + self.gamma * np.amax(pTarget_[i]) else: print('Invalid type for target network!') x[i] = s y[i] = t errors[i] = np.abs(t[a] - old_value) return [x, y, errors] def find_targets_uer(self, batch): batch_len = len(batch) states = np.array([o[0] for o in batch]) states_ = np.array([o[3] for o in batch]) p = self.brain.predict(states) p_ = self.brain.predict(states_) pTarget_ = self.brain.predict(states_, target=True) x = np.zeros((batch_len, self.state_size)) y = np.zeros((batch_len, self.action_size)) errors = np.zeros(batch_len) for i in range(batch_len): o = batch[i] s = o[0] a = o[1][self.bee_index] r = o[2] s_ = o[3] done = o[4] t = p[i] old_value = t[a] if done: t[a] = r else: if self.target_type == 'DDQN': t[a] = r + self.gamma * pTarget_[i][np.argmax(p_[i])] elif self.target_type == 'DQN': t[a] = r + self.gamma * np.amax(pTarget_[i]) else: 
print('Invalid type for target network!') x[i] = s y[i] = t errors[i] = np.abs(t[a] - old_value) return [x, y] def observe(self, sample): if self.memory_model == 'UER': self.memory.remember(sample) elif self.memory_model == 'PER': _, _, errors = self.find_targets_per([[0, sample]]) self.memory.remember(sample, errors[0]) else: print('Invalid memory model!') def decay_epsilon(self): # slowly decrease Epsilon based on our experience self.step += 1 if self.test: self.epsilon = MIN_EPSILON self.beta = MAX_BETA else: if self.step < self.max_exploration_step: self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * (self.max_exploration_step - self.step)/self.max_exploration_step self.beta = MAX_BETA + (MIN_BETA - MAX_BETA) * (self.max_exploration_step - self.step)/self.max_exploration_step else: self.epsilon = MIN_EPSILON def replay(self): if self.memory_model == 'UER': batch = self.memory.sample(self.batch_size) x, y = self.find_targets_uer(batch) self.brain.train(x, y) elif self.memory_model == 'PER': [batch, batch_indices, batch_priorities] = self.memory.sample(self.batch_size) x, y, errors = self.find_targets_per(batch) normalized_batch_priorities = [float(i) / sum(batch_priorities) for i in batch_priorities] importance_sampling_weights = [(self.batch_size * i) ** (-1 * self.beta) for i in normalized_batch_priorities] normalized_importance_sampling_weights = [float(i) / max(importance_sampling_weights) for i in importance_sampling_weights] sample_weights = [errors[i] * normalized_importance_sampling_weights[i] for i in xrange(len(errors))] self.brain.train(x, y, np.array(sample_weights)) self.memory.update(batch_indices, errors) else: print('Invalid memory model!') def update_target_model(self): if self.step % self.update_target_frequency == 0: self.brain.update_target_model() ================================================ FILE: environments/__init__.py ================================================ ================================================ FILE: 
environments/agents_landmarks/__init__.py ================================================ ================================================ FILE: environments/agents_landmarks/env.py ================================================ """ Created on Wednesday Jan 16 2019 @author: Seyed Mohammad Asghari @github: https://github.com/s3yyy3d-m """ import random import operator import numpy as np import pygame import sys import os # Define some colors BLACK = (0, 0, 0) WHITE = (255, 255, 255) GREEN = (0, 255, 0) RED = (255, 0, 0) BLUE = (0, 0, 255) GRAY = (128, 128, 128) ORANGE = (255, 128, 0) # This sets the WIDTH and HEIGHT of each grid location WIDTH = 60 HEIGHT = 60 # This sets the margin between each cell MARGIN = 1 class agentslandmarks: UP = 0 DOWN = 1 LEFT = 2 RIGHT = 3 STAY = 4 A = [UP, DOWN, LEFT, RIGHT, STAY] A_DIFF = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)] def __init__(self, args, current_path): self.game_mode = args['game_mode'] self.reward_mode = args['reward_mode'] self.num_agents = args['agents_number'] self.num_landmarks = self.num_agents self.grid_size = args['grid_size'] self.state_size = (self.num_agents + self.num_landmarks) * 2 self.agents_positions = [] self.landmarks_positions = [] self.render_flag = args['render'] self.recorder_flag = args['recorder'] # enables visualizer if self.render_flag: [self.screen, self.my_font] = self.gui_setup() self.step_num = 1 resource_path = os.path.join(current_path, 'environments') # The resource folder path resource_path = os.path.join(resource_path, 'agents_landmarks') # The resource folder path image_path = os.path.join(resource_path, 'images') # The image folder path img = pygame.image.load(os.path.join(image_path, 'agent.jpg')).convert() self.img_agent = pygame.transform.scale(img, (WIDTH, WIDTH)) img = pygame.image.load(os.path.join(image_path, 'landmark.jpg')).convert() self.img_landmark = pygame.transform.scale(img, (WIDTH, WIDTH)) img = pygame.image.load(os.path.join(image_path, 
'agent_landmark.jpg')).convert() self.img_agent_landmark = pygame.transform.scale(img, (WIDTH, WIDTH)) img = pygame.image.load(os.path.join(image_path, 'agent_agent_landmark.jpg')).convert() self.img_agent_agent_landmark = pygame.transform.scale(img, (WIDTH, WIDTH)) img = pygame.image.load(os.path.join(image_path, 'agent_agent.jpg')).convert() self.img_agent_agent = pygame.transform.scale(img, (WIDTH, WIDTH)) if self.recorder_flag: self.snaps_path = os.path.join(current_path, 'results_agents_landmarks') # The resource folder path self.snaps_path = os.path.join(self.snaps_path, 'snaps') # The resource folder path self.cells = [] self.positions_idx = [] # self.agents_collide_flag = args['collide_flag'] # self.penalty_per_collision = args['penalty_collision'] self.num_episodes = 0 self.terminal = False def set_positions_idx(self): cells = [(i, j) for i in range(0, self.grid_size) for j in range(0, self.grid_size)] positions_idx = [] if self.game_mode == 0: # first enter the positions for the landmarks and then for the agents. If the grid is n*n, then the # positions are # 0 1 2 ... n-1 # n n+1 n+2 ... 2n-1 # 2n 2n+1 2n+2 ... 3n-1 # . . . . . # . . . . . # . . . . . # (n-1)*n (n-1)*n+1 (n-1)*n+2 ... 
n*n+1 # , e.g., # positions_idx = [0, 6, 23, 24] where 0 and 6 are the positions of landmarks and 23 and 24 are positions # of agents positions_idx = [] if self.game_mode == 1: positions_idx = np.random.choice(len(cells), size=self.num_landmarks + self.num_agents, replace=False) return [cells, positions_idx] def reset(self): # initialize the world self.terminal = False [self.cells, self.positions_idx] = self.set_positions_idx() # separate the generated position indices for walls, pursuers, and evaders landmarks_positions_idx = self.positions_idx[0:self.num_landmarks] agents_positions_idx = self.positions_idx[self.num_landmarks:self.num_landmarks + self.num_agents] # map generated position indices to positions self.landmarks_positions = [self.cells[pos] for pos in landmarks_positions_idx] self.agents_positions = [self.cells[pos] for pos in agents_positions_idx] initial_state = list(sum(self.landmarks_positions + self.agents_positions, ())) return initial_state def step(self, agents_actions): # update the position of agents self.agents_positions = self.update_positions(self.agents_positions, agents_actions) if self.reward_mode == 0: binary_cover_list = [] for landmark in self.landmarks_positions: distances = [np.linalg.norm(np.array(landmark) - np.array(agent_pos), 1) for agent_pos in self.agents_positions] min_dist = min(distances) if min_dist == 0: binary_cover_list.append(min_dist) else: binary_cover_list.append(1) # check the terminal case if sum(binary_cover_list) == 0: reward = 0 self.terminal = True else: reward = -1 self.terminal = False if self.reward_mode == 1: binary_cover_list = [] for landmark in self.landmarks_positions: distances = [np.linalg.norm(np.array(landmark) - np.array(agent_pos), 1) for agent_pos in self.agents_positions] min_dist = min(distances) if min_dist == 0: binary_cover_list.append(0) else: binary_cover_list.append(1) reward = -1 * sum(binary_cover_list) # check the terminal case if reward == 0: self.terminal = True else: self.terminal 
= False if self.reward_mode == 2: # calculate the sum of minimum distances of agents to landmarks reward = 0 for landmark in self.landmarks_positions: distances = [np.linalg.norm(np.array(landmark) - np.array(agent_pos), 1) for agent_pos in self.agents_positions] reward -= min(distances) # check the terminal case if reward == 0: self.terminal = True new_state = list(sum(self.landmarks_positions + self.agents_positions, ())) return [new_state, reward, self.terminal] def update_positions(self, pos_list, act_list): positions_action_applied = [] for idx in xrange(len(pos_list)): if act_list[idx] != 4: pos_act_applied = map(operator.add, pos_list[idx], self.A_DIFF[act_list[idx]]) # checks to make sure the new pos in inside the grid for i in xrange(0, 2): if pos_act_applied[i] < 0: pos_act_applied[i] = 0 if pos_act_applied[i] >= self.grid_size: pos_act_applied[i] = self.grid_size - 1 positions_action_applied.append(tuple(pos_act_applied)) else: positions_action_applied.append(pos_list[idx]) final_positions = [] for pos_idx in xrange(len(pos_list)): if positions_action_applied[pos_idx] == pos_list[pos_idx]: final_positions.append(pos_list[pos_idx]) elif positions_action_applied[pos_idx] not in pos_list and positions_action_applied[ pos_idx] not in positions_action_applied[ 0:pos_idx] + positions_action_applied[ pos_idx + 1:]: final_positions.append(positions_action_applied[pos_idx]) else: final_positions.append(pos_list[pos_idx]) return final_positions def action_space(self): return len(self.A) def render(self): pygame.time.delay(500) pygame.display.flip() for event in pygame.event.get(): if event.type == pygame.QUIT: sys.exit() self.screen.fill(BLACK) text = self.my_font.render("Step: {0}".format(self.step_num), 1, WHITE) self.screen.blit(text, (5, 15)) for row in range(self.grid_size): for column in range(self.grid_size): pos = (row, column) frequency = self.find_frequency(pos, self.agents_positions) if pos in self.landmarks_positions and frequency >= 1: if frequency == 
1: self.screen.blit(self.img_agent_landmark, ((MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50)) else: self.screen.blit(self.img_agent_agent_landmark, ((MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50)) elif pos in self.landmarks_positions: self.screen.blit(self.img_landmark, ((MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50)) elif frequency >= 1: if frequency == 1: self.screen.blit(self.img_agent, ((MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50)) elif frequency > 1: self.screen.blit(self.img_agent_agent, ((MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50)) else: print('Error!') else: pygame.draw.rect(self.screen, WHITE, [(MARGIN + WIDTH) * column + MARGIN, (MARGIN + HEIGHT) * row + MARGIN + 50, WIDTH, HEIGHT]) if self.recorder_flag: file_name = "%04d.png" % self.step_num pygame.image.save(self.screen, os.path.join(self.snaps_path, file_name)) if not self.terminal: self.step_num += 1 def gui_setup(self): # Initialize pygame pygame.init() # Set the HEIGHT and WIDTH of the screen board_size_x = (WIDTH + MARGIN) * self.grid_size board_size_y = (HEIGHT + MARGIN) * self.grid_size window_size_x = int(board_size_x) window_size_y = int(board_size_y * 1.2) window_size = [window_size_x, window_size_y] screen = pygame.display.set_mode(window_size) # Set title of screen pygame.display.set_caption("Agents-and-Landmarks Game") myfont = pygame.font.SysFont("monospace", 30) return [screen, myfont] def find_frequency(self, a, items): freq = 0 for item in items: if item == a: freq += 1 return freq ================================================ FILE: environments/predators_prey/__init__.py ================================================ ================================================ FILE: environments/predators_prey/env.py ================================================ """ Created on Wednesday Jan 16 2019 @author: Seyed Mohammad Asghari @github: 
class PredatorsPrey(object):
    """Grid world in which several predators try to corner a single prey.

    Cells are addressed as ``(row, column)`` on a ``grid_size`` x ``grid_size``
    board.  A state is the flat list of coordinates
    ``[walls..., predators..., preys...]`` (two numbers per entity).

    ``args`` must provide the keys ``agents_number``, ``preys_mode``,
    ``grid_size``, ``game_mode``, ``reward_mode``, ``render`` and ``recorder``.
    """

    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3
    STAY = 4

    A = [UP, DOWN, LEFT, RIGHT, STAY]
    # (row, column) offset applied by each action, indexed by the action id.
    A_DIFF = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)]

    def __init__(self, args, current_path):
        """Store the game settings and, when rendering, set up pygame.

        :param args: dict of settings (see the class docstring for the keys).
        :param current_path: directory that contains the ``environments`` and
            ``results_predators_prey`` folders.
        """
        self.num_predators = args['agents_number']
        self.num_preys = 1
        self.preys_mode = args['preys_mode']
        self.num_walls = 0
        self.grid_size = args['grid_size']
        self.game_mode = args['game_mode']
        self.reward_mode = args['reward_mode']
        self.state_size = (self.num_preys + self.num_predators + self.num_walls) * 2
        self.predators_positions = []
        self.preys_positions = []
        self.walls_positions = []
        self.render_flag = args['render']
        self.recorder_flag = args['recorder']

        # enables visualizer
        if self.render_flag:
            [self.screen, self.my_font] = self.gui_setup()
            self.step_num = 1

            image_path = os.path.join(current_path, 'environments', 'predators_prey', 'images')
            self.img_predator_prey = self._load_tile(image_path, 'predator_prey.jpg')
            self.img_predator = self._load_tile(image_path, 'predator.jpg')
            self.img_prey = self._load_tile(image_path, 'prey.jpg')

        if self.recorder_flag:
            # Folder where render() stores one snapshot per step.
            self.snaps_path = os.path.join(current_path, 'results_predators_prey', 'snaps')

        self.cells = []
        self.agents_positions_idx = []

        self.num_episodes = 0
        self.terminal = False

    def _load_tile(self, image_path, file_name):
        """Load one image from ``image_path`` and scale it to a grid cell."""
        img = pygame.image.load(os.path.join(image_path, file_name)).convert()
        return pygame.transform.scale(img, (WIDTH, WIDTH))

    def set_positions_idx(self):
        """Build the list of grid cells and pick the starting cell indices.

        :return: ``[cells, positions_idx]`` where ``cells`` lists every
            ``(row, column)`` in row-major order and ``positions_idx`` holds
            the chosen cell indices (walls first, then predators, then preys).
        """
        cells = [(i, j) for i in range(self.grid_size) for j in range(self.grid_size)]

        positions_idx = []

        if self.game_mode == 0:
            # Fixed layout: enter the positions for the agents (predators) and
            # the single prey by hand.  For an n*n grid the cell indices are
            #   0         1           2          ...  n-1
            #   n         n+1         n+2        ...  2n-1
            #   ...
            #   (n-1)*n   (n-1)*n+1   (n-1)*n+2  ...  n*n-1
            # e.g. positions_idx = [0, 6, 23, 24] where 0, 6 and 23 are the
            # positions of the agents and 24 is the position of the prey.
            # NOTE: left empty here, so reset() yields no entities in this
            # mode until a layout is filled in.
            positions_idx = []

        if self.game_mode == 1:
            # Random layout: distinct cells for every entity.
            entity_count = self.num_walls + self.num_predators + self.num_preys
            positions_idx = np.random.choice(len(cells), size=entity_count, replace=False)

        return [cells, positions_idx]

    def reset(self):
        """Start a new episode and return the initial flat state list."""
        self.terminal = False
        self.num_catches = 0
        [self.cells, self.agents_positions_idx] = self.set_positions_idx()

        # separate the generated position indices for walls, predators, and preys
        first_predator = self.num_walls
        first_prey = self.num_walls + self.num_predators
        walls_positions_idx = self.agents_positions_idx[0:first_predator]
        predators_positions_idx = self.agents_positions_idx[first_predator:first_prey]
        preys_positions_idx = self.agents_positions_idx[first_prey:]

        # map generated position indices to positions
        self.walls_positions = [self.cells[pos] for pos in walls_positions_idx]
        self.predators_positions = [self.cells[pos] for pos in predators_positions_idx]
        self.preys_positions = [self.cells[pos] for pos in preys_positions_idx]

        return self._flat_state()

    def _flat_state(self):
        """Flatten walls, predators and preys positions into one list."""
        return list(sum(self.walls_positions + self.predators_positions + self.preys_positions, ()))

    def fix_prey(self):
        """Action of a prey that never moves."""
        return self.STAY

    def actor_prey_random(self):
        """Uniformly random action for a prey."""
        return random.randrange(self.action_space())

    def actor_prey_random_escape(self, prey_index):
        """Random action that leads the prey to a cell free of predators.

        Falls back to ``STAY`` when every reachable cell (including the
        prey's own) is occupied, instead of failing on an empty choice.
        """
        prey_pos = self.preys_positions[prey_index]
        [_, action_to_neighbors] = self.empty_neighbor_finder(prey_pos)
        if not action_to_neighbors:
            return self.STAY
        return random.choice(action_to_neighbors)

    def _reachable_neighbors(self, pos):
        """Return ``(positions, actions)`` reachable from ``pos`` in one step.

        The four moves are kept when they stay inside the grid and do not hit
        a wall; staying in place is always appended last.
        """
        positions = []
        actions = []
        for action in (self.UP, self.DOWN, self.LEFT, self.RIGHT):
            d_row, d_col = self.A_DIFF[action]
            candidate = (pos[0] + d_row, pos[1] + d_col)
            inside = 0 <= candidate[0] < self.grid_size and 0 <= candidate[1] < self.grid_size
            if inside and candidate not in self.walls_positions:
                positions.append(candidate)
                actions.append(action)
        positions.append(pos)
        actions.append(self.STAY)
        return positions, actions

    def neighbor_finder(self, pos):
        """Return ``[neighbors_pos, action_to_neighbor]`` for ``pos``.

        ``neighbors_pos`` are ``(row, column)`` tuples and
        ``action_to_neighbor`` the action that reaches each of them.
        """
        neighbors_pos, action_to_neighbor = self._reachable_neighbors(pos)
        return [neighbors_pos, action_to_neighbor]

    def empty_neighbor_finder(self, pos):
        """Like :meth:`neighbor_finder`, but drop cells holding a predator."""
        neighbors_pos, action_to_neighbor = self._reachable_neighbors(pos)

        empty_neighbors_pos = []
        action_to_empty_neighbor = []
        for neighbor, action in zip(neighbors_pos, action_to_neighbor):
            if tuple(neighbor) not in self.predators_positions:
                empty_neighbors_pos.append(neighbor)
                action_to_empty_neighbor.append(action)

        return [empty_neighbors_pos, action_to_empty_neighbor]

    def step(self, predators_actions):
        """Advance the world by one step.

        The preys move first (according to ``preys_mode``: 0 fixed, 1 random
        escape, 2 random), then the predators apply ``predators_actions``.

        :param predators_actions: one action id per predator.
        :return: ``[new_state, reward, terminal]``.
        :raises ValueError: if ``preys_mode`` is not 0, 1 or 2.
        """
        # update the position of preys
        preys_actions = []
        for prey_idx in range(len(self.preys_positions)):
            if self.preys_mode == 0:
                preys_actions.append(self.fix_prey())
            elif self.preys_mode == 1:
                preys_actions.append(self.actor_prey_random_escape(prey_idx))
            elif self.preys_mode == 2:
                preys_actions.append(self.actor_prey_random())
            else:
                raise ValueError('Invalid mode for the prey: {0!r}'.format(self.preys_mode))

        self.preys_positions = self.update_positions(self.preys_positions, preys_actions)
        # update the position of predators
        self.predators_positions = self.update_positions(self.predators_positions, predators_actions)
        # check whether any predator catches any prey
        [reward, self.terminal] = self.check_catching()
        new_state = self._flat_state()

        return [new_state, reward, self.terminal]

    def check_catching(self):
        """Compute the reward and whether the episode is over.

        The episode ends when the summed L1 distance from the predators to the
        prey equals ``num_predators - 1`` or when the prey has no predator-free
        cell left (its own cell included).  The terminal reward is 0; otherwise
        it is -1 (``reward_mode`` 0) or minus the summed distance
        (``reward_mode`` 1).

        :return: ``[reward, terminal_flag]``.
        :raises ValueError: if ``reward_mode`` is not 0 or 1.
        """
        if self.reward_mode not in (0, 1):
            raise ValueError('Invalid reward mode: {0!r}'.format(self.reward_mode))

        prey_pos = self.preys_positions[0]
        distances = 0
        for predator in self.predators_positions:
            distances += np.linalg.norm(np.array(predator) - np.array(prey_pos), 1)

        [prey_empty_neighbours, _] = self.empty_neighbor_finder(prey_pos)

        # The original code rebinds the prey list to a fresh copy; keep that.
        self.preys_positions = list(self.preys_positions)

        # check the terminal case
        if int(distances) == self.num_predators - 1 or len(prey_empty_neighbours) == 0:
            return [0, True]

        reward = -1 if self.reward_mode == 0 else -1 * distances
        return [reward, False]

    def update_positions(self, pos_list, act_list):
        """Apply one action per entity and resolve collisions.

        A move is clamped to the grid.  It is cancelled (the entity keeps its
        old cell) when the target cell is currently occupied by any entity of
        ``pos_list`` or is also targeted by another entity of the list.

        :param pos_list: current ``(row, column)`` tuples.
        :param act_list: action id for each entry of ``pos_list``.
        :return: list with the new position of every entity.
        """
        last = self.grid_size - 1
        proposed = []
        for idx, pos in enumerate(pos_list):
            action = act_list[idx]
            if action == self.STAY:
                proposed.append(pos)
                continue
            d_row, d_col = self.A_DIFF[action]
            # clamp so the new position stays inside the grid
            proposed.append((min(max(pos[0] + d_row, 0), last),
                             min(max(pos[1] + d_col, 0), last)))

        final_positions = []
        for idx, old in enumerate(pos_list):
            new = proposed[idx]
            others = proposed[:idx] + proposed[idx + 1:]
            if new != old and new not in pos_list and new not in others:
                final_positions.append(new)
            else:
                final_positions.append(old)

        return final_positions

    def action_space(self):
        """Number of available actions."""
        return len(self.A)

    def render(self):
        """Draw the current board with pygame and optionally save a snapshot."""
        pygame.time.wait(500)
        pygame.display.flip()

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                sys.exit()

        self.screen.fill(BLACK)
        text = self.my_font.render("Step: {0}".format(self.step_num), 1, WHITE)
        self.screen.blit(text, (5, 15))

        for row in range(self.grid_size):
            for column in range(self.grid_size):
                pos = (row, column)
                # The board is drawn 50 pixels below the top to leave room for the step text.
                left = (MARGIN + WIDTH) * column + MARGIN
                top = (MARGIN + HEIGHT) * row + MARGIN + 50
                has_predator = pos in self.predators_positions
                has_prey = pos in self.preys_positions

                if has_predator and has_prey:
                    self.screen.blit(self.img_predator_prey, (left, top))
                elif has_predator:
                    self.screen.blit(self.img_predator, (left, top))
                elif has_prey:
                    self.screen.blit(self.img_prey, (left, top))
                else:
                    pygame.draw.rect(self.screen, WHITE, [left, top, WIDTH, HEIGHT])

        if self.recorder_flag:
            file_name = "%04d.png" % self.step_num
            pygame.image.save(self.screen, os.path.join(self.snaps_path, file_name))

        if not self.terminal:
            self.step_num += 1

    def gui_setup(self):
        """Initialise pygame and return ``[screen, font]``."""
        # Initialize pygame
        pygame.init()

        # Set the HEIGHT and WIDTH of the screen
        board_size_x = (WIDTH + MARGIN) * self.grid_size
        board_size_y = (HEIGHT + MARGIN) * self.grid_size

        window_size_x = int(board_size_x * 1.01)
        window_size_y = int(board_size_y * 1.2)

        window_size = [window_size_x, window_size_y]
        screen = pygame.display.set_mode(window_size, 0, 32)

        # Set title of screen
        pygame.display.set_caption("Predators-and-Prey Game")

        myfont = pygame.font.SysFont("monospace", 30)

        return [screen, myfont]
is located self.env = PredatorsPrey(arguments, current_path) self.episodes_number = arguments['episode_number'] self.render = arguments['render'] self.recorder = arguments['recorder'] self.max_ts = arguments['max_timestep'] self.test = arguments['test'] self.filling_steps = arguments['first_step_memory'] self.steps_b_updates = arguments['replay_steps'] self.max_random_moves = arguments['max_random_moves'] self.num_predators = arguments['agents_number'] self.num_preys = 1 self.preys_mode = arguments['preys_mode'] self.game_mode = arguments['game_mode'] self.grid_size = arguments['grid_size'] def run(self, agents, file1, file2): total_step = 0 rewards_list = [] timesteps_list = [] max_score = -10000 for episode_num in xrange(self.episodes_number): state = self.env.reset() if self.render: self.env.render() random_moves = random.randint(0, self.max_random_moves) # create randomness in initial state for _ in xrange(random_moves): actions = [4 for _ in xrange(len(agents))] state, _, _ = self.env.step(actions) if self.render: self.env.render() # converting list of positions to an array state = np.array(state) state = state.ravel() done = False reward_all = 0 time_step = 0 while not done and time_step < self.max_ts: # if self.render: # self.env.render() actions = [] for agent in agents: actions.append(agent.greedy_actor(state)) next_state, reward, done = self.env.step(actions) # converting list of positions to an array next_state = np.array(next_state) next_state = next_state.ravel() if not self.test: for agent in agents: agent.observe((state, actions, reward, next_state, done)) if total_step >= self.filling_steps: agent.decay_epsilon() if time_step % self.steps_b_updates == 0: agent.replay() agent.update_target_model() total_step += 1 time_step += 1 state = next_state reward_all += reward if self.render: self.env.render() rewards_list.append(reward_all) timesteps_list.append(time_step) print("Episode {p}, Score: {s}, Final Step: {t}, Goal: {g}".format(p=episode_num, 
if __name__ == "__main__":
    # Command-line entry point: parse the settings, build one DQN agent per
    # predator and run the training (or test) loop.
    parser = argparse.ArgumentParser()

    # DQN Parameters
    parser.add_argument('-e', '--episode-number', default=1, type=int, help='Number of episodes')
    parser.add_argument('-l', '--learning-rate', default=0.00005, type=float, help='Learning rate')
    parser.add_argument('-op', '--optimizer', choices=['Adam', 'RMSProp'], default='RMSProp',
                        help='Optimization method')
    parser.add_argument('-m', '--memory-capacity', default=1000000, type=int, help='Memory capacity')
    parser.add_argument('-b', '--batch-size', default=64, type=int, help='Batch size')
    parser.add_argument('-t', '--target-frequency', default=10000, type=int,
                        help='Number of steps between the updates of target network')
    parser.add_argument('-x', '--maximum-exploration', default=100000, type=int,
                        help='Maximum exploration step')
    parser.add_argument('-fsm', '--first-step-memory', default=0, type=float,
                        help='Number of initial steps for just filling the memory')
    parser.add_argument('-rs', '--replay-steps', default=4, type=float,
                        help='Steps between updating the network')
    parser.add_argument('-nn', '--number-nodes', default=256, type=int,
                        help='Number of nodes in each layer of NN')
    parser.add_argument('-tt', '--target-type', choices=['DQN', 'DDQN'], default='DQN')
    parser.add_argument('-mt', '--memory', choices=['UER', 'PER'], default='PER')
    parser.add_argument('-pl', '--prioritization-scale', default=0.5, type=float,
                        help='Scale for prioritization')
    # store_true flags: absent means disabled, passing the flag enables.
    parser.add_argument('-du', '--dueling', action='store_true',
                        help='Enable the Dueling architecture')

    parser.add_argument('-gn', '--gpu-num', default='2', type=str,
                        help='GPU id(s) to use (value assigned to CUDA_VISIBLE_DEVICES)')
    parser.add_argument('-test', '--test', action='store_true', help='Enable the test phase')

    # Game Parameters
    parser.add_argument('-k', '--agents-number', default=3, type=int, help='The number of agents')
    parser.add_argument('-g', '--grid-size', default=5, type=int, help='Grid size')
    parser.add_argument('-ts', '--max-timestep', default=100, type=int,
                        help='Maximum number of timesteps per episode')
    parser.add_argument('-gm', '--game-mode', choices=[0, 1], type=int, default=1,
                        help='Mode of the game, '
                             '0: prey and agents (predators) are fixed, '
                             '1: prey and agents (predators) are random')
    # Help text follows PredatorsPrey.check_catching: mode 0 gives -1 per
    # non-terminal step, mode 1 gives minus the summed distances.
    parser.add_argument('-rw', '--reward-mode', choices=[0, 1], type=int, default=1,
                        help='Mode of the reward, '
                             '0: constant penalty of -1 per non-terminal step, '
                             '1: negative sum of distances of agents to the prey')
    parser.add_argument('-rm', '--max-random-moves', default=0, type=int,
                        help='Maximum number of random initial moves for agents')
    # Help text follows PredatorsPrey.step: 1 is random escape, 2 is random.
    parser.add_argument('-evm', '--preys-mode', choices=[0, 1, 2], type=int, default=2,
                        help='Mode of preys: '
                             '0: fixed, '
                             '1: random escape, '
                             '2: random')

    # Visualization Parameters
    # store_false: rendering is ON by default and this flag switches it off.
    parser.add_argument('-r', '--render', action='store_false',
                        help='Turn off visualization (it is enabled by default)')
    parser.add_argument('-re', '--recorder', action='store_true',
                        help='Store the visualization as a movie')

    args = vars(parser.parse_args())
    os.environ['CUDA_VISIBLE_DEVICES'] = args['gpu_num']

    env = Environment(args)

    state_size = env.env.state_size
    action_space = env.env.action_space()

    # One independent agent (with its own weights file) per predator.
    all_agents = []
    for b_idx in range(args['agents_number']):
        brain_file = get_name_brain(args, b_idx)
        all_agents.append(Agent(state_size, action_space, b_idx, brain_file, args))

    rewards_file = get_name_rewards(args)
    timesteps_file = get_name_timesteps(args)

    env.run(all_agents, rewards_file, timesteps_file)
class Memory(object):
    """Prioritized experience replay buffer stored in a sum tree.

    ``ST`` (the sum tree class imported at the top of the file) keeps one
    priority per stored sample and supports sampling proportionally to it.
    """

    # Constant added to every error before scaling, so that a zero error
    # still yields a non-zero priority.
    e = 0.05

    def __init__(self, capacity, pr_scale):
        """
        :param capacity: maximum number of samples kept in the tree.
        :param pr_scale: exponent applied to ``error + e`` to get a priority.
        """
        self.capacity = capacity
        self.memory = ST(self.capacity)
        self.pr_scale = pr_scale
        self.max_pr = 0

    def get_priority(self, error):
        """Convert an error into a priority: ``(error + e) ** pr_scale``."""
        return (error + self.e) ** self.pr_scale

    def remember(self, sample, error):
        """Store ``sample`` with the priority derived from ``error``.

        NOTE: ``max_pr`` is never updated in this class, so the stored
        priority is ``max(max_pr, p)`` with ``max_pr`` left at its initial
        value.
        """
        p = self.get_priority(error)
        self_max = max(self.max_pr, p)
        self.memory.add(self_max, sample)

    def sample(self, n):
        """Draw ``n`` samples, one from each of ``n`` equal priority segments.

        :return: ``[sample_batch, sample_batch_indices, sample_batch_priorities]``
            where ``sample_batch`` holds ``(tree_index, data)`` pairs.  All
            three lists are empty when ``n`` is not positive.
        """
        sample_batch = []
        sample_batch_indices = []
        sample_batch_priorities = []
        if n <= 0:
            return [sample_batch, sample_batch_indices, sample_batch_priorities]

        # Width of each of the n segments the total priority mass is split into.
        segment_width = self.memory.total() / n

        for i in range(n):
            left = segment_width * i
            right = segment_width * (i + 1)

            s = random.uniform(left, right)
            idx, pr, data = self.memory.get(s)
            sample_batch.append((idx, data))
            sample_batch_indices.append(idx)
            sample_batch_priorities.append(pr)

        return [sample_batch, sample_batch_indices, sample_batch_priorities]

    def update(self, batch_indices, errors):
        """Recompute the priority of each tree index from its new error."""
        for tree_idx, error in zip(batch_indices, errors):
            self.memory.update(tree_idx, self.get_priority(error))
class SumTree(object):
    """Binary sum tree stored in a flat array.

    The last ``capacity`` entries of ``tree`` are leaves holding priorities;
    every other entry holds the sum of its two children, so ``tree[0]`` is the
    total priority.  ``data[i]`` is the payload of leaf ``i + capacity - 1``.
    New entries overwrite the oldest ones once ``capacity`` is reached.
    """

    def __init__(self, capacity):
        """
        :param capacity: number of leaves, i.e. of payloads that can be stored.
        """
        self.write = 0  # data slot that the next add() will fill
        self.capacity = capacity
        self.tree = numpy.zeros(2 * capacity - 1)
        self.data = numpy.zeros(capacity, dtype=object)

    def _propagate(self, idx, change):
        """Add ``change`` to every ancestor of node ``idx`` up to the root.

        Iterative on purpose: when ``idx`` is already the root (a tree with a
        single leaf) there is nothing to update, whereas computing the parent
        of index 0 would give -1 and never terminate.
        """
        while idx != 0:
            idx = (idx - 1) // 2
            self.tree[idx] += change

    def _retrieve(self, idx, s):
        """Descend from node ``idx`` to the leaf covering cumulative value ``s``."""
        size = len(self.tree)
        while True:
            left = 2 * idx + 1
            if left >= size:
                # No children: idx is a leaf.
                return idx
            if s <= self.tree[left]:
                idx = left
            else:
                # Skip the mass of the left subtree and go right.
                s = s - self.tree[left]
                idx = left + 1

    def total(self):
        """Return the sum of all priorities (the root value)."""
        return self.tree[0]

    def add(self, p, data):
        """Store ``data`` with priority ``p`` in the next slot (wrapping around)."""
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        """Set the priority of tree node ``idx`` to ``p`` and fix the sums above it."""
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        """Return ``(tree_index, priority, data)`` of the leaf covering ``s``."""
        idx = self._retrieve(0, s)
        data_idx = idx - self.capacity + 1
        return idx, self.tree[idx], self.data[data_idx]