Repository: vwxyzjn/invalid-action-masking Branch: master Commit: 6daedd29e4b4 Files: 40 Total size: 361.1 KB Directory structure: gitextract_uyz0i984/ ├── .gitignore ├── .python-version ├── LICENSE ├── README.MD ├── build.sh ├── gym_vec_api/ │ ├── ppo_multidiscrete.py │ └── ppo_multidiscrete_mask.py ├── invalid_action_masking/ │ ├── ppo_10x10.py │ ├── ppo_16x16.py │ ├── ppo_24x24.py │ ├── ppo_4x4.py │ ├── ppo_no_adj_10x10.py │ ├── ppo_no_adj_16x16.py │ ├── ppo_no_adj_24x24.py │ ├── ppo_no_adj_4x4.py │ ├── ppo_no_mask_10x10.py │ ├── ppo_no_mask_16x16.py │ ├── ppo_no_mask_24x24.py │ └── ppo_no_mask_4x4.py ├── plots/ │ ├── analysis.py │ ├── approx_kl.py │ ├── charts_episode_reward/ │ │ ├── all_df_cache.pkl │ │ ├── data/ │ │ │ ├── MicrortsMining10x10F9-v0.pkl │ │ │ ├── MicrortsMining16x16F9-v0.pkl │ │ │ ├── MicrortsMining24x24F9-v0.pkl │ │ │ └── MicrortsMining4x4F9-v0.pkl │ │ ├── envs_cache.pkl │ │ └── exp_names_cache.pkl │ ├── episode_reward.py │ └── losses_approx_kl/ │ ├── all_df_cache.pkl │ ├── data/ │ │ ├── MicrortsMining10x10F9-v0.pkl │ │ ├── MicrortsMining16x16F9-v0.pkl │ │ ├── MicrortsMining24x24F9-v0.pkl │ │ └── MicrortsMining4x4F9-v0.pkl │ ├── envs_cache.pkl │ └── exp_names_cache.pkl ├── ppo.py ├── pyproject.toml ├── requirements.txt └── test.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ **.tfevents.** # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: .python-version ================================================ 3.9.5/envs/invalid-action-masking ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 neurips2020submission Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject 
to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.MD ================================================ # A Closer Look at Invalid Action Masking in Policy Gradient Algorithms This repo contains the source code to reproduce the results in the paper [*A Closer Look at Invalid Action Masking in Policy Gradient Algorithms*](https://arxiv.org/abs/2006.14171). ## Get started If you have pyenv or poetry: ```bash poetry install rm -rf ~/microrts && mkdir ~/microrts && \ wget -O ~/microrts/microrts.zip http://microrts.s3.amazonaws.com/microrts/artifacts/202004222224.microrts.zip && \ unzip ~/microrts/microrts.zip -d ~/microrts/ && \ rm ~/microrts/microrts.zip ``` Otherwise, you can install the dependencies via `pip install -r requirements.txt`. 
## 10x10 Experiments ``` poetry run python invalid_action_masking/ppo_10x10.py poetry run python invalid_action_masking/ppo_no_adj_10x10.py poetry run python invalid_action_masking/ppo_no_mask_10x10.py poetry run python ppo.py # newer & recommended PPO implementation that matches implementation details in `openai/baselines` ``` ## Citation ```bibtex @inproceedings{huang2020closer, author = {Shengyi Huang and Santiago Onta{\~{n}}{\'{o}}n}, editor = {Roman Bart{\'{a}}k and Fazel Keshtkar and Michael Franklin}, title = {A Closer Look at Invalid Action Masking in Policy Gradient Algorithms}, booktitle = {Proceedings of the Thirty-Fifth International Florida Artificial Intelligence Research Society Conference, {FLAIRS} 2022, Hutchinson Island, Jensen Beach, Florida, USA, May 15-18, 2022}, year = {2022}, url = {https://doi.org/10.32473/flairs.v35i.130584}, doi = {10.32473/flairs.v35i.130584}, timestamp = {Thu, 09 Jun 2022 16:44:11 +0200}, biburl = {https://dblp.org/rec/conf/flairs/HuangO22.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } ``` ================================================ FILE: build.sh ================================================ docker build -t invalid_action_masking:latest -f sharedmemory.Dockerfile . 
================================================ FILE: gym_vec_api/ppo_multidiscrete.py ================================================ import argparse import os import random import time from distutils.util import strtobool import gym import gym_microrts # fmt: off import numpy as np import torch import torch.nn as nn import torch.optim as optim from torch.distributions.categorical import Categorical from torch.utils.tensorboard import SummaryWriter def parse_args(): # fmt: off parser = argparse.ArgumentParser() parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining10x10F9-v0", help='the id of the gym environment') parser.add_argument('--learning-rate', type=float, default=2.5e-4, help='the learning rate of the optimizer') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--total-timesteps', type=int, default=10000000, help='total timesteps of the experiments') parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='if toggled, cuda will be enabled by default') parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, help='if toggled, this experiment will be tracked with Weights and Biases') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, help='weather to capture videos of the agent performances (check out `videos` folder)') # 
Algorithm specific arguments parser.add_argument('--num-envs', type=int, default=4, help='the number of parallel game environments') parser.add_argument('--num-steps', type=int, default=128, help='the number of steps to run in each environment per policy rollout') parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='Use GAE for advantage computation') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.95, help='the lambda for the general advantage estimation') parser.add_argument('--num-minibatches', type=int, default=4, help='the number of mini-batches') parser.add_argument('--update-epochs', type=int, default=4, help="the K epochs to update the policy") parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help="Toggles advantages normalization") parser.add_argument('--clip-coef', type=float, default=0.1, help="the surrogate clipping coefficient") parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--vf-coef', type=float, default=0.5, help="coefficient of the value function") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--target-kl', type=float, default=None, help='the target KL divergence threshold') args = parser.parse_args() args.batch_size = int(args.num_envs * args.num_steps) args.minibatch_size = int(args.batch_size // 
args.num_minibatches) # fmt: on return args def make_env(gym_id, seed, idx, capture_video, run_name): def thunk(): env = gym.make(gym_id) env = gym.wrappers.RecordEpisodeStatistics(env) if capture_video: if idx == 0: env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") env.seed(seed) env.action_space.seed(seed) env.observation_space.seed(seed) return env return thunk def layer_init(layer, std=np.sqrt(2), bias_const=0.0): torch.nn.init.orthogonal_(layer.weight, std) torch.nn.init.constant_(layer.bias, bias_const) return layer class Transpose(nn.Module): def __init__(self, permutation): super().__init__() self.permutation = permutation def forward(self, x): return x.permute(self.permutation) class Agent(nn.Module): def __init__(self, envs): super(Agent, self).__init__() self.network = nn.Sequential( Transpose((0, 3, 1, 2)), layer_init(nn.Conv2d(27, 16, kernel_size=3, stride=2)), nn.ReLU(), layer_init(nn.Conv2d(16, 32, kernel_size=2)), nn.ReLU(), nn.Flatten(), layer_init(nn.Linear(32*3*3, 128)), nn.ReLU(), ) self.nvec = envs.single_action_space.nvec self.actor = layer_init(nn.Linear(128, self.nvec.sum()), std=0.01) self.critic = layer_init(nn.Linear(128, 1), std=1) def get_value(self, x): return self.critic(self.network(x)) def get_action_and_value(self, x, action=None): hidden = self.network(x) logits = self.actor(hidden) split_logits = torch.split(logits, self.nvec.tolist(), dim=1) multi_categoricals = [Categorical(logits=logits) for logits in split_logits] if action is None: action = torch.stack([categorical.sample() for categorical in multi_categoricals]) logprob = torch.stack([categorical.log_prob(a) for a, categorical in zip(action, multi_categoricals)]) entropy = torch.stack([categorical.entropy() for categorical in multi_categoricals]) return action.T, logprob.sum(0), entropy.sum(0), self.critic(hidden) if __name__ == "__main__": args = parse_args() run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" if args.track: import wandb 
wandb.init( project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=run_name, monitor_gym=True, save_code=True, ) writer = SummaryWriter(f"runs/{run_name}") writer.add_text( "hyperparameters", "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), ) # TRY NOT TO MODIFY: seeding random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") # env setup envs = gym.vector.SyncVectorEnv( [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)] ) assert isinstance(envs.single_action_space, gym.spaces.MultiDiscrete), "only MultiDiscrete action space is supported" agent = Agent(envs).to(device) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) dones = torch.zeros((args.num_steps, args.num_envs)).to(device) values = torch.zeros((args.num_steps, args.num_envs)).to(device) # TRY NOT TO MODIFY: start the game global_step = 0 start_time = time.time() next_obs = torch.Tensor(envs.reset()).to(device) next_done = torch.zeros(args.num_envs).to(device) num_updates = args.total_timesteps // args.batch_size for update in range(1, num_updates + 1): # Annealing the rate if instructed to do so. 
if args.anneal_lr: frac = 1.0 - (update - 1.0) / num_updates lrnow = frac * args.learning_rate optimizer.param_groups[0]["lr"] = lrnow for step in range(0, args.num_steps): global_step += 1 * args.num_envs obs[step] = next_obs dones[step] = next_done # ALGO LOGIC: action logic with torch.no_grad(): action, logprob, _, value = agent.get_action_and_value(next_obs) values[step] = value.flatten() actions[step] = action logprobs[step] = logprob # TRY NOT TO MODIFY: execute the game and log data. next_obs, reward, done, info = envs.step(action.cpu().numpy()) rewards[step] = torch.tensor(reward).to(device).view(-1) next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device) for item in info: if "episode" in item.keys(): print(f"global_step={global_step}, episodic_return={item['episode']['r']}") writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step) break # bootstrap value if not done with torch.no_grad(): next_value = agent.get_value(next_obs).reshape(1, -1) if args.gae: advantages = torch.zeros_like(rewards).to(device) lastgaelam = 0 for t in reversed(range(args.num_steps)): if t == args.num_steps - 1: nextnonterminal = 1.0 - next_done nextvalues = next_value else: nextnonterminal = 1.0 - dones[t + 1] nextvalues = values[t + 1] delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam returns = advantages + values else: returns = torch.zeros_like(rewards).to(device) for t in reversed(range(args.num_steps)): if t == args.num_steps - 1: nextnonterminal = 1.0 - next_done next_return = next_value else: nextnonterminal = 1.0 - dones[t + 1] next_return = returns[t + 1] returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return advantages = returns - values # flatten the batch b_obs = obs.reshape((-1,) + 
envs.single_observation_space.shape) b_logprobs = logprobs.reshape(-1) b_actions = actions.reshape((-1,) + envs.single_action_space.shape) b_advantages = advantages.reshape(-1) b_returns = returns.reshape(-1) b_values = values.reshape(-1) # Optimizaing the policy and value network b_inds = np.arange(args.batch_size) clipfracs = [] for epoch in range(args.update_epochs): np.random.shuffle(b_inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size mb_inds = b_inds[start:end] _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds].T) logratio = newlogprob - b_logprobs[mb_inds] ratio = logratio.exp() with torch.no_grad(): # calculate approx_kl http://joschu.net/blog/kl-approx.html # old_approx_kl = (-logratio).mean() approx_kl = ((ratio - 1) - logratio).mean() clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] mb_advantages = b_advantages[mb_inds] if args.norm_adv: mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) # Policy loss pg_loss1 = -mb_advantages * ratio pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) pg_loss = torch.max(pg_loss1, pg_loss2).mean() # Value loss newvalue = newvalue.view(-1) if args.clip_vloss: v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 v_clipped = b_values[mb_inds] + torch.clamp( newvalue - b_values[mb_inds], -args.clip_coef, args.clip_coef, ) v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() entropy_loss = entropy.mean() loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) optimizer.step() if args.target_kl is not None: if approx_kl > args.target_kl: break y_pred, 
y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() var_y = np.var(y_true) explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) writer.add_scalar("losses/explained_variance", explained_var, global_step) print("SPS:", int(global_step / (time.time() - start_time))) writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) envs.close() writer.close() ================================================ FILE: gym_vec_api/ppo_multidiscrete_mask.py ================================================ import argparse import os import random import time from distutils.util import strtobool import gym import gym_microrts # fmt: off import numpy as np import torch import torch.nn as nn import torch.optim as optim from torch.distributions.categorical import Categorical from torch.utils.tensorboard import SummaryWriter def parse_args(): # fmt: off parser = argparse.ArgumentParser() parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining10x10F9-v0", help='the id of the gym environment') parser.add_argument('--learning-rate', type=float, default=2.5e-4, help='the learning rate of the optimizer') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--total-timesteps', type=int, default=10000000, help='total timesteps of the experiments') 
parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='if toggled, cuda will be enabled by default') parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, help='if toggled, this experiment will be tracked with Weights and Biases') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, help='weather to capture videos of the agent performances (check out `videos` folder)') # Algorithm specific arguments parser.add_argument('--num-envs', type=int, default=4, help='the number of parallel game environments') parser.add_argument('--num-steps', type=int, default=128, help='the number of steps to run in each environment per policy rollout') parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='Use GAE for advantage computation') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.95, help='the lambda for the general advantage estimation') parser.add_argument('--num-minibatches', type=int, default=4, help='the number of mini-batches') parser.add_argument('--update-epochs', type=int, default=4, help="the K epochs to update the policy") parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, 
nargs='?', const=True, help="Toggles advantages normalization") parser.add_argument('--clip-coef', type=float, default=0.1, help="the surrogate clipping coefficient") parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--vf-coef', type=float, default=0.5, help="coefficient of the value function") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--target-kl', type=float, default=None, help='the target KL divergence threshold') args = parser.parse_args() args.batch_size = int(args.num_envs * args.num_steps) args.minibatch_size = int(args.batch_size // args.num_minibatches) # fmt: on return args def make_env(gym_id, seed, idx, capture_video, run_name): def thunk(): env = gym.make(gym_id) env = gym.wrappers.RecordEpisodeStatistics(env) if capture_video: if idx == 0: env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") env.seed(seed) env.action_space.seed(seed) env.observation_space.seed(seed) return env return thunk def layer_init(layer, std=np.sqrt(2), bias_const=0.0): torch.nn.init.orthogonal_(layer.weight, std) torch.nn.init.constant_(layer.bias, bias_const) return layer class Transpose(nn.Module): def __init__(self, permutation): super().__init__() self.permutation = permutation def forward(self, x): return x.permute(self.permutation) class CategoricalMasked(Categorical): def __init__(self, probs=None, logits=None, validate_args=None, masks=[]): self.masks = masks if len(self.masks) == 0: super(CategoricalMasked, self).__init__(probs, logits, validate_args) else: self.masks = masks.type(torch.BoolTensor).to(device) logits = torch.where(self.masks, logits, torch.tensor(-1e+8).to(device)) super(CategoricalMasked, 
self).__init__(probs, logits, validate_args) def entropy(self): if len(self.masks) == 0: return super(CategoricalMasked, self).entropy() p_log_p = self.logits * self.probs p_log_p = torch.where(self.masks, p_log_p, torch.tensor(0.).to(device)) return -p_log_p.sum(-1) class Agent(nn.Module): def __init__(self, envs): super(Agent, self).__init__() self.network = nn.Sequential( Transpose((0, 3, 1, 2)), layer_init(nn.Conv2d(27, 16, kernel_size=3, stride=2)), nn.ReLU(), layer_init(nn.Conv2d(16, 32, kernel_size=2)), nn.ReLU(), nn.Flatten(), layer_init(nn.Linear(32*3*3, 128)), nn.ReLU(), ) self.nvec = envs.single_action_space.nvec self.actor = layer_init(nn.Linear(128, self.nvec.sum()), std=0.01) self.critic = layer_init(nn.Linear(128, 1), std=1) def get_value(self, x): return self.critic(self.network(x)) def get_action_and_value(self, x, action_mask, action=None): hidden = self.network(x) logits = self.actor(hidden) split_logits = torch.split(logits, self.nvec.tolist(), dim=1) split_action_masks = torch.split(action_mask, self.nvec.tolist(), dim=1) multi_categoricals = [ CategoricalMasked(logits=logits, masks=iam) for (logits, iam) in zip(split_logits, split_action_masks) ] if action is None: action = torch.stack([categorical.sample() for categorical in multi_categoricals]) logprob = torch.stack([categorical.log_prob(a) for a, categorical in zip(action, multi_categoricals)]) entropy = torch.stack([categorical.entropy() for categorical in multi_categoricals]) return action.T, logprob.sum(0), entropy.sum(0), self.critic(hidden) if __name__ == "__main__": args = parse_args() run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" if args.track: import wandb wandb.init( project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=run_name, monitor_gym=True, save_code=True, ) writer = SummaryWriter(f"runs/{run_name}") writer.add_text( "hyperparameters", "|param|value|\n|-|-|\n%s" % 
("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), ) # TRY NOT TO MODIFY: seeding random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") # env setup envs = gym.vector.SyncVectorEnv( [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)] ) assert isinstance(envs.single_action_space, gym.spaces.MultiDiscrete), "only MultiDiscrete action space is supported" agent = Agent(envs).to(device) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) dones = torch.zeros((args.num_steps, args.num_envs)).to(device) values = torch.zeros((args.num_steps, args.num_envs)).to(device) action_masks = torch.zeros((args.num_steps, args.num_envs) + (envs.single_action_space.nvec.sum(),)).to(device) # TRY NOT TO MODIFY: start the game global_step = 0 start_time = time.time() next_obs = torch.Tensor(envs.reset()).to(device) next_done = torch.zeros(args.num_envs).to(device) num_updates = args.total_timesteps // args.batch_size for update in range(1, num_updates + 1): # Annealing the rate if instructed to do so. 
if args.anneal_lr: frac = 1.0 - (update - 1.0) / num_updates lrnow = frac * args.learning_rate optimizer.param_groups[0]["lr"] = lrnow for step in range(0, args.num_steps): global_step += 1 * args.num_envs obs[step] = next_obs dones[step] = next_done action_masks[step] = torch.Tensor( np.array([env.action_mask for env in envs.envs]) ) # ALGO LOGIC: action logic with torch.no_grad(): action, logprob, _, value = agent.get_action_and_value(next_obs, action_masks[step]) values[step] = value.flatten() actions[step] = action logprobs[step] = logprob # TRY NOT TO MODIFY: execute the game and log data. next_obs, reward, done, info = envs.step(action.cpu().numpy()) rewards[step] = torch.tensor(reward).to(device).view(-1) next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device) for item in info: if "episode" in item.keys(): print(f"global_step={global_step}, episodic_return={item['episode']['r']}") writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step) break # bootstrap value if not done with torch.no_grad(): next_value = agent.get_value(next_obs).reshape(1, -1) if args.gae: advantages = torch.zeros_like(rewards).to(device) lastgaelam = 0 for t in reversed(range(args.num_steps)): if t == args.num_steps - 1: nextnonterminal = 1.0 - next_done nextvalues = next_value else: nextnonterminal = 1.0 - dones[t + 1] nextvalues = values[t + 1] delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam returns = advantages + values else: returns = torch.zeros_like(rewards).to(device) for t in reversed(range(args.num_steps)): if t == args.num_steps - 1: nextnonterminal = 1.0 - next_done next_return = next_value else: nextnonterminal = 1.0 - dones[t + 1] next_return = returns[t + 1] returns[t] = rewards[t] + args.gamma * nextnonterminal * 
next_return advantages = returns - values # flatten the batch b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) b_logprobs = logprobs.reshape(-1) b_actions = actions.reshape((-1,) + envs.single_action_space.shape) b_advantages = advantages.reshape(-1) b_returns = returns.reshape(-1) b_values = values.reshape(-1) b_action_masks = action_masks.reshape((-1, action_masks.shape[-1])) # Optimizaing the policy and value network b_inds = np.arange(args.batch_size) clipfracs = [] for epoch in range(args.update_epochs): np.random.shuffle(b_inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size mb_inds = b_inds[start:end] _, newlogprob, entropy, newvalue = agent.get_action_and_value( b_obs[mb_inds], b_action_masks[mb_inds], b_actions.long()[mb_inds].T, ) logratio = newlogprob - b_logprobs[mb_inds] ratio = logratio.exp() with torch.no_grad(): # calculate approx_kl http://joschu.net/blog/kl-approx.html # old_approx_kl = (-logratio).mean() approx_kl = ((ratio - 1) - logratio).mean() clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] mb_advantages = b_advantages[mb_inds] if args.norm_adv: mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) # Policy loss pg_loss1 = -mb_advantages * ratio pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) pg_loss = torch.max(pg_loss1, pg_loss2).mean() # Value loss newvalue = newvalue.view(-1) if args.clip_vloss: v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 v_clipped = b_values[mb_inds] + torch.clamp( newvalue - b_values[mb_inds], -args.clip_coef, args.clip_coef, ) v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() entropy_loss = entropy.mean() loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef 
optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) optimizer.step() if args.target_kl is not None: if approx_kl > args.target_kl: break y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() var_y = np.var(y_true) explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) writer.add_scalar("losses/explained_variance", explained_var, global_step) print("SPS:", int(global_step / (time.time() - start_time))) writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) envs.close() writer.close() ================================================ FILE: invalid_action_masking/ppo_10x10.py ================================================ import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions.categorical import Categorical from torch.utils.tensorboard import SummaryWriter from cleanrl.common import preprocess_obs_space, preprocess_ac_space import argparse import numpy as np import gym import gym_microrts from gym.wrappers import TimeLimit, Monitor from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space import time import random import os import pandas as pd # taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py class RunningMeanStd(object): def __init__(self, epsilon=1e-4, shape=()): self.mean = np.zeros(shape, 'float64') self.var = np.ones(shape, 
class NormalizedEnv(gym.core.Wrapper):
    """Gym wrapper that optionally standardizes observations and rescales rewards.

    Observations (when ``ob`` is truthy) are shifted and scaled by running
    statistics and clipped to ``[-clipob, clipob]``.  Rewards (when ``ret`` is
    truthy) are divided by the running standard deviation of the discounted
    return and clipped to ``[-cliprew, cliprew]``; they are not mean-centered.
    The untouched environment reward is always exposed as
    ``infos['real_reward']``.
    """

    def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        super().__init__(env)
        self.gamma = gamma
        self.epsilon = epsilon
        self.clipob = clipob
        self.cliprew = cliprew
        # Running discounted return of the current episode (0-d array).
        self.ret = np.zeros(())
        # Statistics trackers stay None when the matching feature is off.
        self.ob_rms = None
        if ob:
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = None
        if ret:
            self.ret_rms = RunningMeanStd(shape=(1,))

    def step(self, action):
        """Step the wrapped env and return ``(obs, reward, done, info)`` after filtering."""
        observation, reward, done, info = self.env.step(action)
        info['real_reward'] = reward
        self.ret = self.ret * self.gamma + reward
        observation = self._obfilt(observation)
        if self.ret_rms:
            self.ret_rms.update(np.array([self.ret]))
            scale = np.sqrt(self.ret_rms.var + self.epsilon)
            reward = np.clip(reward / scale, -self.cliprew, self.cliprew)
        # Zero the running return once the episode has ended.
        keep = 1 - float(done)
        self.ret = self.ret * keep
        return observation, reward, done, info

    def _obfilt(self, obs):
        """Update the observation statistics with ``obs`` and return it normalized."""
        if not self.ob_rms:
            return obs
        self.ob_rms.update(obs)
        centered = obs - self.ob_rms.mean
        spread = np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(centered / spread, -self.clipob, self.clipob)

    def reset(self):
        """Clear the running return, reset the wrapped env, and filter its first observation."""
        self.ret = np.zeros(())
        return self._obfilt(self.env.reset())
argparse.ArgumentParser(description='PPO agent') # Common arguments parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining10x10F9-v0", help='the id of the gym environment') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") 
parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=False, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', default=False, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=False, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', 
action="store_true", default=False, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect the default timelimit assert isinstance(env.action_space, MultiDiscrete), "only MultiDiscrete action space is supported" assert isinstance(env, TimeLimit) or int(args.episode_length), "the gym env does not have a built in TimeLimit, please specify by using --episode-length" if isinstance(env, TimeLimit): if int(args.episode_length): env._max_episode_steps = int(args.episode_length) args.episode_length = env._max_episode_steps else: env = TimeLimit(env, int(args.episode_length)) env = NormalizedEnv(env.env, ob=args.norm_obs, ret=args.norm_returns, clipob=args.obs_clip, cliprew=args.rew_clip, gamma=args.gamma) env = TimeLimit(env, int(args.episode_length)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) 
class CategoricalMasked(Categorical):
    """Categorical distribution in which masked-out actions get ~zero probability.

    Invalid actions have their logit replaced by ``-1e8`` before the parent
    distribution is built, so they are (numerically) never sampled and
    contribute nothing to the entropy.

    Args:
        probs: forwarded unchanged to ``Categorical``.
        logits: unnormalized log-probabilities; must be given when ``masks``
            is used, because masking is applied to the logits.
        validate_args: forwarded unchanged to ``Categorical``.
        masks: optional tensor (or array-like) with the same layout as
            ``logits``; non-zero / ``True`` entries mark valid actions.
            ``None`` or an empty sequence disables masking.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        if masks is None or len(masks) == 0:
            # A fresh empty list per instance keeps the "no mask" marker that
            # ``entropy`` checks, without sharing one mutable default object.
            self.masks = []
            super().__init__(probs, logits, validate_args)
            return
        # Follow the logits' device instead of a script-level ``device``
        # global, so the class works wherever the logits live.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        fill = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, fill)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy computed over the valid (unmasked) actions only."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries are dropped explicitly rather than relying on
        # ``probs`` underflowing to exactly zero.
        zero = torch.zeros((), dtype=p_log_p.dtype, device=p_log_p.device)
        p_log_p = torch.where(self.masks, p_log_p, zero)
        return -p_log_p.sum(-1)
def discount_cumsum(x, dones, gamma):
    """
    Compute discounted cumulative sums of ``x`` that reset at episode ends.

    Walking backwards, ``out[t] = x[t] + gamma * out[t+1] * (1 - dones[t])``,
    so ``dones[t] == 1`` stops anything after step ``t`` from flowing into it.

    input:
        vector x,  vector dones,
        [x0,       [0,
         x1,        0,
         x2,        1,
         x3,        0,
         x4]        0]
    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2,
         x3 + discount * x4,
         x4]

    Args:
        x: 1-D array-like of per-step values.
        dones: indexable of 0/1 flags; only indices ``0 .. len(x) - 2`` are
            read, so it may be one element shorter than ``x``.
        gamma: discount factor applied per step.

    Returns:
        Array shaped like ``x`` holding the discounted sums.  An empty ``x``
        yields an empty array.
    """
    x = np.asarray(x)
    # Integer (or bool) input would make the accumulator integer-typed and
    # silently truncate every discounted term, so promote it to float first.
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(np.float64)
    out = np.zeros_like(x)
    if x.shape[0] == 0:
        return out
    out[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])
    return out
logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device) rewards = np.zeros((args.batch_size,)) raw_rewards = np.zeros((len(env.rfs),args.batch_size,)) real_rewards = [] invalid_action_stats = [] dones = np.zeros((args.batch_size,)) values = torch.zeros((args.batch_size,)).to(device) # TRY NOT TO MODIFY: prepare the execution of the game. for step in range(args.batch_size): env.render() obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) evaluate_rewards += [np.sum(real_rewards)] real_rewards = [] evaluate_invalid_action_stats += [pd.DataFrame(invalid_action_stats).sum(0)] invalid_action_stats = [] next_obs = np.array(env.reset()) return np.average(evaluate_rewards), pd.DataFrame(evaluate_invalid_action_stats).mean(0) # TRY NOT TO MODIFY: start the game global_step = 0 while global_step < args.total_timesteps: if args.capture_video: env.stats_recorder.done=True next_obs = np.array(env.reset()) # ALGO Logic: Storage for epoch data obs = np.empty((args.batch_size,) + env.observation_space.shape) actions = np.empty((args.batch_size,) + env.action_space.shape) logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device) rewards = np.zeros((args.batch_size,)) raw_rewards = np.zeros((len(env.rfs),args.batch_size,)) real_rewards = [] invalid_action_stats = [] dones = 
np.zeros((args.batch_size,)) values = torch.zeros((args.batch_size,)).to(device) invalid_action_masks = torch.zeros((args.batch_size, env.action_space.nvec.sum())) # TRY NOT TO MODIFY: prepare the execution of the game. for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here invalid_action_mask = torch.ones(env.action_space.nvec.sum()) invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask) invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask) invalid_action_masks[step] = invalid_action_mask with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") for i in range(len(env.rfs)): writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step) real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, invalid_action_masks[minibatch_ind]) ratio = (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() 
policy_loss.backward() nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm) pg_optimizer.step() approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean() # Optimizing value network new_values = vf.forward(obs[minibatch_ind]).view(-1) # Value loss clipping if args.clip_vloss: v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2) v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef) v_loss_clipped = (v_clipped - returns[minibatch_ind])**2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2)) v_optimizer.zero_grad() v_loss.backward() nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm) v_optimizer.step() if args.kle_stop: if approx_kl > args.target_kl: break if args.kle_rollback: if (logprobs[:,minibatch_ind] - pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl: pg.load_state_dict(target_pg.state_dict()) break # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step) writer.add_scalar("losses/entropy", np.mean(entropys), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) if args.kle_stop or args.kle_rollback: writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step) # evaluate no mask average_reward, average_invalid_action_stats = evaluate_with_no_mask() writer.add_scalar("evals/charts/episode_reward", average_reward, global_step) print(f"global_step={global_step}, 
class RunningMeanStd:
    """Running mean and variance tracker fed one sample at a time.

    ``count`` starts at ``epsilon`` rather than zero, and ``mean`` / ``var``
    start at zeros / ones of the requested ``shape``.
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.count = epsilon
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)

    def update(self, x):
        """Fold a single sample ``x`` into the running statistics."""
        # Wrapping ``x`` in a list makes it a batch of exactly one element,
        # so the batch mean is ``x`` itself and the batch variance is zero.
        single = [x]
        self.update_from_moments(np.mean(single, axis=0), np.var(single, axis=0), 1)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
        self.mean, self.var, self.count = merged


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine the moments of two sample sets and return ``(mean, var, count)``.

    ``mean`` / ``var`` / ``count`` describe the samples seen so far and the
    ``batch_*`` arguments describe the new ones; the result describes their
    union.
    """
    total = count + batch_count
    shift = batch_mean - mean
    merged_mean = mean + shift * batch_count / total
    # Sum of squared deviations of both sets plus a between-set correction.
    sq_dev_old = var * count
    sq_dev_new = batch_var * batch_count
    sq_dev = sq_dev_old + sq_dev_new + np.square(shift) * count * batch_count / total
    return merged_mean, sq_dev / total, total
ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): super(NormalizedEnv, self).__init__(env) self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None self.ret_rms = RunningMeanStd(shape=(1,)) if ret else None self.clipob = clipob self.cliprew = cliprew self.ret = np.zeros(()) self.gamma = gamma self.epsilon = epsilon def step(self, action): obs, rews, news, infos = self.env.step(action) infos['real_reward'] = rews # print("before", self.ret) self.ret = self.ret * self.gamma + rews # print("after", self.ret) obs = self._obfilt(obs) if self.ret_rms: self.ret_rms.update(np.array([self.ret].copy())) rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) self.ret = self.ret * (1-float(news)) return obs, rews, news, infos def _obfilt(self, obs): if self.ob_rms: self.ob_rms.update(obs) obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs def reset(self): self.ret = np.zeros(()) obs = self.env.reset() return self._obfilt(obs) if __name__ == "__main__": parser = argparse.ArgumentParser(description='PPO agent') # Common arguments parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining16x16F9-v0", help='the id of the gym environment') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, 
help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', 
type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, 
class CategoricalMasked(Categorical):
    """Categorical distribution in which masked-out actions get ~zero probability.

    Invalid actions have their logit replaced by ``-1e8`` before the parent
    distribution is built, so they are (numerically) never sampled and
    contribute nothing to the entropy.

    Args:
        probs: forwarded unchanged to ``Categorical``.
        logits: unnormalized log-probabilities; must be given when ``masks``
            is used, because masking is applied to the logits.
        validate_args: forwarded unchanged to ``Categorical``.
        masks: optional tensor (or array-like) with the same layout as
            ``logits``; non-zero / ``True`` entries mark valid actions.
            ``None`` or an empty sequence disables masking.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        if masks is None or len(masks) == 0:
            # A fresh empty list per instance keeps the "no mask" marker that
            # ``entropy`` checks, without sharing one mutable default object.
            self.masks = []
            super().__init__(probs, logits, validate_args)
            return
        # Follow the logits' device instead of a script-level ``device``
        # global, so the class works wherever the logits live.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        fill = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, fill)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy computed over the valid (unmasked) actions only."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries are dropped explicitly rather than relying on
        # ``probs`` underflowing to exactly zero.
        zero = torch.zeros((), dtype=p_log_p.dtype, device=p_log_p.device)
        p_log_p = torch.where(self.masks, p_log_p, zero)
        return -p_log_p.sum(-1)
def discount_cumsum(x, dones, gamma):
    """Compute discounted cumulative sums that restart at episode boundaries.

    Walks ``x`` backwards with ``out[t] = x[t] + gamma * out[t+1]`` and drops
    the carried-over term whenever ``dones[t]`` is set.

    Example with ``dones = [0, 0, 1, 0, 0]``::

        x      = [x0, x1, x2, x3, x4]
        output = [x0 + g*x1 + g^2*x2,  x1 + g*x2,  x2,  x3 + g*x4,  x4]

    Args:
        x: sequence of per-step values (indexed along its first axis).
        dones: per-step termination flags (1 = terminal).  Only indices
            ``0 .. len(x) - 2`` are read, so it may be one element shorter
            than ``x`` (as when a bootstrap value has been appended to ``x``).
        gamma: discount factor applied per step.

    Returns:
        ``numpy.ndarray`` with the shape of ``x``.  Floating inputs keep their
        dtype; other inputs are accumulated in ``float64`` so the discounted
        terms are not truncated.
    """
    x = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    # Named ``returns`` so the local no longer shadows the function itself.
    returns = np.zeros(x.shape, dtype=out_dtype)
    if x.shape[0] == 0:
        # Nothing to accumulate; avoids the IndexError on ``x[-1]``.
        return returns
    returns[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        returns[t] = x[t] + gamma * returns[t + 1] * (1 - dones[t])
    return returns
next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) evaluate_rewards += [np.sum(real_rewards)] real_rewards = [] evaluate_invalid_action_stats += [pd.DataFrame(invalid_action_stats).sum(0)] invalid_action_stats = [] next_obs = np.array(env.reset()) return np.average(evaluate_rewards), pd.DataFrame(evaluate_invalid_action_stats).mean(0) # TRY NOT TO MODIFY: start the game global_step = 0 while global_step < args.total_timesteps: if args.capture_video: env.stats_recorder.done=True next_obs = np.array(env.reset()) # ALGO Logic: Storage for epoch data obs = np.empty((args.batch_size,) + env.observation_space.shape) actions = np.empty((args.batch_size,) + env.action_space.shape) logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device) rewards = np.zeros((args.batch_size,)) raw_rewards = np.zeros((len(env.rfs),args.batch_size,)) real_rewards = [] invalid_action_stats = [] dones = np.zeros((args.batch_size,)) values = torch.zeros((args.batch_size,)).to(device) invalid_action_masks = torch.zeros((args.batch_size, env.action_space.nvec.sum())) # TRY NOT TO MODIFY: prepare the execution of the game. 
for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here invalid_action_mask = torch.ones(env.action_space.nvec.sum()) invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask) invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask) invalid_action_masks[step] = invalid_action_mask with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") for i in range(len(env.rfs)): writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step) real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
# ---- PPO update phase: runs once per collected batch ---------------------
# NOTE(review): the nesting below is reconstructed from whitespace-collapsed
# source; the KL checks are assumed to sit at epoch level (after the
# minibatch loop) -- confirm against the original file.

# Bootstrap from the critic when the batch ended in the middle of an episode;
# if the last step was terminal there is nothing left to bootstrap.
last_value = 0
if not dones[step]:
    last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
bootstrapped_rewards = np.append(rewards, last_value)

# calculate the returns and advantages
if args.gae:
    bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
    # One-step TD residuals; (1-dones) removes the next-state value on terminal steps.
    deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
    advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
    advantages = torch.Tensor(advantages).to(device)
    returns = advantages + values
else:
    # Plain discounted returns; the trailing bootstrap slot is dropped again.
    returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
    advantages = returns - values.detach().cpu().numpy()
    advantages = torch.Tensor(advantages).to(device)
    returns = torch.Tensor(returns).to(device)

# Advantage normalization
if args.norm_adv:
    EPS = 1e-10
    advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

# Optimizing policy network
entropys = []
target_pg = Policy().to(device)
inds = np.arange(args.batch_size,)
for i_epoch_pi in range(args.update_epochs):
    np.random.shuffle(inds)
    for start in range(0, args.batch_size, args.minibatch_size):
        end = start + args.minibatch_size
        minibatch_ind = inds[start:end]
        # Snapshot of the policy before this minibatch step; restored by the
        # rollback branch below.
        target_pg.load_state_dict(pg.state_dict())
        # ``np.int`` was removed in NumPy 1.24; it was an alias of the builtin
        # ``int``, so ``astype(int)`` yields the same dtype.
        _, newlogproba, _, _ = pg.get_action(
            obs[minibatch_ind],
            torch.LongTensor(actions[minibatch_ind].astype(int)).to(device).T,
            invalid_action_masks[minibatch_ind])
        ratio = (newlogproba - logprobs[:,minibatch_ind]).exp()

        # Policy loss as in OpenAI SpinUp
        clip_adv = torch.where(advantages[minibatch_ind] > 0,
                               (1.+args.clip_coef) * advantages[minibatch_ind],
                               (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

        # Entropy computation with resampled actions
        entropy = -(newlogproba.exp() * newlogproba).mean()
        entropys.append(entropy.item())

        # NOTE(review): the entropy term is *added* to a loss that is being
        # minimised; PPO implementations usually subtract an entropy bonus.
        # Left unchanged -- confirm this is intended.
        policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
        policy_loss = policy_loss.mean()

        pg_optimizer.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
        pg_optimizer.step()

        # Mean log-prob difference between rollout time and the pre-step policy.
        approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean()

        # Optimizing value network
        new_values = vf.forward(obs[minibatch_ind]).view(-1)

        # Value loss clipping
        if args.clip_vloss:
            v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
            v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
            v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()
        else:
            v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2))

        v_optimizer.zero_grad()
        v_loss.backward()
        nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
        v_optimizer.step()

    # Early stopping: abandon the remaining epochs once the KL estimate of the
    # last minibatch exceeds the target.
    if args.kle_stop:
        if approx_kl > args.target_kl:
            break
    # Rollback: re-evaluate the last minibatch with the updated policy and, if
    # the KL estimate is too large, restore the snapshot taken before that step.
    if args.kle_rollback:
        if (logprobs[:,minibatch_ind] - pg.get_action(
                obs[minibatch_ind],
                torch.LongTensor(actions[minibatch_ind].astype(int)).to(device).T,
                invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
            pg.load_state_dict(target_pg.state_dict())
            break
# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd(object):
    """Running mean/variance tracker that is fed one sample at a time.

    ``count`` starts at ``epsilon`` (not zero) so the first merge never divides
    by zero; ``mean`` starts at 0 and ``var`` at 1, both ``float64`` arrays of
    the requested ``shape``.
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.count = epsilon
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')

    def update(self, x):
        """Fold the single sample ``x`` into the statistics (batch of size one)."""
        sample = [x]
        # A one-element batch: its mean is ``x`` itself and its variance is zero.
        self.update_from_moments(np.mean(sample, axis=0), np.var(sample, axis=0), 1)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
        self.mean, self.var, self.count = merged


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine two (mean, variance, count) summaries into one.

    Returns the tuple ``(new_mean, new_var, new_count)`` describing the union
    of the running statistics and the batch statistics.
    """
    total = count + batch_count
    shift = batch_mean - mean
    # Contribution of the gap between the two means to the pooled sum of squares.
    between_groups = np.square(shift) * count * batch_count / total
    sum_sq = var * count + batch_var * batch_count + between_groups
    return mean + shift * batch_count / total, sum_sq / total, total
def str2bool(text):
    """Parse a command-line truth value (``true/false``, ``yes/no``, ``1/0``, ``on/off``).

    Raises:
        argparse.ArgumentTypeError: if ``text`` is not a recognised boolean.
    """
    lowered = text.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {text!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PPO agent')
    # Common arguments
    # splitext removes only the extension; str.rstrip(".py") strips any trailing
    # '.', 'p' or 'y' characters and can eat part of the file name.
    parser.add_argument('--exp-name', type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="MicrortsMining24x24F9-v0",
                        help='the id of the gym environment')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--episode-length', type=int, default=0,
                        help='the maximum length of each episode')
    parser.add_argument('--total-timesteps', type=int, default=100000,
                        help='total timesteps of the experiments')
    parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True,
                        help='if toggled, `torch.backends.cudnn.deterministic=False`')
    parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True,
                        help='if toggled, cuda will not be enabled by default')
    parser.add_argument('--prod-mode', action='store_true', default=False,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', action='store_true', default=False,
                        help='whether to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    parser.add_argument('--batch-size', type=int, default=2048,
                        help='the batch size of ppo')
    parser.add_argument('--minibatch-size', type=int, default=256,
                        help='the mini batch size of ppo')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='the discount factor gamma')
    parser.add_argument('--gae-lambda', type=float, default=0.97,
                        help='the lambda for the general advantage estimation')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help="coefficient of the entropy")
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='the maximum norm for the gradient clipping')
    parser.add_argument('--clip-coef', type=float, default=0.2,
                        help="the surrogate clipping coefficient")
    parser.add_argument('--update-epochs', type=int, default=10,
                        help="the K epochs to update the policy")
    parser.add_argument('--kle-stop', action='store_true', default=False,
                        help='If toggled, the policy updates will be early stopped w.r.t target-kl')
    parser.add_argument('--kle-rollback', action='store_true', default=False,
                        help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
    parser.add_argument('--target-kl', type=float, default=0.015,
                        help='the target-kl threshold used by --kle-stop and --kle-rollback')
    # The flags below default to True.  With a plain ``store_true`` they could
    # never be switched off; they now take an optional value (``--gae false``)
    # while the bare flag (``--gae``) keeps meaning True.
    parser.add_argument('--gae', type=str2bool, nargs='?', const=True, default=True,
                        help='Use GAE for advantage computation')
    parser.add_argument('--policy-lr', type=float, default=3e-4,
                        help="the learning rate of the policy optimizer")
    parser.add_argument('--value-lr', type=float, default=3e-4,
                        help="the learning rate of the critic optimizer")
    parser.add_argument('--norm-obs', type=str2bool, nargs='?', const=True, default=True,
                        help="Toggles observation normalization")
    parser.add_argument('--norm-returns', action='store_true', default=False,
                        help="Toggles returns normalization")
    parser.add_argument('--norm-adv', type=str2bool, nargs='?', const=True, default=True,
                        help="Toggles advantages normalization")
    # The two help texts were swapped in the original.
    parser.add_argument('--obs-clip', type=float, default=10.0,
                        help="Value for observation clipping, as per the paper")
    parser.add_argument('--rew-clip', type=float, default=10.0,
                        help="Value for reward clipping, as per the paper")
    parser.add_argument('--anneal-lr', type=str2bool, nargs='?', const=True, default=True,
                        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'],
                        help='Selects the scheme to be used for weights initialization')
    parser.add_argument('--clip-vloss', type=str2bool, nargs='?', const=True, default=True,
                        help='Toggles whether or not to use a clipped loss for the value function, as per the paper.')
    parser.add_argument('--pol-layer-norm', action='store_true', default=False,
                        help='Enables layer normalization in the policy network')

    args = parser.parse_args()
    # A falsy seed (i.e. ``--seed 0``) is treated as "not set" and replaced by
    # the current time.
    if not args.seed:
        args.seed = int(time.time())

    # Number of boolean features enabled for this run (True counts as 1).
    args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm])
class CategoricalMasked(Categorical):
    """Categorical distribution in which invalid actions are masked out.

    When ``masks`` is given, the logit of every entry whose mask is falsy is
    replaced by ``-1e8`` before the distribution is built, so those entries
    receive (numerically) zero probability.  Without masks the class behaves
    exactly like ``Categorical``.

    Args:
        probs: forwarded to ``Categorical``; cannot be combined with ``masks``.
        logits: unnormalised log-probabilities, required when ``masks`` is given.
        validate_args: forwarded to ``Categorical``.
        masks: optional tensor broadcastable to ``logits``; non-zero marks a
            valid action.  ``None`` or an empty sequence disables masking.

    Raises:
        ValueError: if ``masks`` is supplied without ``logits``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # ``None`` replaces the former mutable ``[]`` default; callers that
        # still pass an empty list/tensor get the same unmasked behaviour.
        if masks is None or len(masks) == 0:
            self.masks = [] if masks is None else masks
            super(CategoricalMasked, self).__init__(probs, logits, validate_args)
            return
        if logits is None:
            raise ValueError("`masks` can only be combined with `logits`")
        # Follow the device/dtype of the logits instead of relying on a
        # module-level ``device`` global.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        masked_out = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, masked_out)
        super(CategoricalMasked, self).__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy computed over the valid (unmasked) entries only."""
        if len(self.masks) == 0:
            return super(CategoricalMasked, self).entropy()
        p_log_p = self.logits * self.probs
        # Masked entries contribute exactly zero instead of (-1e8 * ~0).
        p_log_p = torch.where(self.masks, p_log_p, p_log_p.new_zeros(()))
        return -p_log_p.sum(-1)
class Policy(nn.Module):
    """Convolutional actor producing one flat logit vector per observation.

    The logit vector has ``env.action_space.nvec.sum()`` entries and is split
    into one categorical distribution per action component.  The first linear
    layer expects 32*5*5 features, which is what the conv/pool stack yields
    for a 24x24 input (24 -> 22 -> 11 -> 10 -> 5).
    """

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=3, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=2, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        head = [
            nn.Linear(32*5*5, 128),
            nn.ReLU(),
            nn.Linear(128, env.action_space.nvec.sum()),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Return the flat logits for a batch of channels-last observations."""
        # Move the trailing (channel) axis to position 1 as expected by Conv2d.
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flat = hidden.reshape(hidden.size(0), -1)
        return self.fc(flat)

    def get_action(self, x, action=None, invalid_action_masks=None):
        """Sample (or score) a multi-component action.

        Args:
            x: batch of observations, passed to ``forward``.
            action: optional actions to evaluate; iterated along its first
                axis, one entry per action component.  Sampled when ``None``.
            invalid_action_masks: optional flat mask, split like the logits;
                when given, ``CategoricalMasked`` is used for every component.

        Returns:
            ``(action, logprob, [], multi_categoricals)`` where ``action`` and
            ``logprob`` are stacked per component and ``multi_categoricals``
            is the list of per-component distributions.
        """
        flat_logits = self.forward(x)
        component_sizes = env.action_space.nvec.tolist()
        per_component_logits = torch.split(flat_logits, component_sizes, dim=1)
        if invalid_action_masks is None:
            multi_categoricals = [Categorical(logits=chunk) for chunk in per_component_logits]
        else:
            per_component_masks = torch.split(invalid_action_masks, component_sizes, dim=1)
            multi_categoricals = [
                CategoricalMasked(logits=chunk, masks=mask)
                for chunk, mask in zip(per_component_logits, per_component_masks)
            ]
        if action is None:
            action = torch.stack([dist.sample() for dist in multi_categoricals])
        logprob = torch.stack([dist.log_prob(a) for a, dist in zip(action, multi_categoricals)])
        return action, logprob, [], multi_categoricals


class Value(nn.Module):
    """Convolutional critic with the same trunk layout as ``Policy`` and a scalar head."""

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=3, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=2, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        self.fc = nn.Sequential(
            nn.Linear(32*5*5, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Return one value estimate per observation, shape ``(batch, 1)``."""
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flat = hidden.reshape(hidden.size(0), -1)
        return self.fc(flat)
def evaluate_with_no_mask():
    """Roll the current policy out for one batch WITHOUT invalid-action masking.

    Steps the shared ``env`` for ``args.batch_size`` steps, calling
    ``pg.get_action`` with no mask, and aggregates per-episode statistics for
    every episode that finishes inside the batch.

    Reads the enclosing-scope names ``env``, ``args``, ``pg``, ``vf``,
    ``writer`` and ``global_step``; resets and steps ``env`` as a side effect.

    Returns:
        Tuple ``(average_reward, average_invalid_action_stats)``: the average
        of the summed ``info['real_reward']`` over finished episodes, and the
        per-column mean of the per-episode summed ``info['invalid_action_stats']``.
        NOTE(review): both are computed over empty collections if no episode
        finishes within the batch -- confirm that cannot happen.
    """
    evaluate_rewards = []
    evaluate_invalid_action_stats = []
    if args.capture_video:
        env.stats_recorder.done=True
    next_obs = np.array(env.reset())

    # ALGO Logic: Storage for epoch data
    # NOTE(review): these buffers mirror the training loop; within this
    # function only ``dones`` is read back, the rest are written but unused.
    obs = np.empty((args.batch_size,) + env.observation_space.shape)

    actions = np.empty((args.batch_size,) + env.action_space.shape)
    logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device)

    rewards = np.zeros((args.batch_size,))
    raw_rewards = np.zeros((len(env.rfs),args.batch_size,))
    real_rewards = []
    invalid_action_stats = []

    dones = np.zeros((args.batch_size,))
    values = torch.zeros((args.batch_size,)).to(device)

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.batch_size):
        env.render()
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step+1])
            # No ``invalid_action_masks`` argument: the policy samples from
            # the unmasked distributions.
            action, logproba, _, probs = pg.get_action(obs[step:step+1])

        actions[step] = action[:,0].data.cpu().numpy()
        logprobs[:,[step]] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy())
        raw_rewards[:,step] = info["rewards"]
        real_rewards += [info['real_reward']]
        invalid_action_stats += [info['invalid_action_stats']]
        next_obs = np.array(next_obs)

        if dones[step]:
            # Episode finished: log its total reward and start a new episode.
            # NOTE(review): this writes to the same "charts/episode_reward"
            # tag the training loop uses -- confirm the overlap is intended.
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
            evaluate_rewards += [np.sum(real_rewards)]
            real_rewards = []
            evaluate_invalid_action_stats += [pd.DataFrame(invalid_action_stats).sum(0)]
            invalid_action_stats = []
            next_obs = np.array(env.reset())

    return np.average(evaluate_rewards), pd.DataFrame(evaluate_invalid_action_stats).mean(0)
for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here invalid_action_mask = torch.ones(env.action_space.nvec.sum()) invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask) invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask) invalid_action_masks[step] = invalid_action_mask with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") for i in range(len(env.rfs)): writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step) real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
# ---- PPO update phase: runs once per collected batch ---------------------
# NOTE(review): the nesting below is reconstructed from whitespace-collapsed
# source; the KL checks are assumed to sit at epoch level (after the
# minibatch loop) -- confirm against the original file.

# Bootstrap from the critic when the batch ended in the middle of an episode;
# if the last step was terminal there is nothing left to bootstrap.
last_value = 0
if not dones[step]:
    last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
bootstrapped_rewards = np.append(rewards, last_value)

# calculate the returns and advantages
if args.gae:
    bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
    # One-step TD residuals; (1-dones) removes the next-state value on terminal steps.
    deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
    advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
    advantages = torch.Tensor(advantages).to(device)
    returns = advantages + values
else:
    # Plain discounted returns; the trailing bootstrap slot is dropped again.
    returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
    advantages = returns - values.detach().cpu().numpy()
    advantages = torch.Tensor(advantages).to(device)
    returns = torch.Tensor(returns).to(device)

# Advantage normalization
if args.norm_adv:
    EPS = 1e-10
    advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

# Optimizing policy network
entropys = []
target_pg = Policy().to(device)
inds = np.arange(args.batch_size,)
for i_epoch_pi in range(args.update_epochs):
    np.random.shuffle(inds)
    for start in range(0, args.batch_size, args.minibatch_size):
        end = start + args.minibatch_size
        minibatch_ind = inds[start:end]
        # Snapshot of the policy before this minibatch step; restored by the
        # rollback branch below.
        target_pg.load_state_dict(pg.state_dict())
        # ``np.int`` was removed in NumPy 1.24; it was an alias of the builtin
        # ``int``, so ``astype(int)`` yields the same dtype.
        _, newlogproba, _, _ = pg.get_action(
            obs[minibatch_ind],
            torch.LongTensor(actions[minibatch_ind].astype(int)).to(device).T,
            invalid_action_masks[minibatch_ind])
        ratio = (newlogproba - logprobs[:,minibatch_ind]).exp()

        # Policy loss as in OpenAI SpinUp
        clip_adv = torch.where(advantages[minibatch_ind] > 0,
                               (1.+args.clip_coef) * advantages[minibatch_ind],
                               (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

        # Entropy computation with resampled actions
        entropy = -(newlogproba.exp() * newlogproba).mean()
        entropys.append(entropy.item())

        # NOTE(review): the entropy term is *added* to a loss that is being
        # minimised; PPO implementations usually subtract an entropy bonus.
        # Left unchanged -- confirm this is intended.
        policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
        policy_loss = policy_loss.mean()

        pg_optimizer.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
        pg_optimizer.step()

        # Mean log-prob difference between rollout time and the pre-step policy.
        approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean()

        # Optimizing value network
        new_values = vf.forward(obs[minibatch_ind]).view(-1)

        # Value loss clipping
        if args.clip_vloss:
            v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
            v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
            v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()
        else:
            v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2))

        v_optimizer.zero_grad()
        v_loss.backward()
        nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
        v_optimizer.step()

    # Early stopping: abandon the remaining epochs once the KL estimate of the
    # last minibatch exceeds the target.
    if args.kle_stop:
        if approx_kl > args.target_kl:
            break
    # Rollback: re-evaluate the last minibatch with the updated policy and, if
    # the KL estimate is too large, restore the snapshot taken before that step.
    if args.kle_rollback:
        if (logprobs[:,minibatch_ind] - pg.get_action(
                obs[minibatch_ind],
                torch.LongTensor(actions[minibatch_ind].astype(int)).to(device).T,
                invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
            pg.load_state_dict(target_pg.state_dict())
            break
eval_reward={average_reward}") for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"evals/stats/{key}", average_invalid_action_stats[idx], global_step) env.close() writer.close() ================================================ FILE: invalid_action_masking/ppo_4x4.py ================================================ import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions.categorical import Categorical from torch.utils.tensorboard import SummaryWriter from cleanrl.common import preprocess_obs_space, preprocess_ac_space import argparse import numpy as np import gym import gym_microrts from gym.wrappers import TimeLimit, Monitor from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space import time import random import os import pandas as pd # taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py class RunningMeanStd(object): def __init__(self, epsilon=1e-4, shape=()): self.mean = np.zeros(shape, 'float64') self.var = np.ones(shape, 'float64') self.count = epsilon def update(self, x): batch_mean = np.mean([x], axis=0) batch_var = np.var([x], axis=0) batch_count = 1 self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): self.mean, self.var, self.count = update_mean_var_count_from_moments( self.mean, self.var, self.count, batch_mean, batch_var, batch_count) def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): delta = batch_mean - mean tot_count = count + batch_count new_mean = mean + delta * batch_count / tot_count m_a = var * count m_b = batch_var * batch_count M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count new_var = M2 / tot_count new_count = tot_count return new_mean, new_var, new_count class NormalizedEnv(gym.core.Wrapper): def __init__(self, env, 
def default_exp_name(script_path):
    """Return the file name of ``script_path`` without its extension.

    ``str.rstrip(".py")`` strips any trailing '.', 'p' or 'y' characters
    rather than the ".py" suffix (``"ppo_copy.py"`` would become ``"ppo_co"``),
    so the extension is removed with ``os.path.splitext`` instead.
    """
    return os.path.splitext(os.path.basename(script_path))[0]


def build_arg_parser():
    """Build the command-line parser for the PPO experiment script.

    Returns:
        argparse.ArgumentParser: parser exposing the common experiment
        options and the PPO-specific hyperparameters.
    """
    parser = argparse.ArgumentParser(description='PPO agent')
    # Common arguments
    parser.add_argument('--exp-name', type=str, default=default_exp_name(__file__),
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="MicrortsMining4x4F9-v0",
                        help='the id of the gym environment')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--episode-length', type=int, default=0,
                        help='the maximum length of each episode')
    parser.add_argument('--total-timesteps', type=int, default=100000,
                        help='total timesteps of the experiments')
    parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True,
                        help='if toggled, `torch.backends.cudnn.deterministic=False`')
    parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True,
                        help='if toggled, cuda will not be enabled by default')
    parser.add_argument('--prod-mode', action='store_true', default=False,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', action='store_true', default=False,
                        help='whether to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    # NOTE: the `store_true` flags below whose default is already True
    # (--gae, --norm-obs, --norm-adv, --anneal-lr, --clip-vloss) cannot be
    # switched off from the command line; kept as-is for compatibility.
    parser.add_argument('--batch-size', type=int, default=2048,
                        help='the batch size of ppo')
    parser.add_argument('--minibatch-size', type=int, default=256,
                        help='the mini batch size of ppo')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='the discount factor gamma')
    parser.add_argument('--gae-lambda', type=float, default=0.97,
                        help='the lambda for the general advantage estimation')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help="coefficient of the entropy")
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='the maximum norm for the gradient clipping')
    parser.add_argument('--clip-coef', type=float, default=0.2,
                        help="the surrogate clipping coefficient")
    parser.add_argument('--update-epochs', type=int, default=10,
                        help="the K epochs to update the policy")
    parser.add_argument('--kle-stop', action='store_true', default=False,
                        help='If toggled, the policy updates will be early stopped w.r.t target-kl')
    parser.add_argument('--kle-rollback', action='store_true', default=False,
                        help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
    parser.add_argument('--target-kl', type=float, default=0.015,
                        help='the target-kl variable that is referred by --kle-stop and --kle-rollback')
    parser.add_argument('--gae', action='store_true', default=True,
                        help='Use GAE for advantage computation')
    parser.add_argument('--policy-lr', type=float, default=3e-4,
                        help="the learning rate of the policy optimizer")
    parser.add_argument('--value-lr', type=float, default=3e-4,
                        help="the learning rate of the critic optimizer")
    parser.add_argument('--norm-obs', action='store_true', default=True,
                        help="Toggles observation normalization")
    parser.add_argument('--norm-returns', action='store_true', default=False,
                        help="Toggles returns normalization")
    parser.add_argument('--norm-adv', action='store_true', default=True,
                        help="Toggles advantages normalization")
    # The help texts of the two clip options were swapped in the original.
    parser.add_argument('--obs-clip', type=float, default=10.0,
                        help="Value for observation clipping, as per the paper")
    parser.add_argument('--rew-clip', type=float, default=10.0,
                        help="Value for reward clipping, as per the paper")
    parser.add_argument('--anneal-lr', action='store_true', default=True,
                        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'],
                        help='Selects the scheme to be used for weights initialization')
    parser.add_argument('--clip-vloss', action="store_true", default=True,
                        help='Toggles whether or not to use a clipped loss for the value function, as per the paper.')
    parser.add_argument('--pol-layer-norm', action='store_true', default=False,
                        help='Enables layer normalization in the policy network')
    return parser


if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()
    # A seed of 0 (falsy) means "pick one from the clock".
    if not args.seed:
        args.seed = int(time.time())
    # Number of optional PPO features that are switched on for this run.
    args.features_turned_on = sum(
        [args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm])
class CategoricalMasked(Categorical):
    """Categorical distribution whose invalid choices are masked out.

    Args:
        probs: forwarded to ``Categorical`` (only usable without ``masks``).
        logits: unnormalized log-probabilities; required when ``masks`` is given.
        validate_args: forwarded to ``Categorical``.
        masks: optional tensor shaped like ``logits``; non-zero / True entries
            mark valid choices. ``None`` or an empty sequence disables masking,
            in which case the class behaves exactly like ``Categorical``.

    Raises:
        ValueError: if ``masks`` is given without ``logits``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # ``None`` replaces the former mutable ``[]`` default; an explicit empty
        # sequence is still accepted and still means "no masking".
        if masks is None or len(masks) == 0:
            self.masks = []
            super().__init__(probs, logits, validate_args)
            return
        if logits is None:
            raise ValueError("masks can only be combined with logits")
        # Follow the device/dtype of ``logits`` instead of relying on a
        # module-level ``device`` global.
        self.masks = masks.to(device=logits.device, dtype=torch.bool)
        # A very negative logit drives the probability of a masked choice to
        # (numerically) zero after the softmax.
        fill = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, fill)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy computed over the valid (unmasked) choices only."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries are zeroed explicitly so the huge negative logits
        # cannot leak into the sum.
        p_log_p = torch.where(self.masks, p_log_p, torch.zeros_like(p_log_p))
        return -p_log_p.sum(-1)
class Policy(nn.Module):
    """Actor network emitting one flat logit vector for a MultiDiscrete action.

    The output width is ``env.action_space.nvec.sum()``; ``get_action`` splits
    it into one categorical distribution per action component.
    """

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=2),
            nn.MaxPool2d(1),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        # The 16 * 3 * 3 input width presumably corresponds to the 4x4 map
        # (a 2x2 kernel leaves a 3x3 feature map).
        head = [
            nn.Linear(16 * 3 * 3, 128),
            nn.ReLU(),
            nn.Linear(128, env.action_space.nvec.sum()),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Map channels-last observations ``x`` to the flat action logits."""
        # Move the last (channel) axis to position 1 for ``Conv2d``.
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flattened = hidden.reshape(hidden.size(0), -1)
        return self.fc(flattened)

    def get_action(self, x, action=None, invalid_action_masks=None):
        """Sample (or score) a MultiDiscrete action.

        Returns ``(action, logprob, [], multi_categoricals)`` where ``action``
        and ``logprob`` are stacked per action component. When ``action`` is
        given it is scored instead of sampling a new one; when
        ``invalid_action_masks`` is given the distributions are masked.
        """
        component_sizes = env.action_space.nvec.tolist()
        logit_chunks = torch.split(self.forward(x), component_sizes, dim=1)

        if invalid_action_masks is None:
            multi_categoricals = [Categorical(logits=chunk) for chunk in logit_chunks]
        else:
            mask_chunks = torch.split(invalid_action_masks, component_sizes, dim=1)
            multi_categoricals = [
                CategoricalMasked(logits=chunk, masks=mask)
                for chunk, mask in zip(logit_chunks, mask_chunks)
            ]

        if action is None:
            action = torch.stack([dist.sample() for dist in multi_categoricals])
        logprob = torch.stack(
            [dist.log_prob(component) for component, dist in zip(action, multi_categoricals)])
        return action, logprob, [], multi_categoricals


class Value(nn.Module):
    """Critic network with the same convolutional trunk and a scalar head."""

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=2),
            nn.MaxPool2d(1),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        head = [
            nn.Linear(16 * 3 * 3, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Map channels-last observations ``x`` to one value estimate each."""
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flattened = hidden.reshape(hidden.size(0), -1)
        return self.fc(flattened)
def evaluate_with_no_mask():
    """Roll out the current policy for one batch WITHOUT invalid-action masks.

    Steps the shared ``env`` for ``args.batch_size`` steps, calling
    ``pg.get_action`` with no ``invalid_action_masks`` argument. Every finished
    episode is logged to ``writer`` under "charts/episode_reward" at the
    current ``global_step``.

    Returns:
        tuple: ``(np.average(evaluate_rewards), mean invalid-action stats)``,
        i.e. the average summed ``real_reward`` per finished episode and the
        per-column mean (a pandas object) of the per-episode summed
        ``info['invalid_action_stats']``.

    NOTE(review): if no episode finishes within the batch both lists stay
    empty, so the averages are taken over empty data -- confirm callers
    tolerate the resulting NaN/empty values.
    """
    evaluate_rewards = []
    evaluate_invalid_action_stats = []
    if args.capture_video:
        env.stats_recorder.done=True
    next_obs = np.array(env.reset())

    # ALGO Logic: Storage for epoch data
    # Only the reward / invalid-action lists feed the return value; the other
    # buffers are filled during the rollout but not returned.
    obs = np.empty((args.batch_size,) + env.observation_space.shape)
    actions = np.empty((args.batch_size,) + env.action_space.shape)
    logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device)
    rewards = np.zeros((args.batch_size,))
    raw_rewards = np.zeros((len(env.rfs),args.batch_size,))
    real_rewards = []
    invalid_action_stats = []
    dones = np.zeros((args.batch_size,))
    values = torch.zeros((args.batch_size,)).to(device)

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.batch_size):
        env.render()
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        # No mask is passed, so get_action builds plain Categorical
        # distributions and may sample invalid actions.
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step+1])
            action, logproba, _, probs = pg.get_action(obs[step:step+1])

        actions[step] = action[:,0].data.cpu().numpy()
        logprobs[:,[step]] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy())
        raw_rewards[:,step] = info["rewards"]
        real_rewards += [info['real_reward']]
        invalid_action_stats += [info['invalid_action_stats']]
        next_obs = np.array(next_obs)

        if dones[step]:
            # Episode finished: log its summed reward, then reset the
            # per-episode accumulators and the environment.
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
            evaluate_rewards += [np.sum(real_rewards)]
            real_rewards = []
            evaluate_invalid_action_stats += [pd.DataFrame(invalid_action_stats).sum(0)]
            invalid_action_stats = []
            next_obs = np.array(env.reset())

    return np.average(evaluate_rewards), pd.DataFrame(evaluate_invalid_action_stats).mean(0)
for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here invalid_action_mask = torch.ones(env.action_space.nvec.sum()) invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask) invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask) invalid_action_masks[step] = invalid_action_mask with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") for i in range(len(env.rfs)): writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step) real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, invalid_action_masks[minibatch_ind]) ratio = (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() 
policy_loss.backward() nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm) pg_optimizer.step() approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean() # Optimizing value network new_values = vf.forward(obs[minibatch_ind]).view(-1) # Value loss clipping if args.clip_vloss: v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2) v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef) v_loss_clipped = (v_clipped - returns[minibatch_ind])**2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2)) v_optimizer.zero_grad() v_loss.backward() nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm) v_optimizer.step() if args.kle_stop: if approx_kl > args.target_kl: break if args.kle_rollback: if (logprobs[:,minibatch_ind] - pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl: pg.load_state_dict(target_pg.state_dict()) break # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step) writer.add_scalar("losses/entropy", np.mean(entropys), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) if args.kle_stop or args.kle_rollback: writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step) # evaluate no mask average_reward, average_invalid_action_stats = evaluate_with_no_mask() writer.add_scalar("evals/charts/episode_reward", average_reward, global_step) print(f"global_step={global_step}, 
class NormalizedEnv(gym.core.Wrapper):
    """Gym wrapper that normalizes and clips observations and rewards.

    Observations are standardized with a running mean/variance; rewards are
    divided by the running standard deviation of the discounted return. The
    untouched reward is exposed as ``infos['real_reward']``.
    """

    def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        super().__init__(env)
        self.gamma = gamma
        self.epsilon = epsilon
        self.clipob = clipob
        self.cliprew = cliprew
        # Running discounted return of the current episode.
        self.ret = np.zeros(())
        self.ob_rms = None
        if ob:
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = None
        if ret:
            self.ret_rms = RunningMeanStd(shape=(1,))

    def step(self, action):
        """Step the wrapped env and return the normalized transition."""
        obs, rews, news, infos = self.env.step(action)
        infos['real_reward'] = rews
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(np.array([self.ret]))
            scale = np.sqrt(self.ret_rms.var + self.epsilon)
            rews = np.clip(rews / scale, -self.cliprew, self.cliprew)
        # Zero the running return once the episode has ended.
        self.ret = self.ret * (1 - float(news))
        return obs, rews, news, infos

    def _obfilt(self, obs):
        """Update the observation statistics with ``obs`` and standardize it."""
        if not self.ob_rms:
            return obs
        self.ob_rms.update(obs)
        centered = obs - self.ob_rms.mean
        scale = np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(centered / scale, -self.clipob, self.clipob)

    def reset(self):
        """Reset the wrapped env and return the normalized first observation."""
        self.ret = np.zeros(())
        return self._obfilt(self.env.reset())
default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') 
parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb 
wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect the default timelimit assert isinstance(env.action_space, MultiDiscrete), "only MultiDiscrete action space is supported" assert isinstance(env, TimeLimit) or int(args.episode_length), "the gym env does not have a built in TimeLimit, please specify by using --episode-length" if isinstance(env, TimeLimit): if int(args.episode_length): env._max_episode_steps = int(args.episode_length) args.episode_length = env._max_episode_steps else: env = TimeLimit(env, int(args.episode_length)) env = NormalizedEnv(env.env, ob=args.norm_obs, ret=args.norm_returns, clipob=args.obs_clip, cliprew=args.rew_clip, gamma=args.gamma) env = TimeLimit(env, int(args.episode_length)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic env.seed(args.seed) env.action_space.seed(args.seed) env.observation_space.seed(args.seed) if args.capture_video: env = Monitor(env, f'videos/{experiment_name}') # ALGO LOGIC: initialize agent here: class CategoricalMasked(Categorical): def __init__(self, probs=None, logits=None, validate_args=None, masks=[]): self.masks = masks if len(self.masks) == 0: super(CategoricalMasked, self).__init__(probs, logits, validate_args) else: self.masks = masks.type(torch.BoolTensor).to(device) logits = torch.where(self.masks, logits, torch.tensor(-1e+8).to(device)) super(CategoricalMasked, self).__init__(probs, logits, validate_args) def entropy(self): if len(self.masks) == 0: return super(CategoricalMasked, self).entropy() p_log_p = self.logits * self.probs p_log_p = torch.where(self.masks, p_log_p, 
def discount_cumsum(x, dones, gamma):
    """
    Compute discounted cumulative sums of ``x`` that reset at episode ends.

    Example (``discount`` is ``gamma``):
        x      = [x0, x1, x2, x3, x4]
        dones  = [ 0,  0,  1,  0,  0]
        output = [x0 + discount * x1 + discount^2 * x2,
                  x1 + discount * x2,
                  x2,
                  x3 + discount * x4,
                  x4]

    Args:
        x: 1-D array-like of per-step values.
        dones: per-step episode-end flags; only ``dones[t]`` for
            ``t < len(x) - 1`` is read, so it may be one element shorter
            than ``x``.
        gamma: discount factor.

    Returns:
        numpy.ndarray with the shape of ``x``. Floating/complex inputs keep
        their dtype; any other dtype is promoted to float64 so discounted
        sums are not truncated to integers. Empty input yields an empty array.
    """
    x = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    result = np.zeros_like(x, dtype=out_dtype)
    if x.shape[0] == 0:
        return result
    result[-1] = x[-1]
    # Walk backwards so each step can reuse the already-discounted tail; a
    # done flag at step t cuts the tail off.
    for t in reversed(range(x.shape[0] - 1)):
        result[t] = x[t] + gamma * result[t + 1] * (1 - dones[t])
    return result
# Collect one batch of experience, one environment step per iteration.
for step in range(args.batch_size):
    env.render()
    global_step += 1
    obs[step] = next_obs.copy()

    # ALGO LOGIC: put action logic here
    # Flat mask over the concatenated action components: all ones, except the
    # first and last components, which are filled from the environment's masks.
    invalid_action_mask = torch.ones(env.action_space.nvec.sum())
    invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask)
    invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask)
    invalid_action_masks[step] = invalid_action_mask
    with torch.no_grad():
        values[step] = vf.forward(obs[step:step+1])
        action, logproba, _, probs = pg.get_action(
            obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1])

        # CORE LOGIC:
        # use the action generated by CategoricalMasked, but
        # don't adjust the logprobability accordingly. Instead, calculate the log
        # probability using Categorical
        action, logproba, _, probs = pg.get_action(obs[step:step+1], action=action)

    # Convert the sampled action once and reuse it for storage and for the env.
    env_action = action[:, 0].data.cpu().numpy()
    actions[step] = env_action
    logprobs[:, [step]] = logproba

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards[step], dones[step], info = env.step(env_action)
    raw_rewards[:, step] = info["rewards"]
    real_rewards += [info['real_reward']]
    invalid_action_stats += [info['invalid_action_stats']]
    next_obs = np.array(next_obs)

    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        pg_lr_scheduler.step()
        vf_lr_scheduler.step()

    if dones[step]:
        # Episode finished: log its statistics and start a new episode.
        writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
        print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}")
        # NOTE(review): this sums raw_rewards over every step stored so far in
        # the buffer, not only over the episode that just ended -- confirm intent.
        reward_totals = raw_rewards.sum(1)
        for i in range(len(env.rfs)):
            writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", reward_totals[i], global_step)
        real_rewards = []
        # Build the per-key totals once and look them up by label; integer
        # positions on a label-indexed Series are deprecated in pandas.
        # Assumes info['invalid_action_stats'] is a dict of counters -- TODO confirm.
        stat_totals = pd.DataFrame(invalid_action_stats).sum(0)
        for key in info['invalid_action_stats']:
            writer.add_scalar(f"stats/{key}", stat_totals[key], global_step)
        invalid_action_stats = []
        next_obs = np.array(env.reset())
# bootstrap reward if not done. reached the batch limit
last_value = 0
if not dones[step]:
    last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
bootstrapped_rewards = np.append(rewards, last_value)

# calculate the returns and advantages
if args.gae:
    bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
    deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
    advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
    advantages = torch.Tensor(advantages).to(device)
    returns = advantages + values
else:
    returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
    advantages = returns - values.detach().cpu().numpy()
    advantages = torch.Tensor(advantages).to(device)
    returns = torch.Tensor(returns).to(device)

# Advantage normalization
if args.norm_adv:
    EPS = 1e-10
    advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

# Optimizing policy network
entropys = []
target_pg = Policy().to(device)  # snapshot used by --kle-rollback
inds = np.arange(args.batch_size,)
for i_epoch_pi in range(args.update_epochs):
    np.random.shuffle(inds)
    for start in range(0, args.batch_size, args.minibatch_size):
        end = start + args.minibatch_size
        minibatch_ind = inds[start:end]
        target_pg.load_state_dict(pg.state_dict())
        # np.int was removed in NumPy 1.24; use the explicit 64-bit type that
        # torch.LongTensor expects.
        _, newlogproba, _, _ = pg.get_action(
            obs[minibatch_ind],
            torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,)
        ratio = (newlogproba - logprobs[:, minibatch_ind]).exp()

        # Policy loss as in OpenAI SpinUp
        clip_adv = torch.where(advantages[minibatch_ind] > 0,
                               (1.+args.clip_coef) * advantages[minibatch_ind],
                               (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

        # Entropy computation with resampled actions
        entropy = -(newlogproba.exp() * newlogproba).mean()
        entropys.append(entropy.item())

        # NOTE(review): the entropy term is added to a loss that is minimized,
        # which pushes this estimate down -- confirm that is intended.
        policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
        policy_loss = policy_loss.mean()

        pg_optimizer.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
        pg_optimizer.step()

        approx_kl = (logprobs[:, minibatch_ind] - newlogproba).mean()
        # Optimizing value network
        new_values = vf.forward(obs[minibatch_ind]).view(-1)

        # Value loss clipping
        if args.clip_vloss:
            v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
            v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
            v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()
        else:
            v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2))

        v_optimizer.zero_grad()
        v_loss.backward()
        nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
        v_optimizer.step()

    # NOTE(review): the early-stop checks are placed once per epoch and use the
    # last minibatch of that epoch -- confirm against the intended layout.
    if args.kle_stop:
        if approx_kl > args.target_kl:
            break
    if args.kle_rollback:
        if (logprobs[:, minibatch_ind] - pg.get_action(
                obs[minibatch_ind],
                torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,
                invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
            pg.load_state_dict(target_pg.state_dict())
            break

# TRY NOT TO MODIFY: record rewards for plotting purposes
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
writer.add_scalar("losses/entropy", np.mean(entropys), global_step)
writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
if args.kle_stop or args.kle_rollback:
    writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)
# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd(object):
    """Running element-wise mean and variance of a stream of samples."""

    def __init__(self, epsilon=1e-4, shape=()):
        """
        :param epsilon: initial pseudo-count, so the first update never
            divides by zero.
        :param shape: shape of a single sample.
        """
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        """
        Fold ``x`` into the running statistics.

        ``x`` is either one sample, or a batch of samples stacked along a
        leading axis (``x.shape[1:]`` equal to the tracked shape). Any other
        input is handled as a single sample, as before.
        """
        x = np.asarray(x, dtype='float64')
        is_batch = x.ndim == self.mean.ndim + 1 and x.shape[1:] == self.mean.shape
        if is_batch:
            batch_count = x.shape[0]
            if batch_count == 0:
                # An empty batch carries no information.
                return
            batch_mean = x.mean(axis=0)
            batch_var = x.var(axis=0)
        else:
            # One sample: its mean is itself and its variance is zero.
            batch_mean = x
            batch_var = np.zeros_like(x)
            batch_count = 1
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        self.mean, self.var, self.count = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """
    Combine the moments of two sample sets into the moments of their union.

    :param mean, var, count: moments and size of the data seen so far.
    :param batch_mean, batch_var, batch_count: moments and size of the new data.
    :return: ``(new_mean, new_var, new_count)`` for the combined data.
    """
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    # Sum of squared deviations of both sets plus a between-set correction.
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    new_var = M2 / tot_count
    new_count = tot_count

    return new_mean, new_var, new_count
if __name__ == "__main__":
    # Command-line interface of the experiment; the parsed values end up in `args`.
    parser = argparse.ArgumentParser(description='PPO agent')
    # Common arguments
    # splitext removes only the extension; str.rstrip(".py") strips any trailing
    # run of the characters '.', 'p' and 'y', which can eat into the name.
    parser.add_argument('--exp-name', type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="MicrortsMining16x16F9-v0",
                        help='the id of the gym environment')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--episode-length', type=int, default=0,
                        help='the maximum length of each episode')
    parser.add_argument('--total-timesteps', type=int, default=100000,
                        help='total timesteps of the experiments')
    parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True,
                        help='if toggled, `torch.backends.cudnn.deterministic=False`')
    parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True,
                        help='if toggled, cuda will not be enabled by default')
    parser.add_argument('--prod-mode', action='store_true', default=False,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', action='store_true', default=False,
                        help='weather to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    # NOTE(review): the flags below that combine action='store_true' with
    # default=True (--gae, --norm-obs, --norm-adv, --anneal-lr, --clip-vloss)
    # cannot be switched off from the command line.
    parser.add_argument('--batch-size', type=int, default=2048,
                        help='the batch size of ppo')
    parser.add_argument('--minibatch-size', type=int, default=256,
                        help='the mini batch size of ppo')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='the discount factor gamma')
    parser.add_argument('--gae-lambda', type=float, default=0.97,
                        help='the lambda for the general advantage estimation')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help="coefficient of the entropy")
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='the maximum norm for the gradient clipping')
    parser.add_argument('--clip-coef', type=float, default=0.2,
                        help="the surrogate clipping coefficient")
    parser.add_argument('--update-epochs', type=int, default=10,
                        help="the K epochs to update the policy")
    parser.add_argument('--kle-stop', action='store_true', default=False,
                        help='If toggled, the policy updates will be early stopped w.r.t target-kl')
    parser.add_argument('--kle-rollback', action='store_true', default=False,
                        help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
    parser.add_argument('--target-kl', type=float, default=0.015,
                        help='the target-kl variable that is referred by --kl')
    parser.add_argument('--gae', action='store_true', default=True,
                        help='Use GAE for advantage computation')
    parser.add_argument('--policy-lr', type=float, default=3e-4,
                        help="the learning rate of the policy optimizer")
    parser.add_argument('--value-lr', type=float, default=3e-4,
                        help="the learning rate of the critic optimizer")
    parser.add_argument('--norm-obs', action='store_true', default=True,
                        help="Toggles observation normalization")
    parser.add_argument('--norm-returns', action='store_true', default=False,
                        help="Toggles returns normalization")
    parser.add_argument('--norm-adv', action='store_true', default=True,
                        help="Toggles advantages normalization")
    # The two help texts below were swapped: obs_clip is passed on as the
    # observation clip and rew_clip as the reward clip.
    parser.add_argument('--obs-clip', type=float, default=10.0,
                        help="Value for observation clipping, as per the paper")
    parser.add_argument('--rew-clip', type=float, default=10.0,
                        help="Value for reward clipping, as per the paper")
    parser.add_argument('--anneal-lr', action='store_true', default=True,
                        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'],
                        help='Selects the scheme to be used for weights initialization')
    parser.add_argument('--clip-vloss', action="store_true", default=True,
                        help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.')
    parser.add_argument('--pol-layer-norm', action='store_true', default=False,
                        help='Enables layer normalization in the policy network')

    args = parser.parse_args()
    # A seed of 0 means "pick one from the clock".
    if not args.seed:
        args.seed = int(time.time())
class CategoricalMasked(Categorical):
    """
    Categorical distribution that can suppress choices through a mask.

    Where ``masks`` is falsy the logit is replaced by ``-1e8`` before the
    parent class normalises, so those choices get (numerically) zero
    probability. With no mask it behaves exactly like ``Categorical``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        """
        :param probs: forwarded to ``Categorical``.
        :param logits: unnormalised log-probabilities; required with ``masks``.
        :param validate_args: forwarded to ``Categorical``.
        :param masks: optional tensor broadcastable to ``logits``; truthy
            entries stay available. ``None`` or an empty sequence disables
            masking.
        :raises ValueError: if ``masks`` is given without ``logits``.
        """
        # None replaces the former shared mutable `[]` default; the attribute
        # still holds an empty list when unmasked, so len() checks keep working.
        self.masks = [] if masks is None else masks
        if len(self.masks) == 0:
            super(CategoricalMasked, self).__init__(probs, logits, validate_args)
            return
        if logits is None:
            raise ValueError("CategoricalMasked needs `logits` when `masks` is given")
        # Follow the device of the logits rather than a module-level global.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        suppressed = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, suppressed)
        super(CategoricalMasked, self).__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy, counting only the unmasked choices."""
        if len(self.masks) == 0:
            return super(CategoricalMasked, self).entropy()
        p_log_p = self.logits * self.probs
        # Zero the masked terms explicitly instead of relying on 0 * -1e8.
        p_log_p = torch.where(self.masks, p_log_p, torch.zeros_like(p_log_p))
        return -p_log_p.sum(-1)
def discount_cumsum(x, dones, gamma):
    """
    Compute discounted cumulative sums of ``x`` that reset at episode ends.

    Scanning backwards, ``out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])``,
    so a truthy ``dones[t]`` stops later entries from leaking into step ``t``.

    Example (``g`` is the discount)::

        x     = [x0, x1, x2, x3, x4]
        dones = [ 0,  0,  1,  0,  0]
        out   = [x0 + g * x1 + g^2 * x2,
                 x1 + g * x2,
                 x2,
                 x3 + g * x4,
                 x4]

    :param x: array-like of per-step values; accumulation runs along axis 0.
    :param dones: per-step termination flags. Only ``dones[:len(x) - 1]`` is
        read, so it may be one element shorter than ``x``.
    :param gamma: discount factor.
    :return: array shaped like ``x`` (empty when ``x`` is empty).
    """
    x = np.asarray(x)
    # Keep float/complex dtypes as they are, but promote integer or boolean
    # input so the discounted sums are not silently truncated.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    totals = np.zeros(x.shape, dtype=out_dtype)
    if x.shape[0] == 0:
        # Nothing to accumulate; avoids indexing [-1] on an empty array.
        return totals
    totals[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        totals[t] = x[t] + gamma * totals[t + 1] * (1 - dones[t])
    return totals
# Collect one batch of experience, one environment step per iteration.
for step in range(args.batch_size):
    env.render()
    global_step += 1
    obs[step] = next_obs.copy()

    # ALGO LOGIC: put action logic here
    # Flat mask over the concatenated action components: all ones, except the
    # first and last components, which are filled from the environment's masks.
    invalid_action_mask = torch.ones(env.action_space.nvec.sum())
    invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask)
    invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask)
    invalid_action_masks[step] = invalid_action_mask
    with torch.no_grad():
        values[step] = vf.forward(obs[step:step+1])
        action, logproba, _, probs = pg.get_action(
            obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1])

        # CORE LOGIC:
        # use the action generated by CategoricalMasked, but
        # don't adjust the logprobability accordingly. Instead, calculate the log
        # probability using Categorical
        action, logproba, _, probs = pg.get_action(obs[step:step+1], action=action)

    # Convert the sampled action once and reuse it for storage and for the env.
    env_action = action[:, 0].data.cpu().numpy()
    actions[step] = env_action
    logprobs[:, [step]] = logproba

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards[step], dones[step], info = env.step(env_action)
    raw_rewards[:, step] = info["rewards"]
    real_rewards += [info['real_reward']]
    invalid_action_stats += [info['invalid_action_stats']]
    next_obs = np.array(next_obs)

    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        pg_lr_scheduler.step()
        vf_lr_scheduler.step()

    if dones[step]:
        # Episode finished: log its statistics and start a new episode.
        writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
        print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}")
        # NOTE(review): this sums raw_rewards over every step stored so far in
        # the buffer, not only over the episode that just ended -- confirm intent.
        reward_totals = raw_rewards.sum(1)
        for i in range(len(env.rfs)):
            writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", reward_totals[i], global_step)
        real_rewards = []
        # Build the per-key totals once and look them up by label; integer
        # positions on a label-indexed Series are deprecated in pandas.
        # Assumes info['invalid_action_stats'] is a dict of counters -- TODO confirm.
        stat_totals = pd.DataFrame(invalid_action_stats).sum(0)
        for key in info['invalid_action_stats']:
            writer.add_scalar(f"stats/{key}", stat_totals[key], global_step)
        invalid_action_stats = []
        next_obs = np.array(env.reset())
# bootstrap reward if not done. reached the batch limit
last_value = 0
if not dones[step]:
    last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
bootstrapped_rewards = np.append(rewards, last_value)

# calculate the returns and advantages
if args.gae:
    bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
    deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
    advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
    advantages = torch.Tensor(advantages).to(device)
    returns = advantages + values
else:
    returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
    advantages = returns - values.detach().cpu().numpy()
    advantages = torch.Tensor(advantages).to(device)
    returns = torch.Tensor(returns).to(device)

# Advantage normalization
if args.norm_adv:
    EPS = 1e-10
    advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

# Optimizing policy network
entropys = []
target_pg = Policy().to(device)  # snapshot used by --kle-rollback
inds = np.arange(args.batch_size,)
for i_epoch_pi in range(args.update_epochs):
    np.random.shuffle(inds)
    for start in range(0, args.batch_size, args.minibatch_size):
        end = start + args.minibatch_size
        minibatch_ind = inds[start:end]
        target_pg.load_state_dict(pg.state_dict())
        # np.int was removed in NumPy 1.24; use the explicit 64-bit type that
        # torch.LongTensor expects.
        _, newlogproba, _, _ = pg.get_action(
            obs[minibatch_ind],
            torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,)
        ratio = (newlogproba - logprobs[:, minibatch_ind]).exp()

        # Policy loss as in OpenAI SpinUp
        clip_adv = torch.where(advantages[minibatch_ind] > 0,
                               (1.+args.clip_coef) * advantages[minibatch_ind],
                               (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

        # Entropy computation with resampled actions
        entropy = -(newlogproba.exp() * newlogproba).mean()
        entropys.append(entropy.item())

        # NOTE(review): the entropy term is added to a loss that is minimized,
        # which pushes this estimate down -- confirm that is intended.
        policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
        policy_loss = policy_loss.mean()

        pg_optimizer.zero_grad()
        policy_loss.backward()
        nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
        pg_optimizer.step()

        approx_kl = (logprobs[:, minibatch_ind] - newlogproba).mean()
        # Optimizing value network
        new_values = vf.forward(obs[minibatch_ind]).view(-1)

        # Value loss clipping
        if args.clip_vloss:
            v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
            v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
            v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()
        else:
            v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2))

        v_optimizer.zero_grad()
        v_loss.backward()
        nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
        v_optimizer.step()

    # NOTE(review): the early-stop checks are placed once per epoch and use the
    # last minibatch of that epoch -- confirm against the intended layout.
    if args.kle_stop:
        if approx_kl > args.target_kl:
            break
    if args.kle_rollback:
        if (logprobs[:, minibatch_ind] - pg.get_action(
                obs[minibatch_ind],
                torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,
                invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
            pg.load_state_dict(target_pg.state_dict())
            break

# TRY NOT TO MODIFY: record rewards for plotting purposes
writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step)
writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
writer.add_scalar("losses/entropy", np.mean(entropys), global_step)
writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
if args.kle_stop or args.kle_rollback:
    writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)
# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd(object):
    """Running element-wise mean and variance of a stream of samples."""

    def __init__(self, epsilon=1e-4, shape=()):
        """
        :param epsilon: initial pseudo-count, so the first update never
            divides by zero.
        :param shape: shape of a single sample.
        """
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        """
        Fold ``x`` into the running statistics.

        ``x`` is either one sample, or a batch of samples stacked along a
        leading axis (``x.shape[1:]`` equal to the tracked shape). Any other
        input is handled as a single sample, as before.
        """
        x = np.asarray(x, dtype='float64')
        is_batch = x.ndim == self.mean.ndim + 1 and x.shape[1:] == self.mean.shape
        if is_batch:
            batch_count = x.shape[0]
            if batch_count == 0:
                # An empty batch carries no information.
                return
            batch_mean = x.mean(axis=0)
            batch_var = x.var(axis=0)
        else:
            # One sample: its mean is itself and its variance is zero.
            batch_mean = x
            batch_var = np.zeros_like(x)
            batch_count = 1
        self.update_from_moments(batch_mean, batch_var, batch_count)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        self.mean, self.var, self.count = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """
    Combine the moments of two sample sets into the moments of their union.

    :param mean, var, count: moments and size of the data seen so far.
    :param batch_mean, batch_var, batch_count: moments and size of the new data.
    :return: ``(new_mean, new_var, new_count)`` for the combined data.
    """
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    # Sum of squared deviations of both sets plus a between-set correction.
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    new_var = M2 / tot_count
    new_count = tot_count

    return new_mean, new_var, new_count
if __name__ == "__main__":
    # Command-line interface of the experiment; the parsed values end up in `args`.
    parser = argparse.ArgumentParser(description='PPO agent')
    # Common arguments
    # splitext removes only the extension; str.rstrip(".py") strips any trailing
    # run of the characters '.', 'p' and 'y', which can eat into the name.
    parser.add_argument('--exp-name', type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="MicrortsMining24x24F9-v0",
                        help='the id of the gym environment')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--episode-length', type=int, default=0,
                        help='the maximum length of each episode')
    parser.add_argument('--total-timesteps', type=int, default=100000,
                        help='total timesteps of the experiments')
    parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True,
                        help='if toggled, `torch.backends.cudnn.deterministic=False`')
    parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True,
                        help='if toggled, cuda will not be enabled by default')
    parser.add_argument('--prod-mode', action='store_true', default=False,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', action='store_true', default=False,
                        help='weather to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    # NOTE(review): the flags below that combine action='store_true' with
    # default=True (--gae, --norm-obs, --norm-adv, --anneal-lr, --clip-vloss)
    # cannot be switched off from the command line.
    parser.add_argument('--batch-size', type=int, default=2048,
                        help='the batch size of ppo')
    parser.add_argument('--minibatch-size', type=int, default=256,
                        help='the mini batch size of ppo')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='the discount factor gamma')
    parser.add_argument('--gae-lambda', type=float, default=0.97,
                        help='the lambda for the general advantage estimation')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help="coefficient of the entropy")
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='the maximum norm for the gradient clipping')
    parser.add_argument('--clip-coef', type=float, default=0.2,
                        help="the surrogate clipping coefficient")
    parser.add_argument('--update-epochs', type=int, default=10,
                        help="the K epochs to update the policy")
    parser.add_argument('--kle-stop', action='store_true', default=False,
                        help='If toggled, the policy updates will be early stopped w.r.t target-kl')
    parser.add_argument('--kle-rollback', action='store_true', default=False,
                        help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
    parser.add_argument('--target-kl', type=float, default=0.015,
                        help='the target-kl variable that is referred by --kl')
    parser.add_argument('--gae', action='store_true', default=True,
                        help='Use GAE for advantage computation')
    parser.add_argument('--policy-lr', type=float, default=3e-4,
                        help="the learning rate of the policy optimizer")
    parser.add_argument('--value-lr', type=float, default=3e-4,
                        help="the learning rate of the critic optimizer")
    parser.add_argument('--norm-obs', action='store_true', default=True,
                        help="Toggles observation normalization")
    parser.add_argument('--norm-returns', action='store_true', default=False,
                        help="Toggles returns normalization")
    parser.add_argument('--norm-adv', action='store_true', default=True,
                        help="Toggles advantages normalization")
    # The two help texts below were swapped: obs_clip is passed on as the
    # observation clip and rew_clip as the reward clip.
    parser.add_argument('--obs-clip', type=float, default=10.0,
                        help="Value for observation clipping, as per the paper")
    parser.add_argument('--rew-clip', type=float, default=10.0,
                        help="Value for reward clipping, as per the paper")
    parser.add_argument('--anneal-lr', action='store_true', default=True,
                        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'],
                        help='Selects the scheme to be used for weights initialization')
    parser.add_argument('--clip-vloss', action="store_true", default=True,
                        help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.')
    parser.add_argument('--pol-layer-norm', action='store_true', default=False,
                        help='Enables layer normalization in the policy network')

    args = parser.parse_args()
    # A seed of 0 means "pick one from the clock".
    if not args.seed:
        args.seed = int(time.time())
class CategoricalMasked(Categorical):
    """Categorical distribution whose invalid choices are masked out.

    When ``masks`` is provided, every logit whose mask entry is false is
    replaced by ``-1e8`` before the parent ``Categorical`` is built, so the
    corresponding choices end up with (numerically) zero probability.

    Args:
        probs: Forwarded unchanged to ``Categorical``.
        logits: Unnormalized log-probabilities. Required when ``masks`` is
            non-empty, because the mask is applied to the logits.
        validate_args: Forwarded unchanged to ``Categorical``.
        masks: Tensor broadcastable to ``logits``; truthy entries mark valid
            choices. ``None`` or an empty sequence disables masking and the
            object behaves like a plain ``Categorical``.

    Raises:
        ValueError: If ``masks`` is non-empty but ``logits`` is ``None``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # ``None`` replaces the former mutable ``[]`` default; an explicitly
        # passed empty sequence still means "no masking".
        if masks is None or len(masks) == 0:
            self.masks = [] if masks is None else masks
            super().__init__(probs, logits, validate_args)
            return

        if logits is None:
            raise ValueError("CategoricalMasked requires `logits` when `masks` is given")

        # Follow the logits' device/dtype instead of relying on a
        # module-level ``device`` global.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        masked_out = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, masked_out)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy, counting only the valid (unmasked) choices."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries contribute exactly zero rather than (-1e8 * ~0).
        zero = torch.zeros((), dtype=p_log_p.dtype, device=p_log_p.device)
        p_log_p = torch.where(self.masks, p_log_p, zero)
        return -p_log_p.sum(-1)
# Build the policy and value networks and train them with PPO.
# Each pass of the outer ``while`` loop collects ``args.batch_size``
# environment steps, computes returns/advantages, then runs
# ``args.update_epochs`` epochs of minibatch updates on both networks.
pg = Policy().to(device)
vf = Value().to(device)

# MODIFIED: Separate optimizer and learning rates
pg_optimizer = optim.Adam(list(pg.parameters()), lr=args.policy_lr)
v_optimizer = optim.Adam(list(vf.parameters()), lr=args.value_lr)

# MODIFIED: Initializing learning rate anneal scheduler when need
if args.anneal_lr:
    # Linear decay from 1 to 0 over the scheduler's step count.
    anneal_fn = lambda f: max(0, 1-f / args.total_timesteps)
    pg_lr_scheduler = optim.lr_scheduler.LambdaLR(pg_optimizer, lr_lambda=anneal_fn)
    vf_lr_scheduler = optim.lr_scheduler.LambdaLR(v_optimizer, lr_lambda=anneal_fn)

loss_fn = nn.MSELoss()

# TRY NOT TO MODIFY: start the game
global_step = 0
while global_step < args.total_timesteps:
    if args.capture_video:
        env.stats_recorder.done=True
    next_obs = np.array(env.reset())

    # ALGO Logic: Storage for epoch data
    obs = np.empty((args.batch_size,) + env.observation_space.shape)

    actions = np.empty((args.batch_size,) + env.action_space.shape)
    # One row of log-probabilities per action component.
    logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device)

    rewards = np.zeros((args.batch_size,))
    raw_rewards = np.zeros((len(env.rfs),args.batch_size,))
    real_rewards = []
    invalid_action_stats = []

    dones = np.zeros((args.batch_size,))
    values = torch.zeros((args.batch_size,)).to(device)

    invalid_action_masks = torch.zeros((args.batch_size, env.action_space.nvec.sum()))
    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.batch_size):
        env.render()
        global_step += 1
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        # Only the first and last action components are masked; every
        # component in between stays fully enabled (all ones).
        invalid_action_mask = torch.ones(env.action_space.nvec.sum())
        invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask)
        invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask)
        invalid_action_masks[step] = invalid_action_mask
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step+1])
            action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1])

            # CORE LOGIC:
            # use the action generated by CategoricalMasked, but
            # don't adjust the logprobability accordingly. Instead, calculate the log
            # probability using Categorical
            action, logproba, _, probs = pg.get_action(obs[step:step+1], action=action)

        actions[step] = action[:,0].data.cpu().numpy()
        logprobs[:,[step]] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy())
        raw_rewards[:,step] = info["rewards"]
        real_rewards += [info['real_reward']]
        invalid_action_stats += [info['invalid_action_stats']]
        next_obs = np.array(next_obs)

        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            pg_lr_scheduler.step()
            vf_lr_scheduler.step()

        if dones[step]:
            # Episode finished: log its statistics and start a new one.
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
            print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}")
            for i in range(len(env.rfs)):
                writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step)
            real_rewards = []
            for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))):
                writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step)
            invalid_action_stats = []
            next_obs = np.array(env.reset())

    # bootstrap reward if not done. reached the batch limit
    last_value = 0
    if not dones[step]:
        last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
    bootstrapped_rewards = np.append(rewards, last_value)

    # calculate the returns and advantages
    if args.gae:
        bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
        deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
        advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
        advantages = torch.Tensor(advantages).to(device)
        returns = advantages + values
    else:
        returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
        advantages = returns - values.detach().cpu().numpy()
        advantages = torch.Tensor(advantages).to(device)
        returns = torch.Tensor(returns).to(device)

    # Advantage normalization
    if args.norm_adv:
        EPS = 1e-10
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

    # Optimizing policy network
    entropys = []
    # Snapshot network used to restore the policy when --kle-rollback fires.
    target_pg = Policy().to(device)
    inds = np.arange(args.batch_size,)
    for i_epoch_pi in range(args.update_epochs):
        np.random.shuffle(inds)
        for start in range(0, args.batch_size, args.minibatch_size):
            end = start + args.minibatch_size
            minibatch_ind = inds[start:end]
            target_pg.load_state_dict(pg.state_dict())
            # ``np.int`` was removed in NumPy 1.24; ``np.int64`` is the dtype
            # ``torch.LongTensor`` expects.
            _, newlogproba, _, _ = pg.get_action(
                obs[minibatch_ind],
                torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,)
            ratio = (newlogproba - logprobs[:,minibatch_ind]).exp()

            # Policy loss as in OpenAI SpinUp
            clip_adv = torch.where(advantages[minibatch_ind] > 0,
                                   (1.+args.clip_coef) * advantages[minibatch_ind],
                                   (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

            # Entropy computation with resampled actions
            entropy = -(newlogproba.exp() * newlogproba).mean()
            entropys.append(entropy.item())

            policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
            policy_loss = policy_loss.mean()

            pg_optimizer.zero_grad()
            policy_loss.backward()
            nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
            pg_optimizer.step()

            approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean()

            # Optimizing value network
            new_values = vf.forward(obs[minibatch_ind]).view(-1)

            # Value loss clipping
            if args.clip_vloss:
                v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
                v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
                v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2))

            v_optimizer.zero_grad()
            v_loss.backward()
            nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
            v_optimizer.step()

        # Early stopping / rollback use the last minibatch of the epoch.
        if args.kle_stop:
            if approx_kl > args.target_kl:
                break
        if args.kle_rollback:
            # NOTE(review): this check passes the invalid-action masks while
            # the stored ``logprobs`` were computed without them - confirm
            # that the mismatch is intended.
            if (logprobs[:,minibatch_ind] - pg.get_action(
                    obs[minibatch_ind],
                    torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T,
                    invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
                pg.load_state_dict(target_pg.state_dict())
                break

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
    writer.add_scalar("losses/entropy", np.mean(entropys), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    if args.kle_stop or args.kle_rollback:
        writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)

env.close()
writer.close()
# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd(object):
    """Running mean/variance tracker that is updated one sample at a time."""

    def __init__(self, epsilon=1e-4, shape=()):
        # Zero mean and unit variance to start with; ``epsilon`` is stored as
        # the initial (pseudo) sample count.
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')
        self.count = epsilon

    def update(self, x):
        """Fold ``x`` into the statistics as a batch holding a single sample."""
        sample = [x]
        self.update_from_moments(np.mean(sample, axis=0), np.var(sample, axis=0), 1)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge the given batch moments into the stored mean, var and count."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
        self.mean, self.var, self.count = merged
= rews # print("before", self.ret) self.ret = self.ret * self.gamma + rews # print("after", self.ret) obs = self._obfilt(obs) if self.ret_rms: self.ret_rms.update(np.array([self.ret].copy())) rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) self.ret = self.ret * (1-float(news)) return obs, rews, news, infos def _obfilt(self, obs): if self.ob_rms: self.ob_rms.update(obs) obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs def reset(self): self.ret = np.zeros(()) obs = self.env.reset() return self._obfilt(obs) if __name__ == "__main__": parser = argparse.ArgumentParser(description='PPO agent') # Common arguments parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining4x4F9-v0", help='the id of the gym environment') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's 
project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', 
action='store_true', default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect the default timelimit assert isinstance(env.action_space, MultiDiscrete), "only MultiDiscrete action space is 
class CategoricalMasked(Categorical):
    """Categorical distribution whose invalid choices are masked out.

    When ``masks`` is provided, every logit whose mask entry is false is
    replaced by ``-1e8`` before the parent ``Categorical`` is built, so the
    corresponding choices end up with (numerically) zero probability.

    Args:
        probs: Forwarded unchanged to ``Categorical``.
        logits: Unnormalized log-probabilities. Required when ``masks`` is
            non-empty, because the mask is applied to the logits.
        validate_args: Forwarded unchanged to ``Categorical``.
        masks: Tensor broadcastable to ``logits``; truthy entries mark valid
            choices. ``None`` or an empty sequence disables masking and the
            object behaves like a plain ``Categorical``.

    Raises:
        ValueError: If ``masks`` is non-empty but ``logits`` is ``None``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # ``None`` replaces the former mutable ``[]`` default; an explicitly
        # passed empty sequence still means "no masking".
        if masks is None or len(masks) == 0:
            self.masks = [] if masks is None else masks
            super().__init__(probs, logits, validate_args)
            return

        if logits is None:
            raise ValueError("CategoricalMasked requires `logits` when `masks` is given")

        # Follow the logits' device/dtype instead of relying on a
        # module-level ``device`` global.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        masked_out = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, masked_out)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy, counting only the valid (unmasked) choices."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries contribute exactly zero rather than (-1e8 * ~0).
        zero = torch.zeros((), dtype=p_log_p.dtype, device=p_log_p.device)
        p_log_p = torch.where(self.masks, p_log_p, zero)
        return -p_log_p.sum(-1)
def discount_cumsum(x, dones, gamma):
    """Return discounted cumulative sums of ``x`` that restart at episode ends.

    Working backwards, ``out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])``,
    so a truthy ``dones[t]`` cuts the accumulation between ``t`` and ``t + 1``.

    Example with ``x = [x0, x1, x2, x3, x4]`` and ``dones = [0, 0, 1, 0, 0]``::

        [x0 + gamma * x1 + gamma^2 * x2,
         x1 + gamma * x2,
         x2,
         x3 + gamma * x4,
         x4]

    Args:
        x: 1-D sequence of per-step values.
        dones: Per-step episode-termination flags; entries ``0 .. len(x) - 2``
            are read.
        gamma: Discount factor.

    Returns:
        ``numpy.ndarray`` shaped like ``x`` with the dtype of ``x``. An empty
        ``x`` yields an empty array.
    """
    x = np.asarray(x)
    # Named ``out`` so the local no longer shadows the function itself.
    out = np.zeros_like(x)
    if x.shape[0] == 0:
        return out
    out[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])
    return out
pg_lr_scheduler = optim.lr_scheduler.LambdaLR(pg_optimizer, lr_lambda=anneal_fn) vf_lr_scheduler = optim.lr_scheduler.LambdaLR(v_optimizer, lr_lambda=anneal_fn) loss_fn = nn.MSELoss() # TRY NOT TO MODIFY: start the game global_step = 0 while global_step < args.total_timesteps: if args.capture_video: env.stats_recorder.done=True next_obs = np.array(env.reset()) # ALGO Logic: Storage for epoch data obs = np.empty((args.batch_size,) + env.observation_space.shape) actions = np.empty((args.batch_size,) + env.action_space.shape) logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device) rewards = np.zeros((args.batch_size,)) raw_rewards = np.zeros((len(env.rfs),args.batch_size,)) real_rewards = [] invalid_action_stats = [] dones = np.zeros((args.batch_size,)) values = torch.zeros((args.batch_size,)).to(device) invalid_action_masks = torch.zeros((args.batch_size, env.action_space.nvec.sum())) # TRY NOT TO MODIFY: prepare the execution of the game. for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here invalid_action_mask = torch.ones(env.action_space.nvec.sum()) invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask) invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask) invalid_action_masks[step] = invalid_action_mask with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1], invalid_action_masks=invalid_action_masks[step:step+1]) # CORE LOGIC: # use the action generated by CategoricalMasked, but # don't adjust the logprobability accordingly. Instead, calculate the log # probability using Categorical action, logproba, _, probs = pg.get_action(obs[step:step+1], action=action) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. 
next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) raw_rewards[:,step] = info["rewards"] real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") for i in range(len(env.rfs)): writer.add_scalar(f"charts/episode_reward/{str(env.rfs[i])}", raw_rewards.sum(1)[i], global_step) real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = 
Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T,) ratio = (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() policy_loss.backward() nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm) pg_optimizer.step() approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean() # Optimizing value network new_values = vf.forward(obs[minibatch_ind]).view(-1) # Value loss clipping if args.clip_vloss: v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2) v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef) v_loss_clipped = (v_clipped - returns[minibatch_ind])**2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2)) v_optimizer.zero_grad() v_loss.backward() nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm) v_optimizer.step() if args.kle_stop: if approx_kl > args.target_kl: break if args.kle_rollback: if (logprobs[:,minibatch_ind] - pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, 
invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl: pg.load_state_dict(target_pg.state_dict()) break # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step) writer.add_scalar("losses/entropy", np.mean(entropys), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) if args.kle_stop or args.kle_rollback: writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step) env.close() writer.close() ================================================ FILE: invalid_action_masking/ppo_no_mask_10x10.py ================================================ import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions.categorical import Categorical from torch.utils.tensorboard import SummaryWriter from cleanrl.common import preprocess_obs_space, preprocess_ac_space import argparse import numpy as np import gym import gym_microrts from gym.wrappers import TimeLimit, Monitor from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space import time import random import os import pandas as pd # taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py class RunningMeanStd(object): def __init__(self, epsilon=1e-4, shape=()): self.mean = np.zeros(shape, 'float64') self.var = np.ones(shape, 'float64') self.count = epsilon def update(self, x): batch_mean = np.mean([x], axis=0) batch_var = np.var([x], axis=0) batch_count = 1 self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): self.mean, self.var, self.count = 
def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine running moments with a batch's moments.

    Returns the tuple ``(new_mean, new_var, new_count)`` describing the union
    of the ``count`` samples summarized by ``mean``/``var`` and the
    ``batch_count`` samples summarized by ``batch_mean``/``batch_var``.
    """
    total = count + batch_count
    shift = batch_mean - mean
    merged_mean = mean + shift * batch_count / total
    # Sum of squared deviations of both groups plus a between-group term.
    sum_sq = var * count + batch_var * batch_count + np.square(shift) * count * batch_count / total
    return merged_mean, sum_sq / total, total
gym environment') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") 
parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') parser.add_argument('--invalid-action-penalty', type=float, default=0.00, 
help='the negative reward penalty for invalid actions') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect the default timelimit assert isinstance(env.action_space, MultiDiscrete), "only MultiDiscrete action space is supported" assert isinstance(env, TimeLimit) or int(args.episode_length), "the gym env does not have a built in TimeLimit, please specify by using --episode-length" if isinstance(env, TimeLimit): if int(args.episode_length): env._max_episode_steps = int(args.episode_length) args.episode_length = env._max_episode_steps else: env = TimeLimit(env, int(args.episode_length)) env = NormalizedEnv(env.env, ob=args.norm_obs, ret=args.norm_returns, clipob=args.obs_clip, cliprew=args.rew_clip, gamma=args.gamma) env = TimeLimit(env, int(args.episode_length)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic env.seed(args.seed) env.action_space.seed(args.seed) env.observation_space.seed(args.seed) if args.capture_video: env = Monitor(env, 
class Policy(nn.Module):
    """Convolutional policy network that outputs logits for a MultiDiscrete action.

    Relies on the script-level ``env`` (its ``action_space.nvec`` fixes the
    number of output logits and how they are split) and ``device``.
    """

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=3),
            nn.MaxPool2d(1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3),
            nn.MaxPool2d(1),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        # One logit per choice of every action component, all concatenated
        # into a single output vector.
        head = [
            nn.Linear(32*6*6, 128),
            nn.ReLU(),
            nn.Linear(128, env.action_space.nvec.sum()),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Return the concatenated action logits for a batch of observations."""
        # The last axis of ``x`` is moved to position 1, where Conv2d expects
        # the channel dimension.
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flat = hidden.reshape(hidden.size(0), -1)
        return self.fc(flat)

    def get_action(self, x, action=None):
        """Sample (or score) one action per component of the action space.

        Args:
            x: Batch of observations accepted by ``forward``.
            action: Optional tensor of actions to evaluate, iterated along its
                first axis (one entry per action component). When ``None``, a
                fresh action is sampled from every component distribution.

        Returns:
            Tuple ``(action, logprob, [], multi_categoricals)``; the third
            element is always an empty list.
        """
        all_logits = self.forward(x)
        per_component = torch.split(all_logits, env.action_space.nvec.tolist(), dim=1)
        multi_categoricals = [Categorical(logits=chunk) for chunk in per_component]
        if action is None:
            action = torch.stack([dist.sample() for dist in multi_categoricals])
        logprob = torch.stack(
            [dist.log_prob(act) for act, dist in zip(action, multi_categoricals)])
        return action, logprob, [], multi_categoricals
def discount_cumsum(x, dones, gamma):
    """Discounted cumulative sum of ``x`` that restarts after every ``done``.

    Computes, from the last index backwards,
    ``out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])`` with
    ``out[-1] = x[-1]``.

    Example with ``dones = [0, 0, 1, 0, 0]``::

        x   = [x0, x1, x2, x3, x4]
        out = [x0 + gamma * x1 + gamma^2 * x2,
               x1 + gamma * x2,
               x2,
               x3 + gamma * x4,
               x4]

    Args:
        x: Array-like of per-step values; the first axis is the step index.
        dones: Indexable of termination flags; only indices
            ``0 .. len(x) - 2`` are read.
        gamma: Discount factor applied per step.

    Returns:
        Array with the same shape as ``x``. Floating-point input keeps its
        dtype; any other dtype is promoted to ``float64``. Empty input
        yields an empty array.
    """
    x = np.asarray(x)
    # zeros_like on an integer array would allocate an integer buffer and
    # silently truncate every discounted term, so promote non-float input.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    out = np.zeros_like(x, dtype=out_dtype)
    if x.shape[0] == 0:
        return out
    out[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        out[t] = x[t] + gamma * out[t + 1] * (1 - dones[t])
    return out
for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. try: next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) rewards[step] += pd.DataFrame([info['invalid_action_stats']]).sum().sum() * args.invalid_action_penalty except Exception as e: print(e) print(e.stacktrace()) real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T) ratio = (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() policy_loss.backward() 
nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm) pg_optimizer.step() approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean() # Resample values new_values = vf.forward(obs[minibatch_ind]).view(-1) # Value loss clipping if args.clip_vloss: v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2) v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef) v_loss_clipped = (v_clipped - returns[minibatch_ind])**2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2)) v_optimizer.zero_grad() v_loss.backward() nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm) v_optimizer.step() if args.kle_stop: if approx_kl > args.target_kl: break if args.kle_rollback: if (logprobs[:,minibatch_ind] - pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, )[1]).mean() > args.target_kl: pg.load_state_dict(target_pg.state_dict()) break # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step) writer.add_scalar("losses/entropy", np.mean(entropys), global_step) writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) if args.kle_stop or args.kle_rollback: writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step) env.close() writer.close() ================================================ FILE: invalid_action_masking/ppo_no_mask_16x16.py ================================================ import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from 
class RunningMeanStd:
    """Tracks a running mean, variance and sample count.

    ``update`` delegates the actual merge to the module-level
    ``update_mean_var_count_from_moments`` helper.
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')
        # The initial count is ``epsilon`` rather than zero.
        self.count = epsilon

    def update(self, x):
        """Fold a single sample ``x`` into the running statistics."""
        # ``x`` is wrapped as a batch of one, so the batch mean is ``x``
        # itself and the batch variance is zero.
        single = [x]
        self.update_from_moments(np.mean(single, axis=0), np.var(single, axis=0), 1)

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
        self.mean, self.var, self.count = merged
self.ret) self.ret = self.ret * self.gamma + rews # print("after", self.ret) obs = self._obfilt(obs) if self.ret_rms: self.ret_rms.update(np.array([self.ret].copy())) rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) self.ret = self.ret * (1-float(news)) return obs, rews, news, infos def _obfilt(self, obs): if self.ob_rms: self.ob_rms.update(obs) obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs def reset(self): self.ret = np.zeros(()) obs = self.env.reset() return self._obfilt(obs) if __name__ == "__main__": parser = argparse.ArgumentParser(description='PPO agent') # Common arguments parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), help='the name of this experiment') parser.add_argument('--gym-id', type=str, default="MicrortsMining16x16F9-v0", help='the id of the gym environment') parser.add_argument('--seed', type=int, default=1, help='seed of the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") 
parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', 
default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') parser.add_argument('--invalid-action-penalty', type=float, default=0.00, help='the negative reward penalty for invalid actions') args = parser.parse_args() if not args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect 
class CategoricalMasked(Categorical):
    """Categorical distribution that can rule out choices through a mask.

    When a non-empty ``masks`` tensor is given, every logit whose mask entry
    is false is replaced by ``-1e8`` before the distribution is built, and
    ``entropy`` ignores those entries. Without masks the class behaves like
    a plain ``Categorical``.

    Args:
        probs: Forwarded to ``Categorical``.
        logits: Forwarded to ``Categorical``; must be a tensor when ``masks``
            is non-empty, because the mask is applied to the logits.
        validate_args: Forwarded to ``Categorical``.
        masks: Optional tensor broadcastable against ``logits``; non-zero /
            true marks a choice that stays available. ``None`` or an empty
            sequence disables masking.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # ``None`` replaces the former mutable ``[]`` default. ``self.masks``
        # is still left as an empty list in that case, so code that checks
        # ``len(dist.masks)`` keeps working.
        if masks is None or len(masks) == 0:
            self.masks = [] if masks is None else masks
            super().__init__(probs, logits, validate_args)
            return
        # Follow the logits' own device and dtype instead of a module-level
        # ``device`` global; this also avoids a CPU round trip for the mask.
        self.masks = masks.to(device=logits.device, dtype=torch.bool)
        fill = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        masked_logits = torch.where(self.masks, logits, fill)
        super().__init__(probs, masked_logits, validate_args)

    def entropy(self):
        """Return the entropy, counting only unmasked choices when masked."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries contribute nothing to the sum.
        p_log_p = torch.where(self.masks, p_log_p, torch.zeros_like(p_log_p))
        return -p_log_p.sum(-1)
class Value(nn.Module):
    """Convolutional critic that maps an observation batch to one value each.

    ``forward`` relies on the script-level ``device``.
    """

    def __init__(self):
        super().__init__()
        conv_stack = [
            nn.Conv2d(27, 16, kernel_size=3),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3),
            nn.MaxPool2d(1),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)
        head = [
            nn.Linear(32*5*5, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of value estimates for ``x``."""
        # The last axis of ``x`` is moved to position 1, where Conv2d expects
        # the channel dimension.
        channels_first = np.moveaxis(x, -1, 1)
        hidden = self.features(torch.Tensor(channels_first).to(device))
        flat = hidden.reshape(hidden.size(0), -1)
        return self.fc(flat)
args.total_timesteps) pg_lr_scheduler = optim.lr_scheduler.LambdaLR(pg_optimizer, lr_lambda=anneal_fn) vf_lr_scheduler = optim.lr_scheduler.LambdaLR(v_optimizer, lr_lambda=anneal_fn) loss_fn = nn.MSELoss() # TRY NOT TO MODIFY: start the game global_step = 0 while global_step < args.total_timesteps: if args.capture_video: env.stats_recorder.done=True next_obs = np.array(env.reset()) # ALGO Logic: Storage for epoch data obs = np.empty((args.batch_size,) + env.observation_space.shape) actions = np.empty((args.batch_size,) + env.action_space.shape) logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device) rewards = np.zeros((args.batch_size,)) real_rewards = [] invalid_action_stats = [] dones = np.zeros((args.batch_size,)) values = torch.zeros((args.batch_size,)).to(device) # TRY NOT TO MODIFY: prepare the execution of the game. for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. try: next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) rewards[step] += pd.DataFrame([info['invalid_action_stats']]).sum().sum() * args.invalid_action_penalty except Exception as e: print(e) print(e.stacktrace()) real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. 
if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T) ratio 
= (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() policy_loss.backward() nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm) pg_optimizer.step() approx_kl = (logprobs[:,minibatch_ind] - newlogproba).mean() # Resample values new_values = vf.forward(obs[minibatch_ind]).view(-1) # Value loss clipping if args.clip_vloss: v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2) v_clipped = values[minibatch_ind] + torch.clamp(new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef) v_loss_clipped = (v_clipped - returns[minibatch_ind])**2 v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) v_loss = 0.5 * v_loss_max.mean() else: v_loss = torch.mean((returns[minibatch_ind]- new_values).pow(2)) v_optimizer.zero_grad() v_loss.backward() nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm) v_optimizer.step() if args.kle_stop: if approx_kl > args.target_kl: break if args.kle_rollback: if (logprobs[:,minibatch_ind] - pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T, )[1]).mean() > args.target_kl: pg.load_state_dict(target_pg.state_dict()) break # TRY NOT TO MODIFY: record rewards for plotting purposes writer.add_scalar("losses/value_loss", v_loss.item(), global_step) writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step) writer.add_scalar("losses/policy_loss", 
# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class RunningMeanStd(object):
    """Running mean / variance tracker that is fed one sample at a time.

    ``mean`` starts at zero, ``var`` at one and ``count`` at ``epsilon``; the
    small non-zero initial count keeps the first merge from dividing by zero.
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        """Fold the single sample ``x`` into the running statistics."""
        # ``x`` is wrapped in a list so it is treated as a batch holding one
        # sample: its batch mean is ``x`` itself and its batch variance is 0.
        single_sample_batch = [x]
        self.update_from_moments(
            np.mean(single_sample_batch, axis=0),
            np.var(single_sample_batch, axis=0),
            1,
        )

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge pre-computed batch moments into the running statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var, batch_count)
        self.mean, self.var, self.count = merged


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine the moments of two sample groups.

    Args:
        mean, var, count: moments and size of the samples seen so far.
        batch_mean, batch_var, batch_count: moments and size of the new batch.

    Returns:
        Tuple ``(new_mean, new_var, new_count)`` describing both groups merged.
    """
    total = count + batch_count
    shift = batch_mean - mean
    merged_mean = mean + shift * batch_count / total
    # Within-group squared deviations of both groups plus the between-group
    # correction caused by the difference of the two means.
    m2 = var * count + batch_var * batch_count + np.square(shift) * count * batch_count / total
    return merged_mean, m2 / total, total
parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True,
                    help='if toggled, cuda will not be enabled by default')
parser.add_argument('--prod-mode', action='store_true', default=False,
                    help='run the script in production mode and use wandb to log outputs')
parser.add_argument('--capture-video', action='store_true', default=False,
                    help='whether to capture videos of the agent performances (check out `videos` folder)')
parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                    help="the wandb's project name")
parser.add_argument('--wandb-entity', type=str, default=None,
                    help="the entity (team) of wandb's project")

# Algorithm specific arguments
# NOTE: several switches below combine action='store_true' with default=True
# (--gae, --norm-obs, --norm-adv, --anneal-lr, --clip-vloss). With that
# combination the value is True whether or not the flag is given, so these
# features cannot be switched off from the command line.
parser.add_argument('--batch-size', type=int, default=2048,
                    help='the batch size of ppo')
parser.add_argument('--minibatch-size', type=int, default=256,
                    help='the mini batch size of ppo')
parser.add_argument('--gamma', type=float, default=0.99,
                    help='the discount factor gamma')
parser.add_argument('--gae-lambda', type=float, default=0.97,
                    help='the lambda for the general advantage estimation')
parser.add_argument('--ent-coef', type=float, default=0.01,
                    help="coefficient of the entropy")
parser.add_argument('--max-grad-norm', type=float, default=0.5,
                    help='the maximum norm for the gradient clipping')
parser.add_argument('--clip-coef', type=float, default=0.2,
                    help="the surrogate clipping coefficient")
parser.add_argument('--update-epochs', type=int, default=10,
                    help="the K epochs to update the policy")
parser.add_argument('--kle-stop', action='store_true', default=False,
                    help='If toggled, the policy updates will be early stopped w.r.t target-kl')
parser.add_argument('--kle-rollback', action='store_true', default=False,
                    help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
parser.add_argument('--target-kl', type=float, default=0.015,
                    help='the target-kl variable that is referred by --kl')
parser.add_argument('--gae', action='store_true', default=True,
                    help='Use GAE for advantage computation')
parser.add_argument('--policy-lr', type=float, default=3e-4,
                    help="the learning rate of the policy optimizer")
parser.add_argument('--value-lr', type=float, default=3e-4,
                    help="the learning rate of the critic optimizer")
parser.add_argument('--norm-obs', action='store_true', default=True,
                    help="Toggles observation normalization")
parser.add_argument('--norm-returns', action='store_true', default=False,
                    help="Toggles returns normalization")
parser.add_argument('--norm-adv', action='store_true', default=True,
                    help="Toggles advantages normalization")
# The two help texts below used to be swapped: --obs-clip is the value handed
# to the observation clip and --rew-clip the one handed to the reward clip.
parser.add_argument('--obs-clip', type=float, default=10.0,
                    help="Value for observation clipping, as per the paper")
parser.add_argument('--rew-clip', type=float, default=10.0,
                    help="Value for reward clipping, as per the paper")
parser.add_argument('--anneal-lr', action='store_true', default=True,
                    help="Toggle learning rate annealing for policy and value networks")
parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'],
                    help='Selects the scheme to be used for weights initialization')
parser.add_argument('--clip-vloss', action="store_true", default=True,
                    help='Toggles whether or not to use a clipped loss for the value function, as per the paper.')
parser.add_argument('--pol-layer-norm', action='store_true', default=False,
                    help='Enables layer normalization in the policy network')
parser.add_argument('--invalid-action-penalty', type=float, default=0.00,
                    help='the negative reward penalty for invalid actions')

args = parser.parse_args()
# A falsy seed (e.g. ``--seed 0``) is replaced by the current Unix time.
if not args.seed:
    args.seed = int(time.time())
# Number of optional PPO features enabled for this run; stored on ``args`` so
# it is logged together with the other hyperparameters.
args.features_turned_on = sum([
    args.kle_stop, args.kle_rollback, args.gae, args.norm_obs,
    args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss,
    args.pol_layer_norm,
])

# TRY NOT TO MODIFY: setup the environment
experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
writer = SummaryWriter(f"runs/{experiment_name}")
# ALGO LOGIC: initialize agent here:
class CategoricalMasked(Categorical):
    """Categorical distribution with an optional validity mask over choices.

    Where ``masks`` is false the logit is replaced by ``-1e8`` before the
    distribution is built, and the entry is excluded from ``entropy()``.
    Without a mask (``None`` or an empty sequence) the class behaves exactly
    like ``Categorical``.

    Args:
        probs, logits, validate_args: forwarded to ``Categorical``.
        masks: tensor broadcastable against ``logits``; non-zero / true means
            "this choice is allowed". Masking needs ``logits`` to be given.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        if masks is None or len(masks) == 0:
            # Keep ``self.masks`` a sized, empty object so ``entropy`` can test it.
            self.masks = [] if masks is None else masks
            super().__init__(probs, logits, validate_args)
            return
        # Build the mask and the fill value next to the logits instead of
        # relying on a script-level ``device`` global; this also avoids the
        # detour through the CPU-only ``torch.BoolTensor`` type.
        self.masks = masks.to(device=logits.device, dtype=torch.bool)
        fill_value = torch.tensor(-1e+8, dtype=logits.dtype, device=logits.device)
        logits = torch.where(self.masks, logits, fill_value)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy, counting only the unmasked choices."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries are forced to contribute exactly zero.
        p_log_p = torch.where(self.masks, p_log_p, torch.zeros_like(p_log_p))
        return -p_log_p.sum(-1)


class Policy(nn.Module):
    """Convolutional actor producing one logit block per action component.

    Reads the script-level globals ``env`` (for ``action_space.nvec``) and
    ``device``.
    """

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(27, 16, kernel_size=3, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=2, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU())
        self.fc = nn.Sequential(
            # 32*5*5 is the flattened feature size; presumably this matches
            # the 24x24 map of the default --gym-id -- confirm before reuse.
            nn.Linear(32*5*5, 128),
            nn.ReLU(),
            # One output per entry of every MultiDiscrete component.
            nn.Linear(128, env.action_space.nvec.sum())
        )

    def forward(self, x):
        """Map a batch of channel-last observations to concatenated logits."""
        # Move the last axis to position 1 (channels first) for Conv2d.
        x = torch.Tensor(np.moveaxis(x, -1, 1)).to(device)
        x = self.features(x)
        x = x.reshape(x.size(0), -1)
        return self.fc(x)

    def get_action(self, x, action=None):
        """Sample actions (or score the given ones) for a batch of observations.

        Returns:
            ``(action, logprob, [], multi_categoricals)``. ``action`` and
            ``logprob`` are stacked per action component along dim 0; the
            third slot is an empty placeholder list.
        """
        logits = self.forward(x)
        split_logits = torch.split(logits, env.action_space.nvec.tolist(), dim=1)
        multi_categoricals = [Categorical(logits=component) for component in split_logits]
        if action is None:
            action = torch.stack([categorical.sample() for categorical in multi_categoricals])
        logprob = torch.stack([
            categorical.log_prob(a) for a, categorical in zip(action, multi_categoricals)])
        return action, logprob, [], multi_categoricals


class Value(nn.Module):
    """Convolutional critic with the same feature stack as ``Policy``.

    Reads the script-level global ``device``.
    """

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(27, 16, kernel_size=3, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=2, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU())
        self.fc = nn.Sequential(
            nn.Linear(32*5*5, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Map a batch of channel-last observations to one value per row."""
        x = torch.Tensor(np.moveaxis(x, -1, 1)).to(device)
        x = self.features(x)
        x = x.reshape(x.size(0), -1)
        return self.fc(x)
pg = Policy().to(device)
vf = Value().to(device)

# MODIFIED: Separate optimizer and learning rates
pg_optimizer = optim.Adam(list(pg.parameters()), lr=args.policy_lr)
v_optimizer = optim.Adam(list(vf.parameters()), lr=args.value_lr)

# MODIFIED: Initializing learning rate anneal scheduler when need
if args.anneal_lr:
    # Linear decay from 1 to 0 over total_timesteps; the schedulers are
    # stepped once per environment step below.
    anneal_fn = lambda f: max(0, 1-f / args.total_timesteps)
    pg_lr_scheduler = optim.lr_scheduler.LambdaLR(pg_optimizer, lr_lambda=anneal_fn)
    vf_lr_scheduler = optim.lr_scheduler.LambdaLR(v_optimizer, lr_lambda=anneal_fn)

loss_fn = nn.MSELoss()

# TRY NOT TO MODIFY: start the game
global_step = 0
while global_step < args.total_timesteps:
    if args.capture_video:
        env.stats_recorder.done = True
    next_obs = np.array(env.reset())

    # ALGO Logic: Storage for epoch data
    obs = np.empty((args.batch_size,) + env.observation_space.shape)

    actions = np.empty((args.batch_size,) + env.action_space.shape)
    # One row of log-probabilities per action component.
    logprobs = torch.zeros((env.action_space.nvec.shape[0], args.batch_size,)).to(device)

    rewards = np.zeros((args.batch_size,))
    real_rewards = []
    invalid_action_stats = []

    dones = np.zeros((args.batch_size,))
    values = torch.zeros((args.batch_size,)).to(device)

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(args.batch_size):
        env.render()
        global_step += 1
        obs[step] = next_obs.copy()

        # ALGO LOGIC: put action logic here
        with torch.no_grad():
            values[step] = vf.forward(obs[step:step+1])
            action, logproba, _, _ = pg.get_action(obs[step:step+1])

        env_action = action[:, 0].data.cpu().numpy()
        actions[step] = env_action
        logprobs[:, [step]] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        try:
            next_obs, rewards[step], dones[step], info = env.step(env_action)
            rewards[step] += pd.DataFrame([info['invalid_action_stats']]).sum().sum() * args.invalid_action_penalty
        except Exception as e:
            # Exceptions have no ``stacktrace`` attribute, so the previous
            # ``print(e.stacktrace())`` raised AttributeError and buried the
            # real failure. Report the error and re-raise it unchanged.
            print(e)
            raise
        real_rewards += [info['real_reward']]
        invalid_action_stats += [info['invalid_action_stats']]
        next_obs = np.array(next_obs)

        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            pg_lr_scheduler.step()
            vf_lr_scheduler.step()

        if dones[step]:
            # Computing the discounted returns:
            writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step)
            print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}")
            real_rewards = []
            # Sum each statistic over the episode once, then log it by
            # position (``.iloc``), as the original positional lookup did.
            stat_totals = pd.DataFrame(invalid_action_stats).sum(0)
            for idx, key in enumerate(info['invalid_action_stats']):
                writer.add_scalar(f"stats/{key}", stat_totals.iloc[idx], global_step)
            invalid_action_stats = []
            next_obs = np.array(env.reset())

    # bootstrap reward if not done. reached the batch limit
    last_value = 0
    if not dones[step]:
        last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0]
    bootstrapped_rewards = np.append(rewards, last_value)

    # calculate the returns and advantages
    if args.gae:
        bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value)
        deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1]
        advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda)
        advantages = torch.Tensor(advantages).to(device)
        returns = advantages + values
    else:
        returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1]
        advantages = returns - values.detach().cpu().numpy()
        advantages = torch.Tensor(advantages).to(device)
        returns = torch.Tensor(returns).to(device)

    # Advantage normalization
    if args.norm_adv:
        EPS = 1e-10
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

    # Optimizing policy network
    entropys = []
    # Snapshot network used by --kle-rollback to restore the previous policy.
    target_pg = Policy().to(device)
    inds = np.arange(args.batch_size,)
    for i_epoch_pi in range(args.update_epochs):
        np.random.shuffle(inds)
        for start in range(0, args.batch_size, args.minibatch_size):
            end = start + args.minibatch_size
            minibatch_ind = inds[start:end]
            target_pg.load_state_dict(pg.state_dict())

            # ``np.int`` was removed from NumPy; int64 is the dtype that
            # ``torch.LongTensor`` expects.
            mb_actions = torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T
            _, newlogproba, _, _ = pg.get_action(obs[minibatch_ind], mb_actions)
            ratio = (newlogproba - logprobs[:, minibatch_ind]).exp()

            # Policy loss as in OpenAI SpinUp
            clip_adv = torch.where(advantages[minibatch_ind] > 0,
                                   (1.+args.clip_coef) * advantages[minibatch_ind],
                                   (1.-args.clip_coef) * advantages[minibatch_ind]).to(device)

            # Entropy computation with resampled actions
            # NOTE(review): this estimate uses only the log-probs of the taken
            # actions and is *added* to the minimised loss -- confirm that
            # both are intended before changing; left untouched here.
            entropy = -(newlogproba.exp() * newlogproba).mean()
            entropys.append(entropy.item())

            policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy
            policy_loss = policy_loss.mean()

            pg_optimizer.zero_grad()
            policy_loss.backward()
            nn.utils.clip_grad_norm_(pg.parameters(), args.max_grad_norm)
            pg_optimizer.step()

            approx_kl = (logprobs[:, minibatch_ind] - newlogproba).mean()

            # Resample values
            new_values = vf.forward(obs[minibatch_ind]).view(-1)

            # Value loss clipping
            if args.clip_vloss:
                v_loss_unclipped = ((new_values - returns[minibatch_ind]) ** 2)
                v_clipped = values[minibatch_ind] + torch.clamp(
                    new_values - values[minibatch_ind], -args.clip_coef, args.clip_coef)
                v_loss_clipped = (v_clipped - returns[minibatch_ind])**2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = torch.mean((returns[minibatch_ind] - new_values).pow(2))

            v_optimizer.zero_grad()
            v_loss.backward()
            nn.utils.clip_grad_norm_(vf.parameters(), args.max_grad_norm)
            v_optimizer.step()

        # NOTE(review): the KL checks are placed at epoch level (after all
        # minibatches), which matches the "pg_stop_iter" epoch counter logged
        # below -- confirm against the upstream script's indentation.
        if args.kle_stop:
            if approx_kl > args.target_kl:
                break
        if args.kle_rollback:
            rollback_actions = torch.LongTensor(actions[minibatch_ind].astype(np.int64)).to(device).T
            rollback_logproba = pg.get_action(obs[minibatch_ind], rollback_actions)[1]
            if (logprobs[:, minibatch_ind] - rollback_logproba).mean() > args.target_kl:
                pg.load_state_dict(target_pg.state_dict())
                break

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("charts/policy_learning_rate", pg_optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar("charts/value_learning_rate", v_optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar("losses/policy_loss", policy_loss.item(), global_step)
    writer.add_scalar("losses/entropy", np.mean(entropys), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    if args.kle_stop or args.kle_rollback:
        writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)

env.close()
writer.close()
class NormalizedEnv(gym.core.Wrapper):
    """Environment wrapper that normalises observations and rewards.

    Observations are standardised with running statistics and clipped to
    ``[-clipob, clipob]``. Rewards are divided by the running standard
    deviation of the discounted return and clipped to ``[-cliprew, cliprew]``.
    The unmodified reward is exposed as ``infos['real_reward']``.

    Args:
        env: environment to wrap.
        ob: enable observation normalisation.
        ret: enable reward normalisation.
        clipob: absolute clip applied to normalised observations.
        cliprew: absolute clip applied to normalised rewards.
        gamma: discount used for the running return.
        epsilon: added to the variances before taking the square root.
    """

    def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        super().__init__(env)
        self.ob_rms = None
        if ob:
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = None
        if ret:
            self.ret_rms = RunningMeanStd(shape=(1,))
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(())
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action):
        """Step the wrapped env and return normalised ``(obs, rew, done, info)``."""
        obs, rews, news, infos = self.env.step(action)
        # Keep the raw reward available for episode-return logging.
        infos['real_reward'] = rews
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms is not None:
            self.ret_rms.update(np.array([self.ret]))
            return_std = np.sqrt(self.ret_rms.var + self.epsilon)
            rews = np.clip(rews / return_std, -self.cliprew, self.cliprew)
        # The running return is zeroed when the episode has ended.
        self.ret = self.ret * (1-float(news))
        return obs, rews, news, infos

    def _obfilt(self, obs):
        """Update the observation statistics with ``obs`` and normalise it."""
        if self.ob_rms is None:
            return obs
        self.ob_rms.update(obs)
        centred = obs - self.ob_rms.mean
        scale = np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(centred / scale, -self.clipob, self.clipob)

    def reset(self):
        """Reset the wrapped env and return the normalised first observation."""
        self.ret = np.zeros(())
        first_obs = self.env.reset()
        return self._obfilt(first_obs)
the experiment') parser.add_argument('--episode-length', type=int, default=0, help='the maximum length of each episode') parser.add_argument('--total-timesteps', type=int, default=100000, help='total timesteps of the experiments') parser.add_argument('--no-torch-deterministic', action='store_false', dest="torch_deterministic", default=True, help='if toggled, `torch.backends.cudnn.deterministic=False`') parser.add_argument('--no-cuda', action='store_false', dest="cuda", default=True, help='if toggled, cuda will not be enabled by default') parser.add_argument('--prod-mode', action='store_true', default=False, help='run the script in production mode and use wandb to log outputs') parser.add_argument('--capture-video', action='store_true', default=False, help='weather to capture videos of the agent performances (check out `videos` folder)') parser.add_argument('--wandb-project-name', type=str, default="cleanRL", help="the wandb's project name") parser.add_argument('--wandb-entity', type=str, default=None, help="the entity (team) of wandb's project") # Algorithm specific arguments parser.add_argument('--batch-size', type=int, default=2048, help='the batch size of ppo') parser.add_argument('--minibatch-size', type=int, default=256, help='the mini batch size of ppo') parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor gamma') parser.add_argument('--gae-lambda', type=float, default=0.97, help='the lambda for the general advantage estimation') parser.add_argument('--ent-coef', type=float, default=0.01, help="coefficient of the entropy") parser.add_argument('--max-grad-norm', type=float, default=0.5, help='the maximum norm for the gradient clipping') parser.add_argument('--clip-coef', type=float, default=0.2, help="the surrogate clipping coefficient") parser.add_argument('--update-epochs', type=int, default=10, help="the K epochs to update the policy") parser.add_argument('--kle-stop', action='store_true', default=False, help='If toggled, the 
policy updates will be early stopped w.r.t target-kl') parser.add_argument('--kle-rollback', action='store_true', default=False, help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') parser.add_argument('--target-kl', type=float, default=0.015, help='the target-kl variable that is referred by --kl') parser.add_argument('--gae', action='store_true', default=True, help='Use GAE for advantage computation') parser.add_argument('--policy-lr', type=float, default=3e-4, help="the learning rate of the policy optimizer") parser.add_argument('--value-lr', type=float, default=3e-4, help="the learning rate of the critic optimizer") parser.add_argument('--norm-obs', action='store_true', default=True, help="Toggles observation normalization") parser.add_argument('--norm-returns', action='store_true', default=False, help="Toggles returns normalization") parser.add_argument('--norm-adv', action='store_true', default=True, help="Toggles advantages normalization") parser.add_argument('--obs-clip', type=float, default=10.0, help="Value for reward clipping, as per the paper") parser.add_argument('--rew-clip', type=float, default=10.0, help="Value for observation clipping, as per the paper") parser.add_argument('--anneal-lr', action='store_true', default=True, help="Toggle learning rate annealing for policy and value networks") parser.add_argument('--weights-init', default="orthogonal", choices=["xavier", 'orthogonal'], help='Selects the scheme to be used for weights initialization'), parser.add_argument('--clip-vloss', action="store_true", default=True, help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') parser.add_argument('--pol-layer-norm', action='store_true', default=False, help='Enables layer normalization in the policy network') parser.add_argument('--invalid-action-penalty', type=float, default=0.00, help='the negative reward penalty for invalid actions') args = parser.parse_args() if not 
args.seed: args.seed = int(time.time()) args.features_turned_on = sum([args.kle_stop, args.kle_rollback, args.gae, args.norm_obs, args.norm_returns, args.norm_adv, args.anneal_lr, args.clip_vloss, args.pol_layer_norm]) # TRY NOT TO MODIFY: setup the environment experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" writer = SummaryWriter(f"runs/{experiment_name}") writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) if args.prod_mode: import wandb wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True) writer = SummaryWriter(f"/tmp/{experiment_name}") wandb.save(os.path.abspath(__file__)) # TRY NOT TO MODIFY: seeding device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') env = gym.make(args.gym_id) # respect the default timelimit assert isinstance(env.action_space, MultiDiscrete), "only MultiDiscrete action space is supported" assert isinstance(env, TimeLimit) or int(args.episode_length), "the gym env does not have a built in TimeLimit, please specify by using --episode-length" if isinstance(env, TimeLimit): if int(args.episode_length): env._max_episode_steps = int(args.episode_length) args.episode_length = env._max_episode_steps else: env = TimeLimit(env, int(args.episode_length)) env = NormalizedEnv(env.env, ob=args.norm_obs, ret=args.norm_returns, clipob=args.obs_clip, cliprew=args.rew_clip, gamma=args.gamma) env = TimeLimit(env, int(args.episode_length)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = args.torch_deterministic env.seed(args.seed) env.action_space.seed(args.seed) env.observation_space.seed(args.seed) if args.capture_video: env = Monitor(env, f'videos/{experiment_name}') # ALGO LOGIC: initialize agent here: class CategoricalMasked(Categorical): def 
def discount_cumsum(x, dones, gamma):
    """
    Compute discounted cumulative sums of ``x`` that restart after each done.

    input:
        vector x,   vector dones,
        [x0,        [0,
         x1,         0,
         x2,         1,
         x3,         0,
         x4]         0]

    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2,
         x3 + discount * x4,
         x4]

    Args:
        x: 1-D sequence of values; lists are accepted.
        dones: sequence of 0/1 flags; entry ``t`` cuts the link between
            ``x[t]`` and ``x[t+1]``. Only the first ``len(x) - 1`` entries
            are read.
        gamma: discount factor.

    Returns:
        Array shaped like ``x``. Floating (or complex) inputs keep their
        dtype; any other dtype is accumulated in float64 so the discounted
        values are not truncated.
    """
    x = np.asarray(x)
    # An integer buffer would silently truncate every discounted term.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    result = np.zeros(x.shape, dtype=out_dtype)
    if x.shape[0] == 0:
        # Nothing to accumulate; avoid indexing the last element.
        return result
    result[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        result[t] = x[t] + gamma * result[t + 1] * (1 - dones[t])
    return result
for step in range(args.batch_size): env.render() global_step += 1 obs[step] = next_obs.copy() # ALGO LOGIC: put action logic here with torch.no_grad(): values[step] = vf.forward(obs[step:step+1]) action, logproba, _, probs = pg.get_action(obs[step:step+1]) actions[step] = action[:,0].data.cpu().numpy() logprobs[:,[step]] = logproba # TRY NOT TO MODIFY: execute the game and log data. try: next_obs, rewards[step], dones[step], info = env.step(action[:,0].data.cpu().numpy()) rewards[step] += pd.DataFrame([info['invalid_action_stats']]).sum().sum() * args.invalid_action_penalty except Exception as e: print(e) print(e.stacktrace()) real_rewards += [info['real_reward']] invalid_action_stats += [info['invalid_action_stats']] next_obs = np.array(next_obs) # Annealing the rate if instructed to do so. if args.anneal_lr: pg_lr_scheduler.step() vf_lr_scheduler.step() if dones[step]: # Computing the discounted returns: writer.add_scalar("charts/episode_reward", np.sum(real_rewards), global_step) print(f"global_step={global_step}, episode_reward={np.sum(real_rewards)}") real_rewards = [] for key, idx in zip(info['invalid_action_stats'], range(len(info['invalid_action_stats']))): writer.add_scalar(f"stats/{key}", pd.DataFrame(invalid_action_stats).sum(0)[idx], global_step) invalid_action_stats = [] next_obs = np.array(env.reset()) # bootstrap reward if not done. 
reached the batch limit last_value = 0 if not dones[step]: last_value = vf.forward(next_obs.reshape((1,)+next_obs.shape))[0].detach().cpu().numpy()[0] bootstrapped_rewards = np.append(rewards, last_value) # calculate the returns and advantages if args.gae: bootstrapped_values = np.append(values.detach().cpu().numpy(), last_value) deltas = bootstrapped_rewards[:-1] + args.gamma * bootstrapped_values[1:] * (1-dones) - bootstrapped_values[:-1] advantages = discount_cumsum(deltas, dones, args.gamma * args.gae_lambda) advantages = torch.Tensor(advantages).to(device) returns = advantages + values else: returns = discount_cumsum(bootstrapped_rewards, dones, args.gamma)[:-1] advantages = returns - values.detach().cpu().numpy() advantages = torch.Tensor(advantages).to(device) returns = torch.Tensor(returns).to(device) # Advantage normalization if args.norm_adv: EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # Optimizaing policy network entropys = [] target_pg = Policy().to(device) inds = np.arange(args.batch_size,) for i_epoch_pi in range(args.update_epochs): np.random.shuffle(inds) for start in range(0, args.batch_size, args.minibatch_size): end = start + args.minibatch_size minibatch_ind = inds[start:end] target_pg.load_state_dict(pg.state_dict()) _, newlogproba, _, _ = pg.get_action( obs[minibatch_ind], torch.LongTensor(actions[minibatch_ind].astype(np.int)).to(device).T) ratio = (newlogproba - logprobs[:,minibatch_ind]).exp() # Policy loss as in OpenAI SpinUp clip_adv = torch.where(advantages[minibatch_ind] > 0, (1.+args.clip_coef) * advantages[minibatch_ind], (1.-args.clip_coef) * advantages[minibatch_ind]).to(device) # Entropy computation with resampled actions entropy = -(newlogproba.exp() * newlogproba).mean() entropys.append(entropy.item()) policy_loss = -torch.min(ratio * advantages[minibatch_ind], clip_adv) + args.ent_coef * entropy policy_loss = policy_loss.mean() pg_optimizer.zero_grad() policy_loss.backward() 
"""Summarise the W&B runs of the invalid-action-masking project as LaTeX tables.

For every run the full logged history is scanned to derive:

* ``first_learned_timestep`` - fraction of training at which the 10-episode
  rolling mean of ``charts/episode_reward`` first reaches 40 (1 if never);
* ``first_reward_timestep`` - fraction of training at which the first positive
  episode reward is logged;
* the mean of the last 10 logged values of the reward, KL and invalid-action
  statistics.

Runs whose ``exp_name`` is ``"ppo"`` additionally get the same figures computed
from their ``evals/...`` columns; these are reported as a separate
"Masking removed" experiment.
"""
import os

import numpy as np
import pandas as pd
import wandb

# Thresholds that were previously inlined as magic numbers.
SOLVED_REWARD = 40.0       # rolling-mean reward that counts as "learned"
ROLLING_WINDOW = 10        # episodes in the rolling mean
TOTAL_TIMESTEPS = 500000   # divisor turning a global step into a fraction
LAST_N = 10                # number of trailing log entries averaged

INVALID_ACTION_STATS = [
    'stats/num_invalid_action_null',
    'stats/num_invalid_action_busy_unit',
    'stats/num_invalid_action_ownership',
]


def _first_learned_fraction(history, rolling_column):
    """Return the training fraction at which ``rolling_column`` first hits SOLVED_REWARD (1 if never)."""
    # idxmax() returns the first row when no entry is True, hence the re-check.
    first_idx = (history[rolling_column] >= SOLVED_REWARD).idxmax()
    if history.iloc[first_idx][rolling_column] >= SOLVED_REWARD:
        return history.iloc[first_idx]["global_step"] / TOTAL_TIMESTEPS
    return 1


def _first_reward_fraction(history, reward_column):
    """Return the training fraction at which ``reward_column`` is first positive."""
    first_idx = (history[reward_column] > 0).idxmax()
    return history.iloc[first_idx]["global_step"] / TOTAL_TIMESTEPS


def _last_mean(series):
    """Return the mean of the last LAST_N non-NaN entries of ``series``."""
    return series.dropna()[-LAST_N:].mean()


api = wandb.Api()
# `os` was used here without being imported, which made the script fail
# immediately with a NameError.
wandb_entity = os.environ['WANDB_ENTITY']
# Project is specified by <entity/project-name>
runs = api.runs(f"{wandb_entity}/invalid_action_masking")
analysis = True
summary_list = []
config_list = []
name_list = []
for run in runs:
    # run.summary are the output key/values like accuracy.
    # We call ._json_dict to omit large files
    summary_json_dict = run.summary._json_dict
    if analysis:
        # Work on a copy so the derived keys are not written into the run object.
        summary_json_dict = summary_json_dict.copy()
        history = pd.DataFrame(run.scan_history())
        is_masked_ppo = run.config["exp_name"] == "ppo"

        history['rollling_e'] = (
            history['charts/episode_reward'].dropna().rolling(ROLLING_WINDOW).mean()
        )
        summary_json_dict["first_learned_timestep"] = _first_learned_fraction(history, "rollling_e")
        summary_json_dict["first_reward_timestep"] = _first_reward_fraction(
            history, 'charts/episode_reward')

        # mask removed logic
        if is_masked_ppo:
            history['evals_rollling_e'] = (
                history['evals/charts/episode_reward'].dropna().rolling(ROLLING_WINDOW).mean()
            )
            summary_json_dict["evals_first_learned_timestep"] = _first_learned_fraction(
                history, "evals_rollling_e")
            # The evaluation figure previously read the *training* reward
            # column, so it always duplicated first_reward_timestep.
            summary_json_dict["evals_first_reward_timestep"] = _first_reward_fraction(
                history, 'evals/charts/episode_reward')

        summary_json_dict["charts/episode_reward"] = history["charts/episode_reward"][-LAST_N:].mean()
        summary_json_dict["losses/approx_kl"] = _last_mean(
            history['losses/approx_kl'].astype(np.float64))
        for stat in INVALID_ACTION_STATS:
            summary_json_dict[stat] = _last_mean(history[stat])
        if is_masked_ppo:
            summary_json_dict['evals/charts/episode_reward'] = (
                history['evals/charts/episode_reward'][-LAST_N:].mean()
            )
            for stat in INVALID_ACTION_STATS:
                summary_json_dict['evals/' + stat] = _last_mean(history['evals/' + stat])
    summary_list.append(summary_json_dict)

    # run.config is the input metrics.
    # We remove special values that start with _.
    config = {k: v for k, v in run.config.items() if not k.startswith('_')}
    config_list.append(config)

    # run.name is the name of the run.
    name_list.append(run.name)

summary_df = pd.DataFrame.from_records(summary_list)
config_df = pd.DataFrame.from_records(config_list)
name_df = pd.DataFrame({'name': name_list})
all_df = pd.concat([name_df, config_df, summary_df], axis=1)
all_df.to_csv("project.csv")
all_df["losses/approx_kl"] = all_df["losses/approx_kl"].astype(np.float64)

# mask removal: re-label the evaluation columns of the "ppo" runs as a
# separate experiment so they show up as their own rows in the table.
mask_removed = all_df[all_df["exp_name"] == "ppo"].copy()
mask_removed['charts/episode_reward'] = mask_removed['evals/charts/episode_reward']
for stat in INVALID_ACTION_STATS:
    mask_removed[stat] = mask_removed['evals/' + stat]
mask_removed['first_learned_timestep'] = mask_removed['evals_first_learned_timestep']
mask_removed['first_reward_timestep'] = mask_removed['evals_first_reward_timestep']
mask_removed["exp_name"] = "masking removed"
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
final_all_df = pd.concat([all_df, mask_removed], ignore_index=True)

# change names
final_all_df.loc[final_all_df["gym_id"] == "MicrortsMining4x4F9-v0", "gym_id"] = '04x04'
final_all_df.loc[final_all_df["gym_id"] == "MicrortsMining10x10F9-v0", "gym_id"] = '10x10'
final_all_df.loc[final_all_df["gym_id"] == "MicrortsMining16x16F9-v0", "gym_id"] = '16x16'
final_all_df.loc[final_all_df["gym_id"] == "MicrortsMining24x24F9-v0", "gym_id"] = '24x24'
final_all_df.loc[final_all_df["exp_name"] == "masking removed", "exp_name"] = 'Masking removed'
final_all_df.loc[final_all_df["exp_name"] == "ppo", "exp_name"] = 'Invalid action masking'
final_all_df.loc[final_all_df["exp_name"] == "ppo_no_adj", "exp_name"] = 'Naive invalid action masking'
final_all_df.loc[final_all_df["exp_name"] == "ppo_no_mask", "exp_name"] = 'Invalid action penalty'

result_columns = [
    'charts/episode_reward',
    'losses/approx_kl',
    'stats/num_invalid_action_null',
    'stats/num_invalid_action_busy_unit',
    'stats/num_invalid_action_ownership',
    "first_learned_timestep",
    "first_reward_timestep",
]
# Select the numeric columns *before* averaging: taking the mean of every
# column first raises TypeError on pandas >= 2.0 because of the string columns.
results_df = final_all_df.fillna(0).groupby(
    ['exp_name', 'gym_id', "invalid_action_penalty"]
)[result_columns].mean()
final_print_df = results_df.round(2)
final_print_df['losses/approx_kl'] = results_df['losses/approx_kl'].round(5)
# Timesteps are shown as a percentage of the training budget.
final_print_df['first_learned_timestep'] = pd.Series(
    ["{0:.2f}%".format(val * 100) for val in results_df['first_learned_timestep'].round(4)],
    index=results_df.index)
final_print_df['first_reward_timestep'] = pd.Series(
    ["{0:.2f}%".format(val * 100) for val in results_df['first_reward_timestep'].round(4)],
    index=results_df.index)
print(final_print_df.to_latex())
print(final_print_df.drop(columns=['losses/approx_kl']).to_latex())
# plots/approx_kl.py - setup section.
#
# Parses the command line, then either downloads the per-run history of
# `--feature-of-interest` from W&B and caches it as pickles under
# `<feature_name>/`, or reloads those pickles when
# `<feature_name>/all_df_cache.pkl` already exists.
from os import path
import pickle
import wandb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import argparse
from distutils.util import strtobool
import matplotlib as mpl
mpl.rcParams['text.usetex'] = True
# NOTE(review): recent matplotlib releases appear to expect a plain string
# here rather than a list - confirm against the installed version.
mpl.rcParams['text.latex.preamble'] = [r'\usepackage{amsmath}'] #for \text command

parser = argparse.ArgumentParser(description='CleanRL Plots')
# Common arguments
parser.add_argument('--wandb-project', type=str, default="costa-huang/invalid-action-masking",
                   help='the name of wandb project (e.g. cleanrl/cleanrl)')
parser.add_argument('--feature-of-interest', type=str, default='losses/approx_kl',
                   help='which feature to be plotted on the y-axis')
parser.add_argument('--hyper-params-tuned', nargs='+', default=[],
                   help='the hyper parameters tuned')
# parser.add_argument('--scan-history', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True,
#                     help='if toggled, cuda will not be enabled by default')
# NOTE(review): the help text below is identical to --hyper-params-tuned; the
# flag is later used to restrict which experiment names are plotted.
parser.add_argument('--interested-exp-names', nargs='+', default=[],
                   help='the hyper parameters tuned')
parser.add_argument('--samples', type=int, default=500,
                   help='the sampled point of the run')
parser.add_argument('--smooth-weight', type=float, default=0.90,
                   help='the weight parameter of the exponential moving average')
parser.add_argument('--last-n-episodes', type=int, default=50,
                   help='for analysis only; the last n episodes from which the mean of the feature of interest is calculated')
# NOTE(review): --num-points-x-axis is not read anywhere in this setup section.
parser.add_argument('--num-points-x-axis', type=int, default=500,
                   help='the number of points in the x-axis')
parser.add_argument('--font-size', type=int, default=18,
                   help='the font size of the plots')
parser.add_argument('--x-label', type=str, default="Time Steps",
                   help='the label of x-axis')
parser.add_argument('--y-label', type=str, default="KL Divergence",
                   help='the label of y-axis')
parser.add_argument('--y-lim-bottom', type=float, default=0.0,
                   help='the bottom limit for the y-axis')
parser.add_argument('--output-format', type=str, default="pdf",
                   help='either `pdf`, `png`, or `svg`')
args = parser.parse_args()
api = wandb.Api()

# hacks
# Optional renaming of gym ids; every entry is currently commented out, so the
# `in env_dict` branch further down never fires.
env_dict = {
    # 'MicrortsAttackShapedReward-v1': 'MicrortsAttackHRL-v1',
    # 'MicrortsProduceCombatUnitsShapedReward-v1': 'MicrortsProduceCombatUnitHRL-v1',
    # 'MicrortsRandomEnemyShapedReward3-v1': 'MicrortsRandomEnemyHRL3-v1',
}
# Maps the internal experiment key ("<exp_name>-<invalid_action_penalty>") to
# the LaTeX label used in legends and tables.
exp_convert_dict = {
    'ppo': 'Invalid action masking',
    'ppo_no_mask-0': 'Invalid action penalty, $r_{\\text{invalid}}=0$',
    'ppo_no_mask--0.1': 'Invalid action penalty, $r_{\\text{invalid}}=-0.1$',
    'ppo_no_mask--0.01': 'Invalid action penalty, $r_{\\text{invalid}}=-0.01$',
    'ppo_no_mask--1': 'Invalid action penalty, $r_{\\text{invalid}}=-1$',
    'ppo-maskrm': 'Masking removed',
    'ppo_no_adj': 'Naive invalid action masking',
}

# args.feature_of_interest = 'charts/episode_reward'
# The feature name doubles as the output directory, with "/" made path-safe.
feature_name = args.feature_of_interest.replace("/", "_")
if not os.path.exists(feature_name):
    os.makedirs(feature_name)

if not path.exists(f"{feature_name}/all_df_cache.pkl"):
    # No cache yet: download every run of the project.
    # Change oreilly-class/cifar to <entity/project-name>
    runs = api.runs(args.wandb_project)
    summary_list = []
    config_list = []
    name_list = []
    # `envs` maps gym_id -> list of per-run dataframes, plus one extra key
    # "<gym_id>total_timesteps" holding that environment's training budget.
    envs = {}
    data = []
    exp_names = []

    for idx, run in enumerate(runs):
        # Only runs that logged the requested feature are kept.
        if args.feature_of_interest in run.summary:
            metrics_dataframe = run.history(keys=[args.feature_of_interest, 'global_step'], samples=args.samples)
            exp_name = run.config['exp_name']
            for param in args.hyper_params_tuned:
                if param in run.config:
                    exp_name += "-" + param + "-" + str(run.config[param]) + "-"

            # hacks
            # This overwrites whatever the --hyper-params-tuned loop built above.
            if "invalid_action_penalty" in run.config:
                exp_name = run.config['exp_name']+"-"+str(run.config['invalid_action_penalty'])

            # hacks
            if run.config["gym_id"] in env_dict:
                exp_name += "shaped"
                run.config["gym_id"] = env_dict[run.config["gym_id"]]

            # Tag every row with its experiment key and seed so runs can be
            # grouped after concatenation.
            metrics_dataframe.insert(len(metrics_dataframe.columns), "algo", exp_name)
            exp_names += [exp_name]
            metrics_dataframe.insert(len(metrics_dataframe.columns), "seed", run.config['seed'])

            data += [metrics_dataframe]
            if run.config["gym_id"] not in envs:
                envs[run.config["gym_id"]] = [metrics_dataframe]
                envs[run.config["gym_id"]+"total_timesteps"] = run.config["total_timesteps"]
            else:
                envs[run.config["gym_id"]] += [metrics_dataframe]

            # run.summary are the output key/values like accuracy.  We call ._json_dict to omit large files
            summary_list.append(run.summary._json_dict)

            # run.config is the input metrics.  We remove special values that start with _.
            config_list.append({k:v for k,v in run.config.items() if not k.startswith('_')})

            # run.name is the name of the run.
            name_list.append(run.name)

    summary_df = pd.DataFrame.from_records(summary_list)
    config_df = pd.DataFrame.from_records(config_list)
    name_df = pd.DataFrame({'name': name_list})
    all_df = pd.concat([name_df, config_df,summary_df], axis=1)
    data = pd.concat(data, ignore_index=True)

    # Persist everything needed to re-plot without contacting W&B again.
    with open(f'{feature_name}/all_df_cache.pkl', 'wb') as handle:
        pickle.dump(all_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'{feature_name}/envs_cache.pkl', 'wb') as handle:
        pickle.dump(envs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f'{feature_name}/exp_names_cache.pkl', 'wb') as handle:
        pickle.dump(exp_names, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Cache hit: reload the three pickles written by a previous invocation.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use caches produced locally by this script.
    with open(f'{feature_name}/all_df_cache.pkl', 'rb') as handle:
        all_df = pickle.load(handle)
    with open(f'{feature_name}/envs_cache.pkl', 'rb') as handle:
        envs = pickle.load(handle)
    with open(f'{feature_name}/exp_names_cache.pkl', 'rb') as handle:
        exp_names = pickle.load(handle)
print("data loaded")
def get_df_for_env(gym_id, envs_by_id=None, num_points=500):
    """Resample every run of one environment onto a shared ``global_step`` grid.

    The grid has ``num_points - 2`` equally spaced steps starting at 0 with
    spacing ``total_timesteps / num_points``.  For each grid step the first
    logged row whose ``global_step`` is not smaller than the grid step is
    copied and its ``global_step`` is overwritten with the grid value.  The
    last logged row of a run is never used, and a run stops contributing as
    soon as the grid moves past its second-to-last row.

    Args:
        gym_id: Environment id.  ``envs_by_id[gym_id]`` must be a list of
            per-run dataframes with a ``global_step`` column and
            ``envs_by_id[gym_id + "total_timesteps"]`` the training budget.
        envs_by_id: Mapping to read from; defaults to the module-level
            ``envs`` dictionary.
        num_points: Grid resolution (previously the hard-coded literal 500).

    Returns:
        One dataframe with the resampled rows of all runs, re-indexed from 0.

    Raises:
        KeyError: If ``gym_id`` is not present in the mapping.
        ValueError: If the environment has no runs (nothing to concatenate).
    """
    if envs_by_id is None:
        envs_by_id = envs
    env_total_timesteps = envs_by_id[gym_id + "total_timesteps"]
    env_increment = env_total_timesteps / num_points
    # The grid is identical for every run, so build it once.
    x_axis = [i * env_increment for i in range(num_points - 2)]

    envs_same_x_axis = []
    for sampled_run in envs_by_id[gym_id]:
        steps = sampled_run["global_step"].to_numpy()
        # Rows past the second-to-last one are never emitted; for runs with
        # fewer than two rows this is negative and the run yields no rows
        # (an empty run used to crash on ``iloc[0]``).
        last_usable_row = len(sampled_run) - 2
        rows = []
        current_row = 0
        for timestep in x_axis:
            while current_row <= last_usable_row and steps[current_row] < timestep:
                current_row += 1
            if current_row > last_usable_row:
                break
            temp_row = sampled_run.iloc[current_row].copy()
            temp_row["global_step"] = timestep
            rows.append(temp_row)
        # Collect rows in a list and build the frame once: DataFrame.append
        # was removed in pandas 2.0 and copied the whole frame per row.
        envs_same_x_axis.append(pd.DataFrame(rows, columns=sampled_run.columns))
    return pd.concat(envs_same_x_axis, ignore_index=True)
# plots/approx_kl.py - plotting and statistics section.
#
# For every environment: build (or reload) the resampled dataframe, draw one
# smoothed line per experiment, save the figure, and collect the per-seed mean
# of the feature over the last `--last-n-episodes` grid steps.  Afterwards the
# shared legend is exported and a LaTeX table of "mean ± std" is printed.
for sub_dir in ("data", "plots", "legends"):
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(f"{feature_name}/{sub_dir}", exist_ok=True)

interested_exp_names = sorted(set(exp_names)) # ['ppo_continuous_action', 'ppo_atari_visual']
# Colours are assigned over *all* experiment names before any filtering so an
# experiment keeps its colour regardless of --interested-exp-names.
current_palette = sns.color_palette(n_colors=len(interested_exp_names))
current_palette_dict = dict(zip(interested_exp_names, current_palette))
if args.interested_exp_names:
    interested_exp_names = args.interested_exp_names
print(current_palette_dict)

if args.font_size:
    plt.rc('axes', titlesize=args.font_size)     # fontsize of the axes title
    plt.rc('axes', labelsize=args.font_size)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=args.font_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=args.font_size)    # fontsize of the tick labels
    plt.rc('legend', fontsize=args.font_size)    # legend fontsize


def _smooth(df):
    """Return a copy of one (seed, algo) group with the feature column smoothed."""
    smoothed = smooth(list(df[args.feature_of_interest]), args.smooth_weight)
    return df.assign(**{args.feature_of_interest: smoothed})


stats = {item: [] for item in ["gym_id", "exp_name", args.feature_of_interest]}
legend_frames = []
# uncomment the following to generate all figures
for env in set(all_df["gym_id"]):
    data_path = f"{feature_name}/data/{env}.pkl"
    if not path.exists(data_path):
        # Compute first, write second: opening the file before the data exists
        # left an empty pickle behind whenever get_df_for_env raised, and the
        # next invocation then failed while loading that file.
        data = get_df_for_env(env)
        data["seed"] = data["seed"].astype(float)
        data[args.feature_of_interest] = data[args.feature_of_interest].astype(float)
        with open(data_path, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(data_path, 'rb') as handle:
            data = pickle.load(handle)
            print(f"{env}'s data loaded")

    legend_frames.append(data)
    # group_keys=False keeps the original row index on every pandas version,
    # which the boolean mask built from `data` relies on.
    smoothed_data = data.groupby(["seed", "algo"], group_keys=False).apply(_smooth)
    ax = sns.lineplot(data=smoothed_data.loc[data['algo'].isin(interested_exp_names)],
                      x="global_step", y=args.feature_of_interest, hue="algo", ci='sd',
                      palette=current_palette_dict,)
    ax.ticklabel_format(style='sci', scilimits=(0,0), axis='x')
    ax.set(xlabel=args.x_label, ylabel=args.y_label)
    # hack
    ax.set_ylim(0, 0.07)
    # ax.set(ylabel="")
    # ax.set_xticks([])
    ax.legend().remove()
    # NOTE: with the default of 0.0 this test is falsy, so the limit is only
    # applied for a non-zero --y-lim-bottom.
    if args.y_lim_bottom:
        plt.ylim(bottom=args.y_lim_bottom)
    # plt.title(env)
    plt.tight_layout()
    plt.savefig(f"{feature_name}/plots/{env}.{args.output_format}")
    plt.clf()

    for algo in interested_exp_names:
        algo_data = data.loc[data['algo'].isin([algo])]
        steps = sorted(algo_data["global_step"].unique())
        if not steps:
            # This experiment has no data for this environment.
            continue
        # Clamp so runs with fewer than last_n_episodes grid steps no longer
        # raise IndexError.
        cutoff_step = steps[-min(args.last_n_episodes, len(steps))]
        # Select the feature before averaging; averaging every column first
        # fails on the string-valued "algo" column with pandas >= 2.0.
        last_n_episodes_features = algo_data[algo_data['global_step'] > cutoff_step].groupby(
            ['seed']
        )[args.feature_of_interest].mean()

        exp_label = exp_convert_dict.get(algo, algo)
        for item in last_n_episodes_features:
            stats[args.feature_of_interest] += [item]
            stats['exp_name'] += [exp_label]
            stats['gym_id'] += [env]

# export legend
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
legend_df = pd.concat(legend_frames) if legend_frames else pd.DataFrame()
legend_df = legend_df.reset_index()
ax = sns.lineplot(data=legend_df, x="global_step", y=args.feature_of_interest, hue="algo", ci='sd', palette=current_palette_dict,)
ax.set(xlabel='Time Steps', ylabel='Average Episode Reward')
ax.legend().remove()
export_legend(ax, f"{feature_name}/legend.{args.output_format}")
plt.clf()


# analysis
stats_df = pd.DataFrame(stats)
g = stats_df.groupby(
    ['gym_id','exp_name']
).agg(lambda x: f"{np.mean(x):.2f} ± {np.std(x):.2f}")
# Keyword arguments: positional pivot() arguments were removed in pandas 2.0.
# Raw string: "\p" is not a valid escape sequence.
print(g.reset_index().pivot(index='exp_name', columns='gym_id', values=args.feature_of_interest).to_latex().replace("±", r"$\pm$"))
parser = argparse.ArgumentParser(description='CleanRL Plots') # Common arguments parser.add_argument('--wandb-project', type=str, default="costa-huang/invalid-action-masking", help='the name of wandb project (e.g. cleanrl/cleanrl)') parser.add_argument('--feature-of-interest', type=str, default='charts/episode_reward', help='which feature to be plotted on the y-axis') parser.add_argument('--hyper-params-tuned', nargs='+', default=[], help='the hyper parameters tuned') # parser.add_argument('--scan-history', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, # help='if toggled, cuda will not be enabled by default') parser.add_argument('--interested-exp-names', nargs='+', default=[], help='the hyper parameters tuned') parser.add_argument('--samples', type=int, default=500, help='the sampled point of the run') parser.add_argument('--smooth-weight', type=float, default=0.90, help='the weight parameter of the exponential moving average') parser.add_argument('--last-n-episodes', type=int, default=50, help='for analysis only; the last n episodes from which the mean of the feature of interest is calculated') parser.add_argument('--num-points-x-axis', type=int, default=500, help='the number of points in the x-axis') parser.add_argument('--font-size', type=int, default=18, help='the font size of the plots') parser.add_argument('--x-label', type=str, default="Time Steps", help='the label of x-axis') parser.add_argument('--y-label', type=str, default="Episodic Return", help='the label of y-axis') parser.add_argument('--y-lim-bottom', type=float, default=0.0, help='the bottom limit for the y-axis') parser.add_argument('--output-format', type=str, default="pdf", help='either `pdf`, `png`, or `svg`') args = parser.parse_args() api = wandb.Api() # hacks env_dict = { # 'MicrortsAttackShapedReward-v1': 'MicrortsAttackHRL-v1', # 'MicrortsProduceCombatUnitsShapedReward-v1': 'MicrortsProduceCombatUnitHRL-v1', # 'MicrortsRandomEnemyShapedReward3-v1': 
'MicrortsRandomEnemyHRL3-v1', } exp_convert_dict = { 'ppo': 'Invalid action masking', 'ppo_no_mask-0': 'Invalid action penalty, $r_{\\text{invalid}}=0$', 'ppo_no_mask--0.1': 'Invalid action penalty, $r_{\\text{invalid}}=-0.1$', 'ppo_no_mask--0.01': 'Invalid action penalty, $r_{\\text{invalid}}=-0.01$', 'ppo_no_mask--1': 'Invalid action penalty, $r_{\\text{invalid}}=-1$', 'ppo-maskrm': 'Masking removed', 'ppo_no_adj': 'Naive invalid action masking', } # args.feature_of_interest = 'charts/episode_reward' feature_name = args.feature_of_interest.replace("/", "_") if not os.path.exists(feature_name): os.makedirs(feature_name) if not path.exists(f"{feature_name}/all_df_cache.pkl"): # Change oreilly-class/cifar to runs = api.runs(args.wandb_project) summary_list = [] config_list = [] name_list = [] envs = {} data = [] exp_names = [] for idx, run in enumerate(runs): if args.feature_of_interest in run.summary: metrics_dataframe = run.history(keys=[args.feature_of_interest, 'global_step'], samples=args.samples) exp_name = run.config['exp_name'] for param in args.hyper_params_tuned: if param in run.config: exp_name += "-" + param + "-" + str(run.config[param]) + "-" # hacks if "invalid_action_penalty" in run.config: exp_name = run.config['exp_name']+"-"+str(run.config['invalid_action_penalty']) # hacks if run.config["gym_id"] in env_dict: exp_name += "shaped" run.config["gym_id"] = env_dict[run.config["gym_id"]] metrics_dataframe.insert(len(metrics_dataframe.columns), "algo", exp_name) exp_names += [exp_name] metrics_dataframe.insert(len(metrics_dataframe.columns), "seed", run.config['seed']) data += [metrics_dataframe] if run.config["gym_id"] not in envs: envs[run.config["gym_id"]] = [metrics_dataframe] envs[run.config["gym_id"]+"total_timesteps"] = run.config["total_timesteps"] else: envs[run.config["gym_id"]] += [metrics_dataframe] # hacks if exp_name == "ppo": metrics_dataframe = run.history(keys=["evals/"+args.feature_of_interest, 'global_step'], samples=args.samples) 
# https://stackoverflow.com/questions/42281844/what-is-the-mathematics-behind-the-smoothing-parameter-in-tensorboards-scalar#_=_
def smooth(scalars, weight):  # Weight between 0 and 1
    """Return the exponential moving average of ``scalars`` as a list.

    Each output is ``previous_output * weight + (1 - weight) * point``, with
    the first input value used as the initial "previous output".

    Args:
        scalars: Any iterable of numbers (list, generator, pandas Series, ...).
            An empty iterable yields an empty list.
        weight: Smoothing factor; 0 returns the input unchanged, values close
            to 1 smooth more strongly.

    Returns:
        A list with one smoothed value per input value.
    """
    smoothed = []
    last = None
    for point in scalars:
        if last is None:
            # Seed with the first value.  Reading it from the iteration rather
            # than via ``scalars[0]`` makes empty input, iterators and Series
            # whose labels do not start at 0 work too.
            last = point
        smoothed_val = last * weight + (1 - weight) * point  # Calculate smoothed value
        smoothed.append(smoothed_val)                        # Save it
        last = smoothed_val                                  # Anchor the last smoothed value
    return smoothed
# plots/episode_reward.py - plotting and statistics section.
#
# For every environment: build (or reload) the resampled dataframe, draw one
# smoothed line per experiment, save the figure, and collect the per-seed mean
# of the feature over the last `--last-n-episodes` grid steps.  Afterwards the
# shared legend is exported and a LaTeX table of "mean ± std" is printed.
for sub_dir in ("data", "plots", "legends"):
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(f"{feature_name}/{sub_dir}", exist_ok=True)

interested_exp_names = sorted(set(exp_names)) # ['ppo_continuous_action', 'ppo_atari_visual']
# Colours are assigned over *all* experiment names before any filtering so an
# experiment keeps its colour regardless of --interested-exp-names.
current_palette = sns.color_palette(n_colors=len(interested_exp_names))
current_palette_dict = dict(zip(interested_exp_names, current_palette))
if args.interested_exp_names:
    interested_exp_names = args.interested_exp_names
print(current_palette_dict)

if args.font_size:
    plt.rc('axes', titlesize=args.font_size)     # fontsize of the axes title
    plt.rc('axes', labelsize=args.font_size)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=args.font_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=args.font_size)    # fontsize of the tick labels
    plt.rc('legend', fontsize=args.font_size)    # legend fontsize


def _smooth(df):
    """Return a copy of one (seed, algo) group with the feature column smoothed."""
    smoothed = smooth(list(df[args.feature_of_interest]), args.smooth_weight)
    return df.assign(**{args.feature_of_interest: smoothed})


stats = {item: [] for item in ["gym_id", "exp_name", args.feature_of_interest]}
legend_frames = []
# uncomment the following to generate all figures
for env in set(all_df["gym_id"]):
    data_path = f"{feature_name}/data/{env}.pkl"
    if not path.exists(data_path):
        # Compute first, write second: opening the file before the data exists
        # left an empty pickle behind whenever get_df_for_env raised, and the
        # next invocation then failed while loading that file.
        data = get_df_for_env(env)
        data["seed"] = data["seed"].astype(float)
        data[args.feature_of_interest] = data[args.feature_of_interest].astype(float)
        with open(data_path, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(data_path, 'rb') as handle:
            data = pickle.load(handle)
            print(f"{env}'s data loaded")

    legend_frames.append(data)
    # group_keys=False keeps the original row index on every pandas version,
    # which the boolean mask built from `data` relies on.
    smoothed_data = data.groupby(["seed", "algo"], group_keys=False).apply(_smooth)
    ax = sns.lineplot(data=smoothed_data.loc[data['algo'].isin(interested_exp_names)],
                      x="global_step", y=args.feature_of_interest, hue="algo", ci='sd',
                      palette=current_palette_dict,)
    ax.ticklabel_format(style='sci', scilimits=(0,0), axis='x')
    ax.set(xlabel=args.x_label, ylabel=args.y_label)
    # hack: the x-axis label and ticks are blanked right after being set.
    ax.set(xlabel="")
    ax.set_xticks([])
    ax.legend().remove()
    # NOTE: with the default of 0.0 this test is falsy, so the limit is only
    # applied for a non-zero --y-lim-bottom.
    if args.y_lim_bottom:
        plt.ylim(bottom=args.y_lim_bottom)
    # plt.title(env)
    plt.tight_layout()
    plt.savefig(f"{feature_name}/plots/{env}.{args.output_format}")
    plt.clf()

    for algo in interested_exp_names:
        algo_data = data.loc[data['algo'].isin([algo])]
        steps = sorted(algo_data["global_step"].unique())
        if not steps:
            # This experiment has no data for this environment.
            continue
        # Clamp so runs with fewer than last_n_episodes grid steps no longer
        # raise IndexError.
        cutoff_step = steps[-min(args.last_n_episodes, len(steps))]
        # Select the feature before averaging; averaging every column first
        # fails on the string-valued "algo" column with pandas >= 2.0.
        last_n_episodes_features = algo_data[algo_data['global_step'] > cutoff_step].groupby(
            ['seed']
        )[args.feature_of_interest].mean()

        exp_label = exp_convert_dict.get(algo, algo)
        for item in last_n_episodes_features:
            stats[args.feature_of_interest] += [item]
            stats['exp_name'] += [exp_label]
            stats['gym_id'] += [env]

# export legend
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
legend_df = pd.concat(legend_frames) if legend_frames else pd.DataFrame()
legend_df = legend_df.reset_index()
ax = sns.lineplot(data=legend_df, x="global_step", y=args.feature_of_interest, hue="algo", ci='sd', palette=current_palette_dict,)
ax.set(xlabel='Time Steps', ylabel='Average Episode Reward')
ax.legend().remove()
export_legend(ax, f"{feature_name}/legend.{args.output_format}")
plt.clf()


# analysis
stats_df = pd.DataFrame(stats)
g = stats_df.groupby(
    ['gym_id','exp_name']
).agg(lambda x: f"{np.mean(x):.2f} ± {np.std(x):.2f}")
# Keyword arguments: positional pivot() arguments were removed in pandas 2.0.
# Raw string: "\p" is not a valid escape sequence.
print(g.reset_index().pivot(index='exp_name', columns='gym_id', values=args.feature_of_interest).to_latex().replace("±", r"$\pm$"))
if __name__ == "__main__":
    # Command-line interface of the PPO script. Boolean options accept an
    # optional value (e.g. `--cuda False`); given without a value they are True.
    parser = argparse.ArgumentParser(description='PPO agent')
    # Common arguments
    # os.path.splitext removes only the extension; str.rstrip(".py") would strip
    # every trailing '.', 'p' and 'y' character (e.g. "ppo_copy.py" -> "ppo_co").
    parser.add_argument('--exp-name', type=str, default=os.path.splitext(os.path.basename(__file__))[0],
                        help='the name of this experiment')
    parser.add_argument('--gym-id', type=str, default="MicrortsMining10x10F9-v0",
                        help='the id of the gym environment')
    parser.add_argument('--learning-rate', type=float, default=2.5e-4,
                        help='the learning rate of the optimizer')
    parser.add_argument('--seed', type=int, default=1,
                        help='seed of the experiment')
    parser.add_argument('--total-timesteps', type=int, default=10000000,
                        help='total timesteps of the experiments')
    parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help='value assigned to `torch.backends.cudnn.deterministic`; pass `False` to disable it')
    parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help='use cuda when it is available; pass `False` to force the cpu')
    parser.add_argument('--prod-mode', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True,
                        help='run the script in production mode and use wandb to log outputs')
    parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True,
                        help='whether to capture videos of the agent performances (check out `videos` folder)')
    parser.add_argument('--wandb-project-name', type=str, default="cleanRL",
                        help="the wandb's project name")
    parser.add_argument('--wandb-entity', type=str, default=None,
                        help="the entity (team) of wandb's project")

    # Algorithm specific arguments
    parser.add_argument('--n-minibatch', type=int, default=4,
                        help='the number of mini batch')
    parser.add_argument('--num-envs', type=int, default=8,
                        help='the number of parallel game environment')
    parser.add_argument('--num-steps', type=int, default=128,
                        help='the number of steps per game environment')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='the discount factor gamma')
    parser.add_argument('--gae-lambda', type=float, default=0.95,
                        help='the lambda for the general advantage estimation')
    parser.add_argument('--ent-coef', type=float, default=0.01,
                        help="coefficient of the entropy")
    parser.add_argument('--vf-coef', type=float, default=0.5,
                        help="coefficient of the value function")
    parser.add_argument('--max-grad-norm', type=float, default=0.5,
                        help='the maximum norm for the gradient clipping')
    parser.add_argument('--clip-coef', type=float, default=0.1,
                        help="the surrogate clipping coefficient")
    parser.add_argument('--update-epochs', type=int, default=4,
                        help="the K epochs to update the policy")
    parser.add_argument('--kle-stop', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True,
                        help='If toggled, the policy updates will be early stopped w.r.t target-kl')
    parser.add_argument('--kle-rollback', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True,
                        help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl')
    parser.add_argument('--target-kl', type=float, default=0.03,
                        help='the KL threshold used by --kle-stop and --kle-rollback')
    parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help='Use GAE for advantage computation')
    parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help="Toggles advantages normalization")
    parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True,
                        help='Toggles whether or not to use a clipped loss for the value function, as per the paper.')

    args = parser.parse_args()
    # A falsy seed (0) means "pick one from the clock".
    if not args.seed:
        args.seed = int(time.time())
    # Derived sizes: one batch is a full rollout over all environments.
    args.batch_size = int(args.num_envs * args.num_steps)
    args.minibatch_size = int(args.batch_size // args.n_minibatch)
class ImageToPyTorch(gym.ObservationWrapper):
    """Observation wrapper that moves the last observation axis to the front."""

    def __init__(self, env):
        super().__init__(env)
        source_shape = self.observation_space.shape
        # Same permutation as `observation` below: (last, first, second).
        permuted_shape = (source_shape[-1], source_shape[0], source_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=permuted_shape, dtype=np.int32,
        )

    def observation(self, observation):
        """Return `observation` with its axes reordered to (2, 0, 1)."""
        return np.transpose(observation, axes=(2, 0, 1))


class VecPyTorch(VecEnvWrapper):
    """Vectorised-env wrapper that exchanges torch tensors instead of numpy arrays."""

    def __init__(self, venv, device):
        super().__init__(venv)
        self.device = device

    def reset(self):
        """Reset the wrapped envs and return the observations as a float tensor on `self.device`."""
        return torch.from_numpy(self.venv.reset()).float().to(self.device)

    def step_async(self, actions):
        """Hand the actions to the wrapped envs as a numpy array."""
        self.venv.step_async(actions.cpu().numpy())

    def step_wait(self):
        """Return (obs, reward, done, info) with obs and reward converted to float tensors."""
        raw_obs, raw_reward, done, info = self.venv.step_wait()
        obs_tensor = torch.from_numpy(raw_obs).float().to(self.device)
        # The reward gets a trailing unit dimension and is not moved to `self.device`.
        reward_tensor = torch.from_numpy(raw_reward).unsqueeze(dim=1).float()
        return obs_tensor, reward_tensor, done, info


class MicroRTSStatsRecorder(gym.Wrapper):
    """Accumulates `info["raw_rewards"]` over an episode and reports the per-component sums."""

    def reset(self, **kwargs):
        first_observation = super().reset(**kwargs)
        self.raw_rewards = []
        return first_observation

    def step(self, action):
        observation, reward, done, info = super().step(action)
        self.raw_rewards.append(info["raw_rewards"])
        if not done:
            return observation, reward, done, info
        # Episode finished: sum each reward component over all recorded steps and
        # publish the totals keyed by the string form of each entry in `self.rfs`.
        episode_totals = np.array(self.raw_rewards).sum(0)
        component_names = [str(rf) for rf in self.rfs]
        info['microrts_stats'] = dict(zip(component_names, episode_totals))
        self.raw_rewards = []
        return observation, reward, done, info
class CategoricalMasked(Categorical):
    """Categorical distribution with invalid choices masked out through the logits.

    Entries whose mask is false have their logit replaced by ``-1e8`` before the
    distribution is built, and are excluded from the entropy sum.

    Args:
        probs: forwarded to ``Categorical``; only usable when no mask is given.
        logits: unnormalised log-probabilities; required when ``masks`` is given.
        validate_args: forwarded to ``Categorical``.
        masks: tensor (or array-like) with the shape of ``logits``; non-zero/true
            marks a valid choice. ``None`` or an empty sequence disables masking.

    Raises:
        ValueError: if ``masks`` is given without ``logits``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=None):
        # `None` replaces the former mutable `[]` default; an explicit empty
        # sequence is still accepted and also means "no masking".
        if masks is None or len(masks) == 0:
            self.masks = []
            super().__init__(probs, logits, validate_args)
            return
        if logits is None:
            raise ValueError("CategoricalMasked needs `logits` when `masks` is given")
        # Follow the device of the logits instead of relying on a module-level
        # `device` variable, and convert to bool without a CPU round trip.
        self.masks = torch.as_tensor(masks).to(device=logits.device, dtype=torch.bool)
        # A large negative (but finite) logit drives the probability to zero.
        logits = torch.where(self.masks, logits, logits.new_tensor(-1e+8))
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Return the entropy computed over the unmasked entries only."""
        if len(self.masks) == 0:
            return super().entropy()
        p_log_p = self.logits * self.probs
        # Masked entries are forced to contribute exactly zero.
        p_log_p = torch.where(self.masks, p_log_p, p_log_p.new_tensor(0.))
        return -p_log_p.sum(-1)
class Agent(nn.Module):
    """Actor-critic model: a shared convolutional trunk with a policy head and a value head."""

    def __init__(self, frames=4):
        # `frames` is accepted but not used by this class.
        super().__init__()
        # Layers are created in the same order as they appear in the network.
        first_conv = layer_init(nn.Conv2d(27, 16, kernel_size=3, stride=2))
        second_conv = layer_init(nn.Conv2d(16, 32, kernel_size=2))
        hidden = layer_init(nn.Linear(32*3*3, 128))
        self.network = nn.Sequential(
            first_conv,
            nn.ReLU(),
            second_conv,
            nn.ReLU(),
            nn.Flatten(),
            hidden,
            nn.ReLU(),
        )
        # One logit per choice of every component of the MultiDiscrete action space.
        total_logits = envs.action_space.nvec.sum()
        self.actor = layer_init(nn.Linear(128, total_logits), std=0.01)
        self.critic = layer_init(nn.Linear(128, 1), std=1)

    def forward(self, x):
        """Return the shared features for observations `x`."""
        return self.network(x)

    def get_action(self, x, action=None, invalid_action_masks=None):
        """Sample (or score) an action for observations `x`.

        The actor output is split along dim 1 into one chunk per action
        component; each chunk parameterises its own categorical distribution,
        masked when `invalid_action_masks` is given. When `action` is None a
        sample is drawn from every component and stacked along a new leading
        dimension; otherwise the given `action` is scored.

        Returns:
            (action, log-probability summed over components,
             entropy summed over components)
        """
        component_sizes = envs.action_space.nvec.tolist()
        flat_logits = self.actor(self.forward(x))
        logit_chunks = torch.split(flat_logits, component_sizes, dim=1)

        if invalid_action_masks is None:
            distributions = [Categorical(logits=chunk) for chunk in logit_chunks]
        else:
            mask_chunks = torch.split(invalid_action_masks, component_sizes, dim=1)
            distributions = [
                CategoricalMasked(logits=chunk, masks=mask_chunk)
                for chunk, mask_chunk in zip(logit_chunks, mask_chunks)
            ]

        if action is None:
            action = torch.stack([dist.sample() for dist in distributions])
        per_component_logprob = torch.stack(
            [dist.log_prob(component) for component, dist in zip(action, distributions)]
        )
        per_component_entropy = torch.stack([dist.entropy() for dist in distributions])
        return action, per_component_logprob.sum(0), per_component_entropy.sum(0)

    def get_value(self, x):
        """Return the critic's value estimate for observations `x`."""
        return self.critic(self.forward(x))
# ALGO Logic: Storage for epoch data
# Rollout buffers; the first two dimensions are (step, environment).
obs = torch.zeros((args.num_steps, args.num_envs) + envs.observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)
invalid_action_masks = torch.zeros((args.num_steps, args.num_envs) + (envs.action_space.nvec.sum(),)).to(device)
# TRY NOT TO MODIFY: start the game
global_step = 0
# Note how `next_obs` and `next_done` are used; their usage is equivalent to
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60
next_obs = envs.reset()
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_timesteps // args.batch_size
for update in range(1, num_updates+1):
    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = lr(frac)
        optimizer.param_groups[0]['lr'] = lrnow

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(0, args.num_steps):
        envs.env_method("render", indices=0)
        global_step += 1 * args.num_envs
        obs[step] = next_obs
        dones[step] = next_done
        # The mask is read from the environments before acting and stored so the
        # same mask can be re-applied when the action is re-scored during the update.
        invalid_action_masks[step] = torch.Tensor(np.array(envs.get_attr("action_mask")))

        # ALGO LOGIC: put action logic here
        with torch.no_grad():
            values[step] = agent.get_value(obs[step]).flatten()
            action, logproba, _ = agent.get_action(obs[step], invalid_action_masks=invalid_action_masks[step])

        # `action` is stacked per action component; transpose to (env, component).
        actions[step] = action.T
        logprobs[step] = logproba

        # TRY NOT TO MODIFY: execute the game and log data.
        next_obs, rs, ds, infos = envs.step(action.T)
        rewards[step], next_done = rs.view(-1), torch.Tensor(ds).to(device)

        # Only the first finished episode found in this step is logged.
        for info in infos:
            if 'episode' in info:
                print(f"global_step={global_step}, episode_reward={info['episode']['r']}")
                writer.add_scalar("charts/episode_reward", info['episode']['r'], global_step)
                for key in info['microrts_stats']:
                    writer.add_scalar(f"charts/episode_reward/{key}", info['microrts_stats'][key], global_step)
                break

    # bootstrap reward if not done. reached the batch limit
    with torch.no_grad():
        last_value = agent.get_value(next_obs.to(device)).reshape(1, -1)
        if args.gae:
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = last_value
                else:
                    nextnonterminal = 1.0 - dones[t+1]
                    nextvalues = values[t+1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards).to(device)
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = last_value
                else:
                    nextnonterminal = 1.0 - dones[t+1]
                    next_return = returns[t+1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values

    # flatten the batch
    b_obs = obs.reshape((-1,)+envs.observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,)+envs.action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)
    b_invalid_action_masks = invalid_action_masks.reshape((-1, invalid_action_masks.shape[-1]))

    # Optimizing the policy and value network
    # `target_agent` keeps the weights from the start of each epoch so that
    # --kle-rollback can restore them.
    target_agent = Agent().to(device)
    inds = np.arange(args.batch_size,)
    for i_epoch_pi in range(args.update_epochs):
        np.random.shuffle(inds)
        target_agent.load_state_dict(agent.state_dict())
        for start in range(0, args.batch_size, args.minibatch_size):
            end = start + args.minibatch_size
            minibatch_ind = inds[start:end]
            mb_advantages = b_advantages[minibatch_ind]
            if args.norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            _, newlogproba, entropy = agent.get_action(
                b_obs[minibatch_ind],
                b_actions.long()[minibatch_ind].T,
                b_invalid_action_masks[minibatch_ind])
            ratio = (newlogproba - b_logprobs[minibatch_ind]).exp()

            # Stats
            approx_kl = (b_logprobs[minibatch_ind] - newlogproba).mean()

            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1-args.clip_coef, 1+args.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()
            entropy_loss = entropy.mean()

            # Value loss
            new_values = agent.get_value(b_obs[minibatch_ind]).view(-1)
            if args.clip_vloss:
                v_loss_unclipped = ((new_values - b_returns[minibatch_ind]) ** 2)
                v_clipped = b_values[minibatch_ind] + torch.clamp(new_values - b_values[minibatch_ind], -args.clip_coef, args.clip_coef)
                v_loss_clipped = (v_clipped - b_returns[minibatch_ind])**2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                # Reduce to a scalar like the clipped branch does; without the
                # mean, `loss` is a vector and both `loss.backward()` and
                # `v_loss.item()` below fail.
                v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean()

            loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

        # Both KL checks use the last minibatch of the epoch that just finished.
        if args.kle_stop:
            if approx_kl > args.target_kl:
                break
        if args.kle_rollback:
            if (b_logprobs[minibatch_ind] - agent.get_action(
                    b_obs[minibatch_ind],
                    b_actions.long()[minibatch_ind].T,
                    b_invalid_action_masks[minibatch_ind])[1]).mean() > args.target_kl:
                agent.load_state_dict(target_agent.state_dict())
                break

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("losses/entropy", entropy.mean().item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    if args.kle_stop or args.kle_rollback:
        writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)

envs.close()
writer.close()
# Demonstration script: prints the log-probability of one action and the
# gradient of that log-probability w.r.t. the logits under several ways of
# handling an invalid action.
# suppose action 1 is invalid
# NOTE(review): the masks below are [1., 1., 0., 1.], so the entry that gets
# replaced is index 2 — confirm which indexing the line above refers to.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.categorical import Categorical
action = 0                     # index of the action whose log-probability is differentiated
advantage = torch.tensor(1.)   # weight applied to the log-probability before backward()
device = "cpu"                 # not referenced again in this script

# no invalid action masking
# Baseline: four equal logits, no entry replaced.
print("=============regular=============")
target_logits = torch.tensor([1., 1., 1., 1.,] , requires_grad=True)
target_probs = Categorical(logits=target_logits)
log_prob = target_probs.log_prob(torch.tensor(action))
print("log_prob", log_prob)
(log_prob*advantage).backward()
print("gradient", target_logits.grad)
print()

# invalid action masking via logits
# The masked entry's logit is replaced by the constant -1e+8 through torch.where,
# and the distribution is built from the adjusted logits.
print("==================invalid action masking=============")
target_logits = torch.tensor([1., 1., 1., 1.,] , requires_grad=True)
invalid_action_masks = torch.tensor([1., 1., 0., 1.,])
invalid_action_masks = invalid_action_masks.type(torch.BoolTensor)
adjusted_logits = torch.where(invalid_action_masks, target_logits, torch.tensor(-1e+8))
adjusted_probs = Categorical(logits=adjusted_logits)
adjusted_log_prob = adjusted_probs.log_prob(torch.tensor(action))
print("log_prob", adjusted_log_prob)
(adjusted_log_prob*advantage).backward()
print("gradient", target_logits.grad)
print()

# invalid action masking via importance sampling
# The gradient is taken through the *unmasked* log-probability and scaled by the
# detached ratio (unmasked probability / masked probability) of the action.
print("==================regular importance sampling=============")
target_logits = torch.tensor([1., 1., 1., 1.,] , requires_grad=True)
target_probs = Categorical(logits=target_logits)
invalid_action_masks = torch.tensor([1., 1., 0., 1.,])
invalid_action_masks = invalid_action_masks.type(torch.BoolTensor)
adjusted_logits = torch.where(invalid_action_masks, target_logits, torch.tensor(-1e+8))
adjusted_probs = Categorical(logits=adjusted_logits)
log_prob = target_probs.log_prob(torch.tensor(action))
adjusted_log_prob = adjusted_probs.log_prob(torch.tensor(action))
importance_sampling = target_probs.probs[torch.tensor(action)] / (adjusted_probs.probs[torch.tensor(action)])
print("log_prob", log_prob)
(importance_sampling.detach()*log_prob*advantage).backward()
print("gradient", target_logits.grad)
print()

# invalid action masking via logits
# Same construction as the masking experiment above, but the replacement
# constant is -2. instead of -1e+8.
print("==================invalid action masking=============")
target_logits = torch.tensor([1., 1., 1., 1.,] , requires_grad=True)
invalid_action_masks = torch.tensor([1., 1., 0., 1.,])
invalid_action_masks = invalid_action_masks.type(torch.BoolTensor)
adjusted_logits = torch.where(invalid_action_masks, target_logits, torch.tensor(-2.))
adjusted_probs = Categorical(logits=adjusted_logits)
adjusted_log_prob = adjusted_probs.log_prob(torch.tensor(action))
print("adjusted_probs", adjusted_probs.probs)
(adjusted_log_prob*advantage).backward()
print("gradient", target_logits.grad)
print()

# no invalid action masking with different parameterization
# Here -2. is a learnable logit (part of target_logits) rather than a constant
# substituted by torch.where.
print("=============regular but differrent parameterization=============")
target_logits = torch.tensor([1., 1., -2., 1.,] , requires_grad=True)
target_probs = Categorical(logits=target_logits)
log_prob = target_probs.log_prob(torch.tensor(action))
print("target_probs", target_probs.probs)
(log_prob*advantage).backward()
print("gradient", target_logits.grad)
print()
# Add the gradient to the logits once and show the resulting probabilities.
new_target_logits = target_logits + target_logits.grad
new_target_probs = Categorical(logits=new_target_logits)
print("target_probs", new_target_probs.probs)