Repository: facebookresearch/EmbodiedQA Branch: main Commit: 306fd6ef3064 Files: 17 Total size: 253.9 KB Directory structure: gitextract_mc95tebh/ ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── requirements.txt ├── training/ │ ├── data.py │ ├── metrics.py │ ├── models.py │ ├── train_eqa.py │ ├── train_nav.py │ ├── train_vqa.py │ └── utils/ │ ├── preprocess_questions.py │ └── preprocess_questions_pkl.py └── utils/ ├── house3d.py └── make_houses.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ tmp data logs checkpoints *.pem *.sh *autoenv* # PYTHON # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # NODE # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* # Runtime data pids *.pid *.seed *.pid.lock # Directory for instrumented libs generated by jscoverage/JSCover lib-cov # Coverage directory used by tools like istanbul coverage # nyc test coverage .nyc_output # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) .grunt # Bower dependency directory (https://bower.io/) bower_components # node-waf configuration .lock-wscript # Compiled binary addons (http://nodejs.org/api/addons.html) build/Release # Dependency directories node_modules/ jspm_packages/ # Typescript v1 declaration files typings/ # Optional npm cache directory .npm # Optional eslint cache .eslintcache # Optional REPL history .node_repl_history # Output of 'npm pack' *.tgz # Yarn Integrity file .yarn-integrity # dotenv environment variables file .env ================================================ FILE: .gitmodules ================================================ [submodule "House3D"] path = House3D url = git@github.com:abhshkdz/House3D.git ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please read the [full text](https://code.facebook.com/pages/876921332402685/open-source-code-of-conduct) so that you can understand what actions will and will not be tolerated. ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to EmbodiedQA We want to make contributing to this project as easy and transparent as possible. ## Our Development Process Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `master`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Coding Style * 4 spaces for indentation rather than tabs * 80 character line length ## License By contributing to EmbodiedQA, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. ================================================ FILE: LICENSE ================================================ BSD License For EmbodiedQA software Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Facebook nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ # EmbodiedQA Code for the paper **[Embodied Question Answering][1]** Abhishek Das, Samyak Datta, Georgia Gkioxari, Stefan Lee, Devi Parikh, Dhruv Batra [arxiv.org/abs/1711.11543][2] CVPR 2018 (Oral) In Embodied Question Answering (EmbodiedQA), an agent is spawned at a random location in a 3D environment and asked a question (for e.g. "What color is the car?"). In order to answer, the agent must first intelligently navigate to explore the environment, gather necessary visual information through first-person vision, and then answer the question ("orange"). ![](https://i.imgur.com/jeI7bxm.jpg) This repository provides - [Pretrained CNN](#pretrained-cnn) for [House3D][house3d] - Code for [generating EQA questions](#question-generation) - EQA v1: location, color, place preposition - EQA v1-extended: existence, logical, object counts, room counts, distance comparison - Code to train and evaluate [navigation](#navigation) and [question-answering](#visual-question-answering) models - [independently with supervised learning](#supervised-learning) on shortest paths - jointly using [reinforcement learning](#reinforce) If you find this code useful, consider citing our work: ``` @inproceedings{embodiedqa, title={{E}mbodied {Q}uestion {A}nswering}, author={Abhishek Das and Samyak Datta and Georgia Gkioxari and Stefan Lee and Devi Parikh and Dhruv Batra}, booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, year={2018} } ``` ## Setup ``` virtualenv -p python3 .env source .env/bin/activate pip install -r requirements.txt ``` Download the [SUNCG v1 dataset](https://github.com/facebookresearch/House3D/blob/master/INSTRUCTION.md#usage-instructions) and [install House3D](https://github.com/abhshkdz/House3D/tree/master/renderer#rendering-code-of-house3d). NOTE: This code uses a [fork of House3D](https://github.com/abhshkdz/house3d) with a few changes to support arbitrary map discretization resolutions. ## Question generation Questions for EmbodiedQA are generated programmatically, in a manner similar to [CLEVR (Johnson et al., 2017)][clevr]. NOTE: Pre-generated EQA v1 questions are available for download [here][eqav1]. ### Generating questions for all templates in EQA v1, v1-extended ``` cd data/question-gen ./run_me.sh MM_DD ``` ### List defined question templates ``` from engine import Engine E = Engine() for i in E.template_defs: print(i, E.template_defs[i]) ``` ### Generate questions for a particular template (say `location`) ``` from house_parse import HouseParse from engine import Engine Hp = HouseParse(dataDir='/path/to/suncg') Hp.parse('0aa5e04f06a805881285402096eac723') E = Engine() E.cacheHouse(Hp) qns = E.executeFn(E.template_defs['location']) print(qns[0]['question'], qns[0]['answer']) # what room is the clock located in? bedroom ``` ## Pretrained CNN We trained a shallow encoder-decoder CNN from scratch in the House3D environment, for RGB reconstruction, semantic segmentation and depth estimation. Once trained, we throw away the decoders, and use the encoder as a frozen feature extractor for navigation and question answering. The CNN is available for download here: `wget https://www.dropbox.com/s/ju1zw4iipxlj966/03_13_h3d_hybrid_cnn.pt` The training code expects the checkpoint to be present in `training/models/`. ## Supervised Learning ### Download and preprocess the dataset Download [EQA v1][eqav1] and shortest path navigations: ``` wget https://www.dropbox.com/s/6zu1b1jzl0qt7t1/eqa_v1.json wget https://www.dropbox.com/s/lhajthx7cdlnhns/a-star-500.zip unzip a-star-500.zip ``` If this is the first time you are using SUNCG, you will have to clone and use the [SUNCG toolbox](https://github.com/shurans/SUNCGtoolbox#convert-to-objmtl) to generate obj + mtl files for the houses in EQA. NOTE: Shortest paths have been updated. Earlier we computed shortest paths using a discrete grid world, but we found that these shortest paths were sometimes innacurate. Old shortest paths are [here](https://www.dropbox.com/s/vgp2ygh1bht1jyb/shortest-paths.zip). ``` cd utils python make_houses.py \ -eqa_path /path/to/eqa.json \ -suncg_toolbox_path /path/to/SUNCGtoolbox \ -suncg_data_path /path/to/suncg/data_root ``` Preprocess the dataset for training ``` cd training python utils/preprocess_questions_pkl.py \ -input_json /path/to/eqa_v1.json \ -shortest_path_dir /path/to/shortest/paths/a-star-500 \ -output_train_h5 data/train.h5 \ -output_val_h5 data/val.h5 \ -output_test_h5 data/test.h5 \ -output_data_json data/data.json \ -output_vocab data/vocab.json ``` ### Visual question answering Update pretrained CNN path in `models.py`. `python train_vqa.py -input_type ques,image -identifier ques-image -log -cache` This model computes question-conditioned attention over last 5 frames from oracle navigation (shortest paths), and predicts answer. Assuming shortest paths are optimal for answering the question -- which is predominantly true for most questions in EQA v1 (`location`, `color`, `place preposition`) with the exception of a few `location` questions that might need more visual context than walking right up till the object -- this can be thought of as an upper bound on expected accuracy, and performance will get worse when navigation trajectories are sampled from trained policies. A pretrained VQA model is available for download [here](https://www.dropbox.com/s/jd15af00r7m8neh/vqa_11_18_2018_va0.6154.pt). This gets a top-1 accuracy of 61.54% on val, and 58.46% on test (with GT navigation). Note that keeping the `cache` flag ON caches images as they are rendered in the first training epoch, so that subsequent epochs are very fast. This is memory-intensive though, and consumes ~100-120G RAM. ### Navigation Download potential maps for evaluating navigation and training with REINFORCE. ``` wget https://www.dropbox.com/s/53edqtr04jts4q0/target-obj-conn-maps-500.zip ``` #### Planner-controller policy `python train_nav.py -model_type pacman -identifier pacman -log` ## REINFORCE ``` python train_eqa.py \ -nav_checkpoint_path /path/to/nav/ques-image-pacman/checkpoint.pt \ -ans_checkpoint_path /path/to/vqa/ques-image/checkpoint.pt \ -identifier ques-image-eqa \ -log ``` ## Changelog ### 09/07 - We added the baseline models from the CVPR paper (Reactive and LSTM). - With the LSTM model, we achieved d_T values of: 0.74693/3.99891/8.10669 on the test set for d equal to 10/30/50 respectively training with behavior cloning (no reinforcement learning). - We also updated the shortest paths to fix an issue with the shortest path algorithm we initially used. Code to generate shortest paths is [here](https://github.com/facebookresearch/EmbodiedQA/blob/master/data/shortest-path-gen/generate-paths-a-star.py). ### 06/13 This code release contains the following changes over the CVPR version - Larger dataset of questions + shortest paths - Color names as answers to color questions (earlier they were hex strings) ## Acknowledgements - Parts of this code are adapted from [pytorch-a3c][pytorch-a3c] by Ilya Kostrikov - [Lisa Anne Hendricks](https://people.eecs.berkeley.edu/~lisa_anne/) and [Licheng Yu](http://www.cs.unc.edu/~licheng/) helped with running / testing / debugging code prior to release ## License BSD [1]: https://embodiedqa.org [2]: https://arxiv.org/abs/1711.11543 [house3d]: https://github.com/facebookresearch/house3d [dijkstar]: https://bitbucket.org/wyatt/dijkstar [pytorch-a3c]: https://github.com/ikostrikov/pytorch-a3c [eqav1]: https://embodiedqa.org/data [clevr]: https://github.com/facebookresearch/clevr-dataset-gen ================================================ FILE: requirements.txt ================================================ certifi==2018.4.16 chardet==3.0.4 future==0.16.0 gym==0.10.5 h5py==2.8.0 idna==2.6 numpy==1.14.4 opencv-python==3.4.1.15 Pillow==5.1.0 pyglet==1.3.2 requests==2.18.4 scipy==1.1.0 six==1.11.0 torch==0.3.1 torchvision==0.2.1 tqdm==4.23.4 urllib3==1.22 ================================================ FILE: training/data.py ================================================ import math import time import h5py import logging import argparse import numpy as np import os, sys, json from tqdm import tqdm from scipy.misc import imread, imresize import torch from torch.utils.data import Dataset, DataLoader from torch.utils.data.dataloader import default_collate from torch.autograd import Variable sys.path.insert(0, '../../House3D/') from House3D import objrender, Environment, load_config from House3D.core import local_create_house sys.path.insert(0, '../utils/') from house3d import House3DUtils from models import MultitaskCNN import pdb def load_vocab(path): with open(path, 'r') as f: vocab = json.load(f) vocab['questionIdxToToken'] = invert_dict(vocab['questionTokenToIdx']) vocab['answerIdxToToken'] = invert_dict(vocab['answerTokenToIdx']) assert vocab['questionTokenToIdx'][''] == 0 assert vocab['questionTokenToIdx'][''] == 1 assert vocab['questionTokenToIdx'][''] == 2 return vocab def invert_dict(d): return {v: k for k, v in d.items()} """ if the action sequence is [f, f, l, l, f, f, f, r] input sequence to planner is [, f, l, f, r] output sequence for planner is [f, l, f, r, ] input sequences to controller are [f, f, l, l, f, f, f, r] output sequences for controller are [1, 0, 1, 0, 1, 1, 0, 0] """ def flat_to_hierarchical_actions(actions, controller_action_lim): assert len(actions) != 0 controller_action_ctr = 0 planner_actions, controller_actions = [1], [] prev_action = 1 pq_idx, cq_idx, ph_idx = [], [], [] ph_trck = 0 for i in range(1, len(actions)): if actions[i] != prev_action: planner_actions.append(actions[i]) pq_idx.append(i-1) if i > 1: ph_idx.append(ph_trck) if actions[i] == prev_action: controller_actions.append(1) controller_action_ctr += 1 else: controller_actions.append(0) controller_action_ctr = 0 ph_trck += 1 cq_idx.append(i-1) prev_action = actions[i] if controller_action_ctr == controller_action_lim-1: prev_action = False return planner_actions, controller_actions, pq_idx, cq_idx, ph_idx def _dataset_to_tensor(dset, mask=None, dtype=np.int64): arr = np.asarray(dset, dtype=dtype) if mask is not None: arr = arr[mask] if dtype == np.float32: tensor = torch.FloatTensor(arr) else: tensor = torch.LongTensor(arr) return tensor def eqaCollateCnn(batch): transposed = list(zip(*batch)) idx_batch = default_collate(transposed[0]) question_batch = default_collate(transposed[1]) answer_batch = default_collate(transposed[2]) images_batch = default_collate(transposed[3]) actions_in_batch = default_collate(transposed[4]) actions_out_batch = default_collate(transposed[5]) action_lengths_batch = default_collate(transposed[6]) return [ idx_batch, question_batch, answer_batch, images_batch, actions_in_batch, actions_out_batch, action_lengths_batch ] def eqaCollateSeq2seq(batch): transposed = list(zip(*batch)) idx_batch = default_collate(transposed[0]) questions_batch = default_collate(transposed[1]) answers_batch = default_collate(transposed[2]) images_batch = default_collate(transposed[3]) actions_in_batch = default_collate(transposed[4]) actions_out_batch = default_collate(transposed[5]) action_lengths_batch = default_collate(transposed[6]) mask_batch = default_collate(transposed[7]) return [ idx_batch, questions_batch, answers_batch, images_batch, actions_in_batch, actions_out_batch, action_lengths_batch, mask_batch ] class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, data_json=False, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, to_cache=False, target_obj_conn_map_dir=False, map_resolution=1000, overfit=False, max_controller_actions=5, max_actions=None): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) self.num_frames = num_frames self.max_controller_actions = max_controller_actions np.random.seed() self.data_json = data_json self.split = split self.gpu_id = gpu_id self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.target_obj_conn_map_dir = target_obj_conn_map_dir self.map_resolution = map_resolution self.overfit = overfit self.to_cache = to_cache self.img_data_cache = {} print('Reading question data into memory') self.idx = _dataset_to_tensor(questions_h5['idx']) self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['action_labels']) self.action_lengths = _dataset_to_tensor( questions_h5['action_lengths']) if max_actions: #max actions will allow us to create arrays of a certain length. Helpful if you only want to train with 10 actions. assert isinstance(max_actions, int) num_data_items = self.actions.shape[0] new_actions = np.zeros((num_data_items, max_actions+2), dtype=np.int64) new_lengths = np.ones((num_data_items,), dtype=np.int64)*max_actions for i in range(num_data_items): action_length = int(self.action_lengths[i]) new_actions[i,0] = 1 new_actions[i,1:max_actions+1] = self.actions[i, action_length-max_actions: action_length].numpy() self.actions = torch.LongTensor(new_actions) self.action_lengths = torch.LongTensor(new_lengths) if self.data_json != False: data = json.load(open(self.data_json, 'r')) self.envs = data['envs'] self.env_idx = data[self.split + '_env_idx'] self.env_list = [self.envs[x] for x in self.env_idx] self.env_set = list(set(self.env_list)) self.env_set.sort() if self.overfit == True: self.env_idx = self.env_idx[:1] self.env_set = self.env_list = [self.envs[x] for x in self.env_idx] print('Trying to overfit to [house %s]' % self.env_set[0]) logging.info('Trying to overfit to [house {}]'.format(self.env_set[0])) print('Total envs: %d' % len(list(set(self.envs)))) print('Envs in %s: %d' % (self.split, len(list(set(self.env_idx))))) if input_type != 'ques': '''' If training, randomly sample and load a subset of environments, train on those, and then cycle through to load the rest. On the validation and test set, load in order, and cycle through. For both, add optional caching so that if all environments have been cycled through once, then no need to re-load and instead, just the cache can be used. ''' self.api_threads = [] self._load_envs(start_idx=0, in_order=True) cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.cuda() self.pos_queue = data[self.split + '_pos_queue'] self.boxes = data[self.split + '_boxes'] if max_actions: for i in range(len(self.pos_queue)): self.pos_queue[i] = self.pos_queue[i][-1*max_actions:] if input_type == 'pacman': self.planner_actions = self.actions.clone().fill_(0) self.controller_actions = self.actions.clone().fill_(-1) self.planner_action_lengths = self.action_lengths.clone().fill_(0) self.controller_action_lengths = self.action_lengths.clone().fill_( 0) self.planner_hidden_idx = self.actions.clone().fill_(0) self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], [] # parsing flat actions to planner-controller hierarchy for i in tqdm(range(len(self.actions))): pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=self.actions[i][:self.action_lengths[i]+1], controller_action_lim=max_controller_actions) self.planner_actions[i][:len(pa)] = torch.Tensor(pa) self.controller_actions[i][:len(ca)] = torch.Tensor(ca) self.planner_action_lengths[i] = len(pa)-1 self.controller_action_lengths[i] = len(ca) self.planner_pos_queue_idx.append(pq_idx) self.controller_pos_queue_idx.append(cq_idx) self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx) def _pick_envs_to_load(self, split='train', max_envs=10, start_idx=0, in_order=False): if split in ['val', 'test'] or in_order == True: pruned_env_set = self.env_set[start_idx:start_idx + max_envs] else: if max_envs < len(self.env_set): env_inds = np.random.choice( len(self.env_set), max_envs, replace=False) else: env_inds = np.random.choice( len(self.env_set), max_envs, replace=True) pruned_env_set = [self.env_set[x] for x in env_inds] return pruned_env_set def _load_envs(self, start_idx=-1, in_order=False): #self._clear_memory() if start_idx == -1: start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1 # Pick envs self.pruned_env_set = self._pick_envs_to_load( split=self.split, max_envs=self.max_threads_per_gpu, start_idx=start_idx, in_order=in_order) if len(self.pruned_env_set) == 0: return # Load api threads start = time.time() if len(self.api_threads) == 0: for i in range(self.max_threads_per_gpu): self.api_threads.append( objrender.RenderAPIThread( w=224, h=224, device=self.gpu_id)) try: self.cfg = load_config('../House3D/tests/config.json') except: self.cfg = load_config('../../House3D/tests/config.json') #Sorry guys; this is so Lisa can run on her system; maybe we should make this an input somewhere? print('[%.02f] Loaded %d api threads' % (time.time() - start, len(self.api_threads))) start = time.time() # Load houses from multiprocessing import Pool _args = ([h, self.cfg, self.map_resolution] for h in self.pruned_env_set) with Pool(len(self.pruned_env_set)) as pool: self.all_houses = pool.starmap(local_create_house, _args) print('[%.02f] Loaded %d houses' % (time.time() - start, len(self.all_houses))) start = time.time() # Load envs self.env_loaded = {} for i in range(len(self.all_houses)): print('[%02d/%d][split:%s][gpu:%d][house:%s]' % (i + 1, len(self.all_houses), self.split, self.gpu_id, self.all_houses[i].house['id'])) environment = Environment(self.api_threads[i], self.all_houses[i], self.cfg) self.env_loaded[self.all_houses[i].house['id']] = House3DUtils( environment, target_obj_conn_map_dir=self.target_obj_conn_map_dir, build_graph=False) # [TODO] Unused till now self.env_ptr = -1 print('[%.02f] Loaded %d house3d envs' % (time.time() - start, len(self.env_loaded))) # Mark available data indices self.available_idx = [ i for i, v in enumerate(self.env_list) if v in self.env_loaded ] # [TODO] only keeping legit sequences # needed for things to play well with old data temp_available_idx = self.available_idx.copy() for i in range(len(temp_available_idx)): if self.action_lengths[temp_available_idx[i]] < 5: self.available_idx.remove(temp_available_idx[i]) print('Available inds: %d' % len(self.available_idx)) # Flag to check if loaded envs have been cycled through or not # [TODO] Unused till now self.all_envs_loaded = False def _clear_api_threads(self): for i in range(len(self.api_threads)): del self.api_threads[0] self.api_threads = [] def _clear_memory(self): if hasattr(self, 'episode_house'): del self.episode_house if hasattr(self, 'env_loaded'): del self.env_loaded if hasattr(self, 'api_threads'): del self.api_threads self.api_threads = [] def _check_if_all_envs_loaded(self): print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache), len(self.env_list))) if len(self.img_data_cache) == len(self.env_list): self.available_idx = [i for i, v in enumerate(self.env_list)] return True else: return False def set_camera(self, e, pos, robot_height=1.0): assert len(pos) == 4 e.env.cam.pos.x = pos[0] e.env.cam.pos.y = robot_height e.env.cam.pos.z = pos[2] e.env.cam.yaw = pos[3] e.env.cam.updateDirection() def render(self, e): return e.env.render() def get_frames(self, e, pos_queue, preprocess=True): if isinstance(pos_queue, list) == False: pos_queue = [pos_queue] res = [] for i in range(len(pos_queue)): self.set_camera(e, pos_queue[i]) img = np.array(self.render(e), copy=False, dtype=np.float32) if preprocess == True: img = img.transpose(2, 0, 1) img = img / 255.0 res.append(img) return np.array(res) def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0, max_controller_actions=5): action_length = len(actions)-1 pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=actions, controller_action_lim=max_controller_actions) # count how many actions of same type have been encountered pefore starting navigation backtrack_controller_steps = actions[1:action_length - backtrack_steps + 1:][::-1] counter = 0 if len(backtrack_controller_steps) > 0: while (counter <= self.max_controller_actions) and (counter < len(backtrack_controller_steps) and (backtrack_controller_steps[counter] == backtrack_controller_steps[0])): counter += 1 target_pos_idx = action_length - backtrack_steps controller_step = True if target_pos_idx in pq_idx: controller_step = False pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx] pa_pruned = pa[:len(pq_idx_pruned)+1] images = self.get_frames( self.episode_house, self.episode_pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() controller_img_feat = torch.from_numpy(raw_img_feats[target_pos_idx].copy()) controller_action_in = pa_pruned[-1] - 2 planner_img_feats = torch.from_numpy(raw_img_feats[pq_idx_pruned].copy()) planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1) return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \ controller_img_feat, self.episode_pos_queue[target_pos_idx], counter def __getitem__(self, index): # [VQA] question-only if self.input_type == 'ques': idx = self.idx[index] question = self.questions[index] answer = self.answers[index] return (idx, question, answer) # [VQA] question+image elif self.input_type == 'ques,image': index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] actions_in = actions[action_length - self.num_frames:action_length] actions_out = actions[action_length - self.num_frames + 1: action_length + 1] if self.to_cache == True and index in self.img_data_cache: images = self.img_data_cache[index] else: pos_queue = self.pos_queue[index][ -self.num_frames:] # last 5 frames images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) if self.to_cache == True: self.img_data_cache[index] = images.copy() return (idx, question, answer, images, actions_in, actions_out, action_length) # [NAV] question+cnn elif self.input_type in ['cnn', 'cnn+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() if self.to_cache == True: self.img_data_cache[index] = img_feats # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] actions_in = actions[:action_length] actions_out = actions[1:action_length + 1] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # if action_length is n # images.shape[0] is also n # actions[0] is # actions[n] is # grab 5 random frames # [NOTE]: this'll break for longer-than-5 navigation sequences start_idx = np.random.choice(img_feats.shape[0] + 1 - self.num_frames) img_feats = img_feats[start_idx:start_idx + self.num_frames] actions_in = actions[start_idx:start_idx + self.num_frames] actions_out = actions[start_idx + self.num_frames] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # [NAV] question+lstm elif self.input_type in ['lstm', 'lstm+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[ 0], :] = raw_img_feats.copy() if self.to_cache == True: self.img_data_cache[index] = img_feats actions_in = actions.clone() - 1 actions_out = actions[1:].clone() - 2 actions_in[action_length:].fill_(0) mask = actions_out.clone().gt(-1) if len(actions_out) > action_length: actions_out[action_length:].fill_(0) # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] return (idx, question, answer, False, actions_in, actions_out, action_length, mask) return (idx, question, answer, img_feats, actions_in, actions_out, action_length, mask) # [NAV] planner-controller elif self.input_type in ['pacman']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] planner_actions = self.planner_actions[index] controller_actions = self.controller_actions[index] planner_action_length = self.planner_action_lengths[index] controller_action_length = self.controller_action_lengths[index] planner_hidden_idx = self.planner_hidden_idx[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[ 0], :] = raw_img_feats.copy() if self.to_cache == True: self.img_data_cache[index] = img_feats if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] return (idx, question, answer, actions, action_length) planner_pos_queue_idx = self.planner_pos_queue_idx[index] controller_pos_queue_idx = self.controller_pos_queue_idx[index] planner_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) planner_img_feats[:planner_action_length] = img_feats[ planner_pos_queue_idx] planner_actions_in = planner_actions.clone() - 1 planner_actions_out = planner_actions[1:].clone() - 2 planner_actions_in[planner_action_length:].fill_(0) planner_mask = planner_actions_out.clone().gt(-1) if len(planner_actions_out) > planner_action_length: planner_actions_out[planner_action_length:].fill_(0) controller_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) controller_img_feats[:controller_action_length] = img_feats[ controller_pos_queue_idx] controller_actions_in = actions[1:].clone() - 2 if len(controller_actions_in) > controller_action_length: controller_actions_in[controller_action_length:].fill_(0) controller_out = controller_actions controller_mask = controller_out.clone().gt(-1) if len(controller_out) > controller_action_length: controller_out[controller_action_length:].fill_(0) # zero out forced controller return for i in range(controller_action_length): if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \ (self.max_controller_actions == 1 or controller_out[i - self.max_controller_actions + 1:i].sum() == self.max_controller_actions - 1): controller_mask[i] = 0 return (idx, question, answer, planner_img_feats, planner_actions_in, planner_actions_out, planner_action_length, planner_mask, controller_img_feats, controller_actions_in, planner_hidden_idx, controller_out, controller_action_length, controller_mask) def __len__(self): if self.input_type == 'ques': return len(self.questions) else: return len(self.available_idx) class EqaDataLoader(DataLoader): def __init__(self, **kwargs): if 'questions_h5' not in kwargs: raise ValueError('Must give questions_h5') if 'data_json' not in kwargs: raise ValueError('Must give data_json') if 'vocab' not in kwargs: raise ValueError('Must give vocab') if 'input_type' not in kwargs: raise ValueError('Must give input_type') if 'split' not in kwargs: raise ValueError('Must give split') if 'gpu_id' not in kwargs: raise ValueError('Must give gpu_id') questions_h5_path = kwargs.pop('questions_h5') data_json = kwargs.pop('data_json') input_type = kwargs.pop('input_type') split = kwargs.pop('split') vocab = kwargs.pop('vocab') gpu_id = kwargs.pop('gpu_id') if 'max_threads_per_gpu' in kwargs: max_threads_per_gpu = kwargs.pop('max_threads_per_gpu') else: max_threads_per_gpu = 10 if 'to_cache' in kwargs: to_cache = kwargs.pop('to_cache') else: to_cache = False if 'target_obj_conn_map_dir' in kwargs: target_obj_conn_map_dir = kwargs.pop('target_obj_conn_map_dir') else: target_obj_conn_map_dir = False if 'map_resolution' in kwargs: map_resolution = kwargs.pop('map_resolution') else: map_resolution = 1000 if 'image' in input_type or 'cnn' in input_type: kwargs['collate_fn'] = eqaCollateCnn elif 'lstm' in input_type: kwargs['collate_fn'] = eqaCollateSeq2seq if 'overfit' in kwargs: overfit = kwargs.pop('overfit') else: overfit = False if 'max_controller_actions' in kwargs: max_controller_actions = kwargs.pop('max_controller_actions') else: max_controller_actions = 5 if 'max_actions' in kwargs: max_actions = kwargs.pop('max_actions') else: max_actions = None print('Reading questions from ', questions_h5_path) with h5py.File(questions_h5_path, 'r') as questions_h5: self.dataset = EqaDataset( questions_h5, vocab, num_frames=kwargs.pop('num_frames'), data_json=data_json, split=split, gpu_id=gpu_id, input_type=input_type, max_threads_per_gpu=max_threads_per_gpu, to_cache=to_cache, target_obj_conn_map_dir=target_obj_conn_map_dir, map_resolution=map_resolution, overfit=overfit, max_controller_actions=max_controller_actions, max_actions=max_actions) super(EqaDataLoader, self).__init__(self.dataset, **kwargs) def close(self): pass def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-train_h5', default='data/04_22/train_v1.h5') parser.add_argument('-val_h5', default='data/04_22/val_v1.h5') parser.add_argument('-data_json', default='data/04_22/data_v1.json') parser.add_argument('-vocab_json', default='data/04_22/vocab_v1.json') parser.add_argument( '-input_type', default='ques', choices=['ques', 'ques,image']) parser.add_argument( '-num_frames', default=5, type=int) # -1 = all frames of navigation sequence parser.add_argument('-batch_size', default=50, type=int) parser.add_argument('-max_threads_per_gpu', default=10, type=int) args = parser.parse_args() try: args.gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') args.gpus = [int(x) for x in args.gpus] except KeyError: print("CPU not supported") exit() train_loader_kwargs = { 'questions_h5': args.train_h5, 'data_json': args.data_json, 'vocab': args.vocab_json, 'batch_size': args.batch_size, 'input_type': args.input_type, 'num_frames': args.num_frames, 'split': 'train', 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[0], 'cache_path': False, } train_loader = EqaDataLoader(**train_loader_kwargs) train_loader.dataset._load_envs(start_idx=0, in_order=True) t = 0 while True: done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() while done == False: print('[Size:%d][t:%d][Cache:%d]' % (len(train_loader.dataset), t, len(train_loader.dataset.img_data_cache))) for batch in train_loader: t += 1 if all_envs_loaded == False: train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True else: done = True ================================================ FILE: training/metrics.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import pdb import copy import json import time import os, sys import argparse import numpy as np class Metric(): def __init__(self, info={}, metric_names=[], log_json=None): self.info = info self.metric_names = metric_names self.metrics = [[None,None,None] for _ in self.metric_names] self.stats = [] self.num_iters = 0 self.log_json = log_json def update(self, values): assert isinstance(values, list) self.num_iters += 1 current_stats = [] for i in range(len(values)): if values[i] is None: continue if isinstance(values[i], list) == False: values[i] = [values[i]] if self.metrics[i][0] == None: self.metrics[i][0] = np.mean(values[i]) self.metrics[i][1] = np.mean(values[i]) self.metrics[i][2] = np.mean(values[i]) else: self.metrics[i][0] = (self.metrics[i][0] * (self.num_iters - 1) + np.mean(values[i])) / self.num_iters self.metrics[i][1] = 0.95 * self.metrics[i][1] + 0.05 * np.mean(values[i]) self.metrics[i][2] = np.mean(values[i]) self.metrics[i][0] = float(self.metrics[i][0]) self.metrics[i][1] = float(self.metrics[i][1]) self.metrics[i][2] = float(self.metrics[i][2]) current_stats.append(self.metrics[i]) self.stats.append(copy.deepcopy(current_stats)) def get_stat_string(self, mode=1): stat_string = '' for k, v in self.info.items(): stat_string += '[%s:%s]' % (k, v) stat_string += '[iters:%d]' % self.num_iters for i in range(len(self.metric_names)): stat_string += '[%s:%.05f]' % (self.metric_names[i], self.metrics[i][mode]) return stat_string def dump_log(self): if self.log_json == None: return False dict_to_save = { 'metric_names': self.metric_names, 'stats': self.stats } json.dump(dict_to_save, open(self.log_json, 'w')) return True class VqaMetric(Metric): def __init__(self, info={}, metric_names=[], log_json=None): Metric.__init__(self, info, metric_names, log_json) def compute_ranks(self, scores, labels): accuracy = np.zeros(len(labels)) ranks = np.full(len(labels), scores.shape[1]) for i in range(scores.shape[0]): ranks[i] = scores[i].gt(scores[i][labels[i]]).sum() + 1 if ranks[i] == 1: accuracy[i] = 1 return accuracy, ranks class NavMetric(Metric): def __init__(self, info={}, metric_names=[], log_json=None): Metric.__init__(self, info, metric_names, log_json) ================================================ FILE: training/models.py ================================================ # Model defs for navigation and question answering # Navigation: CNN, LSTM, Planner-controller # VQA: question-only, 5-frame + attention import time import h5py import math import argparse import numpy as np import os, sys, json import torch import torchvision import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence import pdb def build_mlp(input_dim, hidden_dims, output_dim, use_batchnorm=False, dropout=0, add_sigmoid=1): layers = [] D = input_dim if dropout > 0: layers.append(nn.Dropout(p=dropout)) if use_batchnorm: layers.append(nn.BatchNorm1d(input_dim)) for dim in hidden_dims: layers.append(nn.Linear(D, dim)) if use_batchnorm: layers.append(nn.BatchNorm1d(dim)) if dropout > 0: layers.append(nn.Dropout(p=dropout)) layers.append(nn.ReLU(inplace=True)) D = dim layers.append(nn.Linear(D, output_dim)) if add_sigmoid == 1: layers.append(nn.Sigmoid()) return nn.Sequential(*layers) def get_state(m): if m is None: return None state = {} for k, v in m.state_dict().items(): state[k] = v.clone() return state def repackage_hidden(h, batch_size): # wraps hidden states in new Variables, to detach them from their history if type(h) == Variable: return Variable( h.data.resize_(h.size(0), batch_size, h.size(2)).zero_()) else: return tuple(repackage_hidden(v, batch_size) for v in h) def ensure_shared_grads(model, shared_model): for param, shared_param in zip(model.parameters(), shared_model.parameters()): if shared_param.grad is not None: return shared_param._grad = param.grad class MaskedNLLCriterion(nn.Module): def __init__(self): super(MaskedNLLCriterion, self).__init__() def forward(self, input, target, mask): logprob_select = torch.gather(input, 1, target) out = torch.masked_select(logprob_select, mask) loss = -torch.sum(out) / mask.float().sum() return loss class MultitaskCNNOutput(nn.Module): def __init__( self, num_classes=191, pretrained=True, checkpoint_path='models/03_13_h3d_hybrid_cnn.pt' ): super(MultitaskCNNOutput, self).__init__() self.num_classes = num_classes self.conv_block1 = nn.Sequential( nn.Conv2d(3, 8, 5), nn.BatchNorm2d(8), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block2 = nn.Sequential( nn.Conv2d(8, 16, 5), nn.BatchNorm2d(16), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block3 = nn.Sequential( nn.Conv2d(16, 32, 5), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block4 = nn.Sequential( nn.Conv2d(32, 32, 5), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.classifier = nn.Sequential( nn.Conv2d(32, 512, 5), nn.BatchNorm2d(512), nn.ReLU(inplace=True), nn.Dropout2d(), nn.Conv2d(512, 512, 1), nn.BatchNorm2d(512), nn.ReLU(inplace=True), nn.Dropout2d()) self.encoder_seg = nn.Conv2d(512, self.num_classes, 1) self.encoder_depth = nn.Conv2d(512, 1, 1) self.encoder_ae = nn.Conv2d(512, 3, 1) self.score_pool2_seg = nn.Conv2d(16, self.num_classes, 1) self.score_pool3_seg = nn.Conv2d(32, self.num_classes, 1) self.score_pool2_depth = nn.Conv2d(16, 1, 1) self.score_pool3_depth = nn.Conv2d(32, 1, 1) self.score_pool2_ae = nn.Conv2d(16, 3, 1) self.score_pool3_ae = nn.Conv2d(32, 3, 1) self.pretrained = pretrained if self.pretrained == True: print('Loading CNN weights from %s' % checkpoint_path) checkpoint = torch.load( checkpoint_path, map_location={'cuda:0': 'cpu'}) self.load_state_dict(checkpoint['model_state']) for param in self.parameters(): param.requires_grad = False else: for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * ( m.out_channels + m.in_channels) m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() def forward(self, x): conv1 = self.conv_block1(x) conv2 = self.conv_block2(conv1) conv3 = self.conv_block3(conv2) conv4 = self.conv_block4(conv3) encoder_output = self.classifier(conv4) encoder_output_seg = self.encoder_seg(encoder_output) encoder_output_depth = self.encoder_depth(encoder_output) encoder_output_ae = self.encoder_ae(encoder_output) score_pool2_seg = self.score_pool2_seg(conv2) score_pool3_seg = self.score_pool3_seg(conv3) score_pool2_depth = self.score_pool2_depth(conv2) score_pool3_depth = self.score_pool3_depth(conv3) score_pool2_ae = self.score_pool2_ae(conv2) score_pool3_ae = self.score_pool3_ae(conv3) score_seg = F.upsample(encoder_output_seg, score_pool3_seg.size()[2:], mode='bilinear') score_seg += score_pool3_seg score_seg = F.upsample(score_seg, score_pool2_seg.size()[2:], mode='bilinear') score_seg += score_pool2_seg out_seg = F.upsample(score_seg, x.size()[2:], mode='bilinear') score_depth = F.upsample(encoder_output_depth, score_pool3_depth.size()[2:], mode='bilinear') score_depth += score_pool3_depth score_depth = F.upsample(score_depth, score_pool2_depth.size()[2:], mode='bilinear') score_depth += score_pool2_depth out_depth = F.sigmoid(F.upsample(score_depth, x.size()[2:], mode='bilinear')) score_ae = F.upsample(encoder_output_ae, score_pool3_ae.size()[2:], mode='bilinear') score_ae += score_pool3_ae score_ae = F.upsample(score_ae, score_pool2_ae.size()[2:], mode='bilinear') score_ae += score_pool2_ae out_ae = F.sigmoid(F.upsample(score_ae, x.size()[2:], mode='bilinear')) return out_seg, out_depth, out_ae class MultitaskCNN(nn.Module): def __init__( self, num_classes=191, pretrained=True, checkpoint_path='models/03_13_h3d_hybrid_cnn.pt' ): super(MultitaskCNN, self).__init__() self.num_classes = num_classes self.conv_block1 = nn.Sequential( nn.Conv2d(3, 8, 5), nn.BatchNorm2d(8), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block2 = nn.Sequential( nn.Conv2d(8, 16, 5), nn.BatchNorm2d(16), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block3 = nn.Sequential( nn.Conv2d(16, 32, 5), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.conv_block4 = nn.Sequential( nn.Conv2d(32, 32, 5), nn.BatchNorm2d(32), nn.ReLU(inplace=True), nn.MaxPool2d(2, 2)) self.classifier = nn.Sequential( nn.Conv2d(32, 512, 5), nn.BatchNorm2d(512), nn.ReLU(inplace=True), nn.Dropout2d(), nn.Conv2d(512, 512, 1), nn.BatchNorm2d(512), nn.ReLU(inplace=True), nn.Dropout2d()) self.encoder_seg = nn.Conv2d(512, self.num_classes, 1) self.encoder_depth = nn.Conv2d(512, 1, 1) self.encoder_ae = nn.Conv2d(512, 3, 1) self.score_pool2_seg = nn.Conv2d(16, self.num_classes, 1) self.score_pool3_seg = nn.Conv2d(32, self.num_classes, 1) self.score_pool2_depth = nn.Conv2d(16, 1, 1) self.score_pool3_depth = nn.Conv2d(32, 1, 1) self.score_pool2_ae = nn.Conv2d(16, 3, 1) self.score_pool3_ae = nn.Conv2d(32, 3, 1) self.pretrained = pretrained if self.pretrained == True: print('Loading CNN weights from %s' % checkpoint_path) checkpoint = torch.load( checkpoint_path, map_location={'cuda:0': 'cpu'}) self.load_state_dict(checkpoint['model_state']) for param in self.parameters(): param.requires_grad = False else: for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * ( m.out_channels + m.in_channels) m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() def forward(self, x): assert self.training == False conv1 = self.conv_block1(x) conv2 = self.conv_block2(conv1) conv3 = self.conv_block3(conv2) conv4 = self.conv_block4(conv3) return conv4.view(-1, 32 * 10 * 10) # encoder_output = self.classifier(conv4) # encoder_output_seg = self.encoder_seg(encoder_output) # encoder_output_depth = self.encoder_depth(encoder_output) # encoder_output_ae = self.encoder_ae(encoder_output) # score_pool2_seg = self.score_pool2_seg(conv2) # score_pool3_seg = self.score_pool3_seg(conv3) # score_pool2_depth = self.score_pool2_depth(conv2) # score_pool3_depth = self.score_pool3_depth(conv3) # score_pool2_ae = self.score_pool2_ae(conv2) # score_pool3_ae = self.score_pool3_ae(conv3) # score_seg = F.upsample(encoder_output_seg, score_pool3_seg.size()[2:], mode='bilinear') # score_seg += score_pool3_seg # score_seg = F.upsample(score_seg, score_pool2_seg.size()[2:], mode='bilinear') # score_seg += score_pool2_seg # out_seg = F.upsample(score_seg, x.size()[2:], mode='bilinear') # score_depth = F.upsample(encoder_output_depth, score_pool3_depth.size()[2:], mode='bilinear') # score_depth += score_pool3_depth # score_depth = F.upsample(score_depth, score_pool2_depth.size()[2:], mode='bilinear') # score_depth += score_pool2_depth # out_depth = F.sigmoid(F.upsample(score_depth, x.size()[2:], mode='bilinear')) # score_ae = F.upsample(encoder_output_ae, score_pool3_ae.size()[2:], mode='bilinear') # score_ae += score_pool3_ae # score_ae = F.upsample(score_ae, score_pool2_ae.size()[2:], mode='bilinear') # score_ae += score_pool2_ae # out_ae = F.sigmoid(F.upsample(score_ae, x.size()[2:], mode='bilinear')) # return out_seg, out_depth, out_ae class QuestionLstmEncoder(nn.Module): def __init__(self, token_to_idx, wordvec_dim=64, rnn_dim=64, rnn_num_layers=2, rnn_dropout=0): super(QuestionLstmEncoder, self).__init__() self.token_to_idx = token_to_idx self.NULL = token_to_idx[''] self.START = token_to_idx[''] self.END = token_to_idx[''] self.embed = nn.Embedding(len(token_to_idx), wordvec_dim) self.rnn = nn.LSTM( wordvec_dim, rnn_dim, rnn_num_layers, dropout=rnn_dropout, batch_first=True) self.init_weights() def init_weights(self): initrange = 0.1 self.embed.weight.data.uniform_(-initrange, initrange) def forward(self, x): N, T = x.size() idx = torch.LongTensor(N).fill_(T - 1) # Find the last non-null element in each sequence x_cpu = x.data.cpu() for i in range(N): for t in range(T - 1): if x_cpu[i, t] != self.NULL and x_cpu[i, t + 1] == self.NULL: idx[i] = t break idx = idx.type_as(x.data).long() idx = Variable(idx, requires_grad=False) hs, _ = self.rnn(self.embed(x)) idx = idx.view(N, 1, 1).expand(N, 1, hs.size(2)) H = hs.size(2) return hs.gather(1, idx).view(N, H) # ----------- VQA ----------- class VqaLstmModel(nn.Module): def __init__(self, vocab, rnn_wordvec_dim=64, rnn_dim=64, rnn_num_layers=2, rnn_dropout=0.5, fc_use_batchnorm=False, fc_dropout=0.5, fc_dims=(64, )): super(VqaLstmModel, self).__init__() rnn_kwargs = { 'token_to_idx': vocab['questionTokenToIdx'], 'wordvec_dim': rnn_wordvec_dim, 'rnn_dim': rnn_dim, 'rnn_num_layers': rnn_num_layers, 'rnn_dropout': rnn_dropout, } self.rnn = QuestionLstmEncoder(**rnn_kwargs) classifier_kwargs = { 'input_dim': rnn_dim, 'hidden_dims': fc_dims, 'output_dim': len(vocab['answerTokenToIdx']), 'use_batchnorm': fc_use_batchnorm, 'dropout': fc_dropout, 'add_sigmoid': 0 } self.classifier = build_mlp(**classifier_kwargs) def forward(self, questions): q_feats = self.rnn(questions) scores = self.classifier(q_feats) return scores class VqaLstmCnnAttentionModel(nn.Module): def __init__(self, vocab, image_feat_dim=64, question_wordvec_dim=64, question_hidden_dim=64, question_num_layers=2, question_dropout=0.5, fc_use_batchnorm=False, fc_dropout=0.5, fc_dims=(64, )): super(VqaLstmCnnAttentionModel, self).__init__() cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn_fc_layer = nn.Sequential( nn.Linear(32 * 10 * 10, 64), nn.ReLU(), nn.Dropout(p=0.5)) q_rnn_kwargs = { 'token_to_idx': vocab['questionTokenToIdx'], 'wordvec_dim': question_wordvec_dim, 'rnn_dim': question_hidden_dim, 'rnn_num_layers': question_num_layers, 'rnn_dropout': question_dropout, } self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) self.img_tr = nn.Sequential(nn.Linear(64, 64), nn.Dropout(p=0.5)) self.ques_tr = nn.Sequential(nn.Linear(64, 64), nn.Dropout(p=0.5)) classifier_kwargs = { 'input_dim': 64, 'hidden_dims': fc_dims, 'output_dim': len(vocab['answerTokenToIdx']), 'use_batchnorm': fc_use_batchnorm, 'dropout': fc_dropout, 'add_sigmoid': 0 } self.classifier = build_mlp(**classifier_kwargs) self.att = nn.Sequential( nn.Tanh(), nn.Dropout(p=0.5), nn.Linear(128, 1)) def forward(self, images, questions): N, T, _, _, _ = images.size() # bs x 5 x 3 x 224 x 224 img_feats = self.cnn(images.contiguous().view( -1, images.size(2), images.size(3), images.size(4))) img_feats = self.cnn_fc_layer(img_feats) img_feats_tr = self.img_tr(img_feats) ques_feats = self.q_rnn(questions) ques_feats_repl = ques_feats.view(N, 1, -1).repeat(1, T, 1) ques_feats_repl = ques_feats_repl.view(N * T, -1) ques_feats_tr = self.ques_tr(ques_feats_repl) ques_img_feats = torch.cat([ques_feats_tr, img_feats_tr], 1) att_feats = self.att(ques_img_feats) att_probs = F.softmax(att_feats.view(N, T), dim=1) att_probs2 = att_probs.view(N, T, 1).repeat(1, 1, 64) att_img_feats = torch.mul(att_probs2, img_feats.view(N, T, 64)) att_img_feats = torch.sum(att_img_feats, dim=1) mul_feats = torch.mul(ques_feats, att_img_feats) scores = self.classifier(mul_feats) return scores, att_probs # ----------- Nav ----------- class NavCnnModel(nn.Module): def __init__(self, num_frames=5, num_actions=4, question_input=False, question_vocab=False, question_wordvec_dim=64, question_hidden_dim=64, question_num_layers=2, question_dropout=0.5, fc_use_batchnorm=False, fc_dropout=0.5, fc_dims=(64, )): super(NavCnnModel, self).__init__() # cnn_kwargs = {'num_classes': 191, 'pretrained': True} # self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn_fc_layer = nn.Sequential( nn.Linear(32 * 10 * 10, 64), nn.ReLU(), nn.Dropout(p=0.5)) self.question_input = question_input if self.question_input == True: q_rnn_kwargs = { 'token_to_idx': question_vocab['questionTokenToIdx'], 'wordvec_dim': question_wordvec_dim, 'rnn_dim': question_hidden_dim, 'rnn_num_layers': question_num_layers, 'rnn_dropout': question_dropout, } self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) self.ques_tr = nn.Sequential( nn.Linear(64, 64), nn.ReLU(), nn.Dropout(p=0.5)) classifier_kwargs = { 'input_dim': 64 * num_frames + self.question_input * 64, 'hidden_dims': fc_dims, 'output_dim': num_actions, 'use_batchnorm': fc_use_batchnorm, 'dropout': fc_dropout, 'add_sigmoid': 0 } self.classifier = build_mlp(**classifier_kwargs) # batch forward, for supervised learning def forward(self, img_feats, questions=None): # bs x 5 x 3200 N, T, _ = img_feats.size() img_feats = self.cnn_fc_layer(img_feats) img_feats = img_feats.view(N, T, -1) img_feats = img_feats.view(N, -1) if self.question_input == True: ques_feats = self.q_rnn(questions) ques_feats = self.ques_tr(ques_feats) img_feats = torch.cat([ques_feats, img_feats], 1) scores = self.classifier(img_feats) return scores class NavRnnMult(nn.Module): def __init__(self, image_input=False, image_feat_dim=128, question_input=False, question_embed_dim=128, action_input=False, action_embed_dim=32, num_actions=4, mode='sl', rnn_type='LSTM', rnn_hidden_dim=128, rnn_num_layers=2, rnn_dropout=0, return_states=False): super(NavRnnMult, self).__init__() self.image_input = image_input self.image_feat_dim = image_feat_dim self.question_input = question_input self.question_embed_dim = question_embed_dim self.action_input = action_input self.action_embed_dim = action_embed_dim self.num_actions = num_actions self.rnn_type = rnn_type self.rnn_hidden_dim = rnn_hidden_dim self.rnn_num_layers = rnn_num_layers self.return_states = return_states rnn_input_dim = 0 if self.image_input == True: rnn_input_dim += image_feat_dim print('Adding input to %s: image, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) if self.question_input == True: #rnn_input_dim += question_embed_dim print('Adding input to %s: question, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) if self.action_input == True: self.action_embed = nn.Embedding(num_actions, action_embed_dim) rnn_input_dim += action_embed_dim print('Adding input to %s: action, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) self.rnn = getattr(nn, self.rnn_type)( rnn_input_dim, self.rnn_hidden_dim, self.rnn_num_layers, dropout=rnn_dropout, batch_first=True) print('Building %s with hidden dim: %d' % (self.rnn_type, rnn_hidden_dim)) self.decoder = nn.Linear(self.rnn_hidden_dim, self.num_actions) def init_hidden(self, bsz): weight = next(self.parameters()).data if self.rnn_type == 'LSTM': return (Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_()), Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_())) elif self.rnn_type == 'GRU': return Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_()) def forward(self, img_feats, question_feats, actions_in, action_lengths, hidden=False): input_feats = Variable() T = False if self.image_input == True: N, T, _ = img_feats.size() input_feats = img_feats if self.question_input == True: N, D = question_feats.size() question_feats = question_feats.view(N, 1, D) if T == False: T = actions_in.size(1) question_feats = question_feats.repeat(1, T, 1) if len(input_feats) == 0: input_feats = question_feats else: #input_feats = torch.cat([input_feats, question_feats], 2) input_feats = torch.mul(input_feats, question_feats) if self.action_input == True: if len(input_feats) == 0: input_feats = self.action_embed(actions_in) else: input_feats = torch.cat( [input_feats, self.action_embed(actions_in)], 2) packed_input_feats = pack_padded_sequence( input_feats, action_lengths, batch_first=True) packed_output, hidden = self.rnn(packed_input_feats) rnn_output, _ = pad_packed_sequence(packed_output, batch_first=True) output = self.decoder(rnn_output.contiguous().view( rnn_output.size(0) * rnn_output.size(1), rnn_output.size(2))) if self.return_states == True: return rnn_output, output, hidden else: return output, hidden def step_forward(self, img_feats, question_feats, actions_in, hidden): input_feats = Variable() T = False if self.image_input == True: N, T, _ = img_feats.size() input_feats = img_feats if self.question_input == True: N, D = question_feats.size() question_feats = question_feats.view(N, 1, D) if T == False: T = actions_in.size(1) question_feats = question_feats.repeat(1, T, 1) if len(input_feats) == 0: input_feats = question_feats else: #input_feats = torch.cat([input_feats, question_feats], 2) input_feats = torch.mul(input_feats, question_feats) if self.action_input == True: if len(input_feats) == 0: input_feats = self.action_embed(actions_in) else: input_feats = torch.cat( [input_feats, self.action_embed(actions_in)], 2) output, hidden = self.rnn(input_feats, hidden) output = self.decoder(output.contiguous().view( output.size(0) * output.size(1), output.size(2))) return output, hidden class NavRnn(nn.Module): def __init__(self, image_input=False, image_feat_dim=128, question_input=False, question_embed_dim=128, action_input=False, action_embed_dim=32, num_actions=4, mode='sl', rnn_type='LSTM', rnn_hidden_dim=128, rnn_num_layers=2, rnn_dropout=0, return_states=False): super(NavRnn, self).__init__() self.image_input = image_input self.image_feat_dim = image_feat_dim self.question_input = question_input self.question_embed_dim = question_embed_dim self.action_input = action_input self.action_embed_dim = action_embed_dim self.num_actions = num_actions self.rnn_type = rnn_type self.rnn_hidden_dim = rnn_hidden_dim self.rnn_num_layers = rnn_num_layers self.return_states = return_states rnn_input_dim = 0 if self.image_input == True: rnn_input_dim += image_feat_dim print('Adding input to %s: image, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) if self.question_input == True: rnn_input_dim += question_embed_dim print('Adding input to %s: question, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) if self.action_input == True: self.action_embed = nn.Embedding(num_actions, action_embed_dim) rnn_input_dim += action_embed_dim print('Adding input to %s: action, rnn dim: %d' % (self.rnn_type, rnn_input_dim)) self.rnn = getattr(nn, self.rnn_type)( rnn_input_dim, self.rnn_hidden_dim, self.rnn_num_layers, dropout=rnn_dropout, batch_first=True) print('Building %s with hidden dim: %d' % (self.rnn_type, rnn_hidden_dim)) self.decoder = nn.Linear(self.rnn_hidden_dim, self.num_actions) def init_hidden(self, bsz): weight = next(self.parameters()).data if self.rnn_type == 'LSTM': return (Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_()), Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_())) elif self.rnn_type == 'GRU': return Variable( weight.new(self.rnn_num_layers, bsz, self.rnn_hidden_dim) .zero_()) def forward(self, img_feats, question_feats, actions_in, action_lengths, hidden=False): input_feats = Variable() T = False if self.image_input == True: N, T, _ = img_feats.size() input_feats = img_feats if self.question_input == True: N, D = question_feats.size() question_feats = question_feats.view(N, 1, D) if T == False: T = actions_in.size(1) question_feats = question_feats.repeat(1, T, 1) if len(input_feats) == 0: input_feats = question_feats else: input_feats = torch.cat([input_feats, question_feats], 2) if self.action_input == True: if len(input_feats) == 0: input_feats = self.action_embed(actions_in) else: input_feats = torch.cat( [input_feats, self.action_embed(actions_in)], 2) packed_input_feats = pack_padded_sequence( input_feats, action_lengths, batch_first=True) packed_output, hidden = self.rnn(packed_input_feats) rnn_output, _ = pad_packed_sequence(packed_output, batch_first=True) output = self.decoder(rnn_output.contiguous().view( rnn_output.size(0) * rnn_output.size(1), rnn_output.size(2))) if self.return_states == True: return rnn_output, output, hidden else: return output, hidden def step_forward(self, img_feats, question_feats, actions_in, hidden): input_feats = Variable() T = False if self.image_input == True: N, T, _ = img_feats.size() input_feats = img_feats if self.question_input == True: N, D = question_feats.size() question_feats = question_feats.view(N, 1, D) if T == False: T = actions_in.size(1) question_feats = question_feats.repeat(1, T, 1) if len(input_feats) == 0: input_feats = question_feats else: input_feats = torch.cat([input_feats, question_feats], 2) if self.action_input == True: if len(input_feats) == 0: input_feats = self.action_embed(actions_in) else: input_feats = torch.cat( [input_feats, self.action_embed(actions_in)], 2) output, hidden = self.rnn(input_feats, hidden) output = self.decoder(output.contiguous().view( output.size(0) * output.size(1), output.size(2))) return output, hidden class NavCnnRnnMultModel(nn.Module): def __init__( self, num_output=4, # forward, left, right, stop rnn_image_input=True, rnn_image_feat_dim=128, question_input=False, question_vocab=False, question_wordvec_dim=64, question_hidden_dim=64, question_num_layers=2, question_dropout=0.5, rnn_question_embed_dim=128, rnn_action_input=True, rnn_action_embed_dim=32, rnn_type='LSTM', rnn_hidden_dim=1024, rnn_num_layers=1, rnn_dropout=0): super(NavCnnRnnMultModel, self).__init__() self.cnn_fc_layer = nn.Sequential( nn.Linear(32 * 10 * 10, rnn_image_feat_dim), nn.ReLU(), nn.Dropout(p=0.5)) self.rnn_hidden_dim = rnn_hidden_dim self.question_input = question_input if self.question_input == True: q_rnn_kwargs = { 'token_to_idx': question_vocab['questionTokenToIdx'], 'wordvec_dim': question_wordvec_dim, 'rnn_dim': question_hidden_dim, 'rnn_num_layers': question_num_layers, 'rnn_dropout': question_dropout, } self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) self.ques_tr = nn.Sequential( nn.Linear(64, rnn_image_feat_dim), nn.ReLU(), nn.Dropout(p=0.5)) self.nav_rnn = NavRnnMult( image_input=rnn_image_input, image_feat_dim=rnn_image_feat_dim, question_input=question_input, question_embed_dim=question_hidden_dim, action_input=rnn_action_input, action_embed_dim=rnn_action_embed_dim, num_actions=num_output, rnn_type=rnn_type, rnn_hidden_dim=rnn_hidden_dim, rnn_num_layers=rnn_num_layers, rnn_dropout=rnn_dropout) def forward(self, img_feats, questions, actions_in, action_lengths, hidden=False, step=False): N, T, _ = img_feats.size() # B x T x 128 img_feats = self.cnn_fc_layer(img_feats) if self.question_input == True: ques_feats = self.q_rnn(questions) ques_feats = self.ques_tr(ques_feats) if step == True: output, hidden = self.nav_rnn.step_forward( img_feats, ques_feats, actions_in, hidden) else: output, hidden = self.nav_rnn(img_feats, ques_feats, actions_in, action_lengths) else: if step == True: output, hidden = self.nav_rnn.step_forward( img_feats, False, actions_in, hidden) else: output, hidden = self.nav_rnn(img_feats, False, actions_in, action_lengths) return output, hidden class NavCnnRnnModel(nn.Module): def __init__( self, num_output=4, # forward, left, right, stop rnn_image_input=True, rnn_image_feat_dim=128, question_input=False, question_vocab=False, question_wordvec_dim=64, question_hidden_dim=64, question_num_layers=2, question_dropout=0.5, rnn_question_embed_dim=128, rnn_action_input=True, rnn_action_embed_dim=32, rnn_type='LSTM', rnn_hidden_dim=1024, rnn_num_layers=1, rnn_dropout=0): super(NavCnnRnnModel, self).__init__() self.cnn_fc_layer = nn.Sequential( nn.Linear(32 * 10 * 10, rnn_image_feat_dim), nn.ReLU(), nn.Dropout(p=0.5)) self.rnn_hidden_dim = rnn_hidden_dim self.question_input = question_input if self.question_input == True: q_rnn_kwargs = { 'token_to_idx': question_vocab['questionTokenToIdx'], 'wordvec_dim': question_wordvec_dim, 'rnn_dim': question_hidden_dim, 'rnn_num_layers': question_num_layers, 'rnn_dropout': question_dropout, } self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) self.ques_tr = nn.Sequential( nn.Linear(64, 64), nn.ReLU(), nn.Dropout(p=0.5)) self.nav_rnn = NavRnn( image_input=rnn_image_input, image_feat_dim=rnn_image_feat_dim, question_input=question_input, question_embed_dim=question_hidden_dim, action_input=rnn_action_input, action_embed_dim=rnn_action_embed_dim, num_actions=num_output, rnn_type=rnn_type, rnn_hidden_dim=rnn_hidden_dim, rnn_num_layers=rnn_num_layers, rnn_dropout=rnn_dropout) def forward(self, img_feats, questions, actions_in, action_lengths, hidden=False, step=False): N, T, _ = img_feats.size() # B x T x 128 img_feats = self.cnn_fc_layer(img_feats) if self.question_input == True: ques_feats = self.q_rnn(questions) ques_feats = self.ques_tr(ques_feats) if step == True: output, hidden = self.nav_rnn.step_forward( img_feats, ques_feats, actions_in, hidden) else: output, hidden = self.nav_rnn(img_feats, ques_feats, actions_in, action_lengths) else: if step == True: output, hidden = self.nav_rnn.step_forward( img_feats, False, actions_in, hidden) else: output, hidden = self.nav_rnn(img_feats, False, actions_in, action_lengths) return output, hidden class NavPlannerControllerModel(nn.Module): def __init__(self, question_vocab, num_output=4, question_wordvec_dim=64, question_hidden_dim=64, question_num_layers=2, question_dropout=0.5, planner_rnn_image_feat_dim=128, planner_rnn_action_embed_dim=32, planner_rnn_type='GRU', planner_rnn_hidden_dim=1024, planner_rnn_num_layers=1, planner_rnn_dropout=0, controller_fc_dims=(256, )): super(NavPlannerControllerModel, self).__init__() self.cnn_fc_layer = nn.Sequential( nn.Linear(32 * 10 * 10, planner_rnn_image_feat_dim), nn.ReLU(), nn.Dropout(p=0.5)) q_rnn_kwargs = { 'token_to_idx': question_vocab['questionTokenToIdx'], 'wordvec_dim': question_wordvec_dim, 'rnn_dim': question_hidden_dim, 'rnn_num_layers': question_num_layers, 'rnn_dropout': question_dropout, } self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) self.ques_tr = nn.Sequential( nn.Linear(question_hidden_dim, question_hidden_dim), nn.ReLU(), nn.Dropout(p=0.5)) self.planner_nav_rnn = NavRnn( image_input=True, image_feat_dim=planner_rnn_image_feat_dim, question_input=True, question_embed_dim=question_hidden_dim, action_input=True, action_embed_dim=planner_rnn_action_embed_dim, num_actions=num_output, rnn_type=planner_rnn_type, rnn_hidden_dim=planner_rnn_hidden_dim, rnn_num_layers=planner_rnn_num_layers, rnn_dropout=planner_rnn_dropout, return_states=True) controller_kwargs = { 'input_dim': planner_rnn_image_feat_dim + planner_rnn_action_embed_dim + planner_rnn_hidden_dim, 'hidden_dims': controller_fc_dims, 'output_dim': 2, 'add_sigmoid': 0 } self.controller = build_mlp(**controller_kwargs) def forward(self, questions, planner_img_feats, planner_actions_in, planner_action_lengths, planner_hidden_index, controller_img_feats, controller_actions_in, controller_action_lengths, planner_hidden=False): # ts = time.time() N_p, T_p, _ = planner_img_feats.size() planner_img_feats = self.cnn_fc_layer(planner_img_feats) controller_img_feats = self.cnn_fc_layer(controller_img_feats) ques_feats = self.q_rnn(questions) ques_feats = self.ques_tr(ques_feats) planner_states, planner_scores, planner_hidden = self.planner_nav_rnn( planner_img_feats, ques_feats, planner_actions_in, planner_action_lengths) planner_hidden_index = planner_hidden_index[:, : controller_action_lengths. max()] controller_img_feats = controller_img_feats[:, : controller_action_lengths. max()] controller_actions_in = controller_actions_in[:, : controller_action_lengths. max()] N_c, T_c, _ = controller_img_feats.size() assert planner_hidden_index.max().data[0] < planner_states.size(1) planner_hidden_index = planner_hidden_index.contiguous().view( N_p, planner_hidden_index.size(1), 1).repeat( 1, 1, planner_states.size(2)) controller_hidden_in = planner_states.gather(1, planner_hidden_index) controller_hidden_in = controller_hidden_in.view( N_c * T_c, controller_hidden_in.size(2)) controller_img_feats = controller_img_feats.contiguous().view( N_c * T_c, -1) controller_actions_embed = self.planner_nav_rnn.action_embed( controller_actions_in).view(N_c * T_c, -1) controller_in = torch.cat([ controller_img_feats, controller_actions_embed, controller_hidden_in ], 1) controller_scores = self.controller(controller_in) return planner_scores, controller_scores, planner_hidden def planner_step(self, questions, img_feats, actions_in, planner_hidden): img_feats = self.cnn_fc_layer(img_feats) ques_feats = self.q_rnn(questions) ques_feats = self.ques_tr(ques_feats) planner_scores, planner_hidden = self.planner_nav_rnn.step_forward( img_feats, ques_feats, actions_in, planner_hidden) return planner_scores, planner_hidden def controller_step(self, img_feats, actions_in, hidden_in): img_feats = self.cnn_fc_layer(img_feats) actions_embed = self.planner_nav_rnn.action_embed(actions_in) img_feats = img_feats.view(1, -1) actions_embed = actions_embed.view(1, -1) hidden_in = hidden_in.view(1, -1) controller_in = torch.cat([img_feats, actions_embed, hidden_in], 1) controller_scores = self.controller(controller_in) return controller_scores ================================================ FILE: training/train_eqa.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import h5py import time import argparse import numpy as np import os, sys, json from tqdm import tqdm import torch import torch.nn.functional as F from torch.autograd import Variable torch.backends.cudnn.enabled = False import torch.multiprocessing as mp from models import NavCnnModel, NavCnnRnnModel, NavPlannerControllerModel, VqaLstmCnnAttentionModel from data import EqaDataset, EqaDataLoader from metrics import NavMetric, VqaMetric from models import MaskedNLLCriterion from models import get_state, repackage_hidden, ensure_shared_grads from data import load_vocab, flat_to_hierarchical_actions def eval(rank, args, shared_nav_model, shared_ans_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} nav_model = NavPlannerControllerModel(**model_kwargs) else: exit() model_kwargs = {'vocab': load_vocab(args.vocab_json)} ans_model = VqaLstmCnnAttentionModel(**model_kwargs) eval_loader_kwargs = { 'questions_h5': getattr(args, args.eval_split + '_h5'), 'data_json': args.data_json, 'vocab': args.vocab_json, 'target_obj_conn_map_dir': args.target_obj_conn_map_dir, 'map_resolution': args.map_resolution, 'batch_size': 1, 'input_type': args.model_type, 'num_frames': 5, 'split': args.eval_split, 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank % len(args.gpus)], 'to_cache': False, 'max_controller_actions': args.max_controller_actions, 'max_actions': args.max_actions } eval_loader = EqaDataLoader(**eval_loader_kwargs) print('eval_loader has %d samples' % len(eval_loader.dataset)) args.output_nav_log_path = os.path.join(args.log_dir, 'nav_eval_' + str(rank) + '.json') args.output_ans_log_path = os.path.join(args.log_dir, 'ans_eval_' + str(rank) + '.json') t, epoch, best_eval_acc = 0, 0, 0.0 while epoch < int(args.max_epochs): start_time = time.time() invalids = [] nav_model.load_state_dict(shared_nav_model.state_dict()) nav_model.eval() ans_model.load_state_dict(shared_ans_model.state_dict()) ans_model.eval() ans_model.cuda() # that's a lot of numbers nav_metrics = NavMetric( info={'split': args.eval_split, 'thread': rank}, metric_names=[ 'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50', 'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30', 'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30', 'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10', 'ep_len_30', 'ep_len_50' ], log_json=args.output_nav_log_path) vqa_metrics = VqaMetric( info={'split': args.eval_split, 'thread': rank}, metric_names=[ 'accuracy_10', 'accuracy_30', 'accuracy_50', 'mean_rank_10', 'mean_rank_30', 'mean_rank_50', 'mean_reciprocal_rank_10', 'mean_reciprocal_rank_30', 'mean_reciprocal_rank_50' ], log_json=args.output_ans_log_path) if 'pacman' in args.model_type: done = False while done == False: for batch in tqdm(eval_loader): nav_model.load_state_dict(shared_nav_model.state_dict()) nav_model.eval() nav_model.cuda() idx, question, answer, actions, action_length = batch metrics_slug = {} h3d = eval_loader.dataset.episode_house # evaluate at multiple initializations for i in [10, 30, 50]: t += 1 if i > action_length[0]: invalids.append([idx[0], i]) continue question_var = Variable(question.cuda()) controller_step = False planner_hidden = nav_model.planner_nav_rnn.init_hidden( 1) # forward through planner till spawn ( planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos, controller_action_counter ) = eval_loader.dataset.get_hierarchical_features_till_spawn( actions[0, :action_length[0] + 1].numpy(), i, args.max_controller_actions ) planner_actions_in_var = Variable( planner_actions_in.cuda()) planner_img_feats_var = Variable( planner_img_feats.cuda()) for step in range(planner_actions_in.size(0)): planner_scores, planner_hidden = nav_model.planner_step( question_var, planner_img_feats_var[step].view( 1, 1, 3200), planner_actions_in_var[step].view( 1, 1), planner_hidden) if controller_step == True: controller_img_feat_var = Variable( controller_img_feat.cuda()) controller_action_in_var = Variable( torch.LongTensor(1, 1).fill_( int(controller_action_in)).cuda()) controller_scores = nav_model.controller_step( controller_img_feat_var.view(1, 1, 3200), controller_action_in_var.view(1, 1), planner_hidden[0]) prob = F.softmax(controller_scores, dim=1) controller_action = int( prob.max(1)[1].data.cpu().numpy()[0]) if controller_action == 1: controller_step = True else: controller_step = False action = int(controller_action_in) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() else: prob = F.softmax(planner_scores, dim=1) action = int(prob.max(1)[1].data.cpu().numpy()[0]) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() h3d.env.reset( x=init_pos[0], y=init_pos[2], yaw=init_pos[3]) init_dist_to_target = h3d.get_dist_to_target( h3d.env.cam.pos) if init_dist_to_target < 0: # unreachable invalids.append([idx[0], i]) continue episode_length = 0 episode_done = True controller_action_counter = 0 dists_to_target, pos_queue, pred_actions = [ init_dist_to_target ], [init_pos], [] planner_actions, controller_actions = [], [] if action != 3: # take the first step img, _, _ = h3d.step(action) img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224).cuda())).view( 1, 1, 3200) for step in range(args.max_episode_length): episode_length += 1 if controller_step == False: planner_scores, planner_hidden = nav_model.planner_step( question_var, img_feat_var, Variable(action_in), planner_hidden) prob = F.softmax(planner_scores, dim=1) action = int( prob.max(1)[1].data.cpu().numpy()[0]) planner_actions.append(action) pred_actions.append(action) img, _, episode_done = h3d.step(action) episode_done = episode_done or episode_length >= args.max_episode_length img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224) .cuda())).view(1, 1, 3200) dists_to_target.append( h3d.get_dist_to_target(h3d.env.cam.pos)) pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.y, h3d.env.cam.pos.z, h3d.env.cam.yaw ]) if episode_done == True: break # query controller to continue or not controller_action_in = Variable( torch.LongTensor(1, 1).fill_(action).cuda()) controller_scores = nav_model.controller_step( img_feat_var, controller_action_in, planner_hidden[0]) prob = F.softmax(controller_scores, dim=1) controller_action = int( prob.max(1)[1].data.cpu().numpy()[0]) if controller_action == 1 and controller_action_counter < 4: controller_action_counter += 1 controller_step = True else: controller_action_counter = 0 controller_step = False controller_action = 0 controller_actions.append(controller_action) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() # run answerer here if len(pos_queue) < 5: pos_queue = eval_loader.dataset.episode_pos_queue[len( pos_queue) - 5:] + pos_queue images = eval_loader.dataset.get_frames( h3d, pos_queue[-5:], preprocess=True) images_var = Variable( torch.from_numpy(images).cuda()).view( 1, 5, 3, 224, 224) scores, att_probs = ans_model(images_var, question_var) ans_acc, ans_rank = vqa_metrics.compute_ranks( scores.data.cpu(), answer) pred_answer = scores.max(1)[1].data[0] print('[Q_GT]', ' '.join([ eval_loader.dataset.vocab['questionIdxToToken'][x] for x in question[0] if x != 0 ])) print('[A_GT]', eval_loader.dataset.vocab[ 'answerIdxToToken'][answer[0]]) print('[A_PRED]', eval_loader.dataset.vocab[ 'answerIdxToToken'][pred_answer]) # compute stats metrics_slug['accuracy_' + str(i)] = ans_acc[0] metrics_slug['mean_rank_' + str(i)] = ans_rank[0] metrics_slug['mean_reciprocal_rank_' + str(i)] = 1.0 / ans_rank[0] metrics_slug['d_0_' + str(i)] = dists_to_target[0] metrics_slug['d_T_' + str(i)] = dists_to_target[-1] metrics_slug['d_D_' + str( i)] = dists_to_target[0] - dists_to_target[-1] metrics_slug['d_min_' + str(i)] = np.array( dists_to_target).min() metrics_slug['ep_len_' + str(i)] = episode_length if action == 3: metrics_slug['stop_' + str(i)] = 1 else: metrics_slug['stop_' + str(i)] = 0 inside_room = [] for p in pos_queue: inside_room.append( h3d.is_inside_room( p, eval_loader.dataset.target_room)) if inside_room[-1] == True: metrics_slug['r_T_' + str(i)] = 1 else: metrics_slug['r_T_' + str(i)] = 0 if any([x == True for x in inside_room]) == True: metrics_slug['r_e_' + str(i)] = 1 else: metrics_slug['r_e_' + str(i)] = 0 # navigation metrics metrics_list = [] for i in nav_metrics.metric_names: if i not in metrics_slug: metrics_list.append(nav_metrics.metrics[ nav_metrics.metric_names.index(i)][0]) else: metrics_list.append(metrics_slug[i]) nav_metrics.update(metrics_list) # vqa metrics metrics_list = [] for i in vqa_metrics.metric_names: if i not in metrics_slug: metrics_list.append(vqa_metrics.metrics[ vqa_metrics.metric_names.index(i)][0]) else: metrics_list.append(metrics_slug[i]) vqa_metrics.update(metrics_list) try: print(nav_metrics.get_stat_string(mode=0)) print(vqa_metrics.get_stat_string(mode=0)) except: pass print('epoch', epoch) print('invalids', len(invalids)) eval_loader.dataset._load_envs() if len(eval_loader.dataset.pruned_env_set) == 0: done = True epoch += 1 # checkpoint if best val accuracy if vqa_metrics.metrics[2][0] > best_eval_acc: # ans_acc_50 best_eval_acc = vqa_metrics.metrics[2][0] if epoch % args.eval_every == 0 and args.log == True: vqa_metrics.dump_log() nav_metrics.dump_log() model_state = get_state(nav_model) aad = dict(args.__dict__) ad = {} for i in aad: if i[0] != '_': ad[i] = aad[i] checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch} checkpoint_path = '%s/epoch_%d_ans_50_%.04f.pt' % ( args.checkpoint_dir, epoch, best_eval_acc) print('Saving checkpoint to %s' % checkpoint_path) torch.save(checkpoint, checkpoint_path) print('[best_eval_ans_acc_50:%.04f]' % best_eval_acc) eval_loader.dataset._load_envs(start_idx=0, in_order=True) def train(rank, args, shared_nav_model, shared_ans_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} nav_model = NavPlannerControllerModel(**model_kwargs) else: exit() model_kwargs = {'vocab': load_vocab(args.vocab_json)} ans_model = VqaLstmCnnAttentionModel(**model_kwargs) optim = torch.optim.SGD( filter(lambda p: p.requires_grad, shared_nav_model.parameters()), lr=args.learning_rate) train_loader_kwargs = { 'questions_h5': args.train_h5, 'data_json': args.data_json, 'vocab': args.vocab_json, 'target_obj_conn_map_dir': args.target_obj_conn_map_dir, 'map_resolution': args.map_resolution, 'batch_size': 1, 'input_type': args.model_type, 'num_frames': 5, 'split': 'train', 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank % len(args.gpus)], 'to_cache': args.cache, 'max_controller_actions': args.max_controller_actions, 'max_actions': args.max_actions } args.output_nav_log_path = os.path.join(args.log_dir, 'nav_train_' + str(rank) + '.json') args.output_ans_log_path = os.path.join(args.log_dir, 'ans_train_' + str(rank) + '.json') nav_model.load_state_dict(shared_nav_model.state_dict()) nav_model.cuda() ans_model.load_state_dict(shared_ans_model.state_dict()) ans_model.eval() ans_model.cuda() nav_metrics = NavMetric( info={'split': 'train', 'thread': rank}, metric_names=[ 'planner_loss', 'controller_loss', 'reward', 'episode_length' ], log_json=args.output_nav_log_path) vqa_metrics = VqaMetric( info={'split': 'train', 'thread': rank}, metric_names=['accuracy', 'mean_rank', 'mean_reciprocal_rank'], log_json=args.output_ans_log_path) train_loader = EqaDataLoader(**train_loader_kwargs) print('train_loader has %d samples' % len(train_loader.dataset)) t, epoch = 0, 0 p_losses, c_losses, reward_list, episode_length_list = [], [], [], [] nav_metrics.update([10.0, 10.0, 0, 100]) mult = 0.1 while epoch < int(args.max_epochs): if 'pacman' in args.model_type: planner_lossFn = MaskedNLLCriterion().cuda() controller_lossFn = MaskedNLLCriterion().cuda() done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() while done == False: for batch in train_loader: nav_model.load_state_dict(shared_nav_model.state_dict()) nav_model.eval() nav_model.cuda() idx, question, answer, actions, action_length = batch metrics_slug = {} h3d = train_loader.dataset.episode_house # evaluate at multiple initializations # for i in [10, 30, 50]: t += 1 question_var = Variable(question.cuda()) controller_step = False planner_hidden = nav_model.planner_nav_rnn.init_hidden(1) # forward through planner till spawn ( planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos, controller_action_counter ) = train_loader.dataset.get_hierarchical_features_till_spawn( actions[0, :action_length[0] + 1].numpy(), max(3, int(mult * action_length[0])), args.max_controller_actions ) planner_actions_in_var = Variable( planner_actions_in.cuda()) planner_img_feats_var = Variable(planner_img_feats.cuda()) for step in range(planner_actions_in.size(0)): planner_scores, planner_hidden = nav_model.planner_step( question_var, planner_img_feats_var[step].view( 1, 1, 3200), planner_actions_in_var[step].view( 1, 1), planner_hidden) if controller_step == True: controller_img_feat_var = Variable( controller_img_feat.cuda()) controller_action_in_var = Variable( torch.LongTensor(1, 1).fill_( int(controller_action_in)).cuda()) controller_scores = nav_model.controller_step( controller_img_feat_var.view(1, 1, 3200), controller_action_in_var.view(1, 1), planner_hidden[0]) prob = F.softmax(controller_scores, dim=1) controller_action = int( prob.max(1)[1].data.cpu().numpy()[0]) if controller_action == 1: controller_step = True else: controller_step = False action = int(controller_action_in) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() else: prob = F.softmax(planner_scores, dim=1) action = int(prob.max(1)[1].data.cpu().numpy()[0]) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() h3d.env.reset( x=init_pos[0], y=init_pos[2], yaw=init_pos[3]) init_dist_to_target = h3d.get_dist_to_target( h3d.env.cam.pos) if init_dist_to_target < 0: # unreachable # invalids.append([idx[0], i]) continue episode_length = 0 episode_done = True controller_action_counter = 0 dists_to_target, pos_queue = [init_dist_to_target], [ init_pos ] rewards, planner_actions, planner_log_probs, controller_actions, controller_log_probs = [], [], [], [], [] if action != 3: # take the first step img, rwd, episode_done = h3d.step(action, step_reward=True) img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = train_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224).cuda())).view( 1, 1, 3200) for step in range(args.max_episode_length): episode_length += 1 if controller_step == False: planner_scores, planner_hidden = nav_model.planner_step( question_var, img_feat_var, Variable(action_in), planner_hidden) planner_prob = F.softmax(planner_scores, dim=1) planner_log_prob = F.log_softmax( planner_scores, dim=1) action = planner_prob.multinomial().data planner_log_prob = planner_log_prob.gather( 1, Variable(action)) planner_log_probs.append( planner_log_prob.cpu()) action = int(action.cpu().numpy()[0, 0]) planner_actions.append(action) img, rwd, episode_done = h3d.step(action, step_reward=True) episode_done = episode_done or episode_length >= args.max_episode_length rewards.append(rwd) img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = train_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224) .cuda())).view(1, 1, 3200) dists_to_target.append( h3d.get_dist_to_target(h3d.env.cam.pos)) pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.y, h3d.env.cam.pos.z, h3d.env.cam.yaw ]) if episode_done == True: break # query controller to continue or not controller_action_in = Variable( torch.LongTensor(1, 1).fill_(action).cuda()) controller_scores = nav_model.controller_step( img_feat_var, controller_action_in, planner_hidden[0]) controller_prob = F.softmax( controller_scores, dim=1) controller_log_prob = F.log_softmax( controller_scores, dim=1) controller_action = controller_prob.multinomial( ).data if int(controller_action[0] ) == 1 and controller_action_counter < 4: controller_action_counter += 1 controller_step = True else: controller_action_counter = 0 controller_step = False controller_action.fill_(0) controller_log_prob = controller_log_prob.gather( 1, Variable(controller_action)) controller_log_probs.append( controller_log_prob.cpu()) controller_action = int( controller_action.cpu().numpy()[0, 0]) controller_actions.append(controller_action) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() # run answerer here ans_acc = [0] if action == 3: if len(pos_queue) < 5: pos_queue = train_loader.dataset.episode_pos_queue[len( pos_queue) - 5:] + pos_queue images = train_loader.dataset.get_frames( h3d, pos_queue[-5:], preprocess=True) images_var = Variable( torch.from_numpy(images).cuda()).view( 1, 5, 3, 224, 224) scores, att_probs = ans_model(images_var, question_var) ans_acc, ans_rank = vqa_metrics.compute_ranks( scores.data.cpu(), answer) vqa_metrics.update([ans_acc, ans_rank, 1.0 / ans_rank]) rewards.append(h3d.success_reward * ans_acc[0]) R = torch.zeros(1, 1) planner_loss = 0 controller_loss = 0 planner_rev_idx = -1 for i in reversed(range(len(rewards))): R = 0.99 * R + rewards[i] advantage = R - nav_metrics.metrics[2][1] if i < len(controller_actions): controller_loss = controller_loss - controller_log_probs[i] * Variable( advantage) if controller_actions[i] == 0 and planner_rev_idx + len(planner_log_probs) >= 0: planner_loss = planner_loss - planner_log_probs[planner_rev_idx] * Variable( advantage) planner_rev_idx -= 1 elif planner_rev_idx + len(planner_log_probs) >= 0: planner_loss = planner_loss - planner_log_probs[planner_rev_idx] * Variable( advantage) planner_rev_idx -= 1 controller_loss /= max(1, len(controller_log_probs)) planner_loss /= max(1, len(planner_log_probs)) optim.zero_grad() if isinstance(planner_loss, float) == False and isinstance( controller_loss, float) == False: p_losses.append(planner_loss.data[0, 0]) c_losses.append(controller_loss.data[0, 0]) reward_list.append(np.sum(rewards)) episode_length_list.append(episode_length) (planner_loss + controller_loss).backward() ensure_shared_grads(nav_model.cpu(), shared_nav_model) optim.step() if len(reward_list) > 50: nav_metrics.update([ p_losses, c_losses, reward_list, episode_length_list ]) print(nav_metrics.get_stat_string()) if args.log == True: nav_metrics.dump_log() if nav_metrics.metrics[2][1] > 0.35: mult = min(mult + 0.1, 1.0) p_losses, c_losses, reward_list, episode_length_list = [], [], [], [] if all_envs_loaded == False: train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True if args.cache == False: train_loader.dataset._load_envs( start_idx=0, in_order=True) else: done = True epoch += 1 if __name__ == '__main__': parser = argparse.ArgumentParser() # data params parser.add_argument('-train_h5', default='data/train.h5') parser.add_argument('-val_h5', default='data/val.h5') parser.add_argument('-test_h5', default='data/test.h5') parser.add_argument('-data_json', default='data/data.json') parser.add_argument('-vocab_json', default='data/vocab.json') parser.add_argument( '-target_obj_conn_map_dir', default='/path/to/target-obj-conn-maps/500') parser.add_argument('-map_resolution', default=500, type=int) parser.add_argument( '-mode', default='train+eval', type=str, choices=['train', 'eval', 'train+eval']) parser.add_argument('-eval_split', default='val', type=str) # model details parser.add_argument( '-model_type', default='pacman', choices=['cnn', 'cnn+q', 'lstm', 'lstm+q', 'pacman']) parser.add_argument('-max_episode_length', default=100, type=int) # optim params parser.add_argument('-batch_size', default=20, type=int) parser.add_argument('-learning_rate', default=1e-5, type=float) parser.add_argument('-max_epochs', default=1000, type=int) # bookkeeping parser.add_argument('-print_every', default=5, type=int) parser.add_argument('-eval_every', default=1, type=int) parser.add_argument('-identifier', default='cnn') parser.add_argument('-num_processes', default=1, type=int) parser.add_argument('-max_threads_per_gpu', default=10, type=int) # checkpointing parser.add_argument('-nav_checkpoint_path', default=False) parser.add_argument('-ans_checkpoint_path', default=False) parser.add_argument('-checkpoint_dir', default='checkpoints/eqa/') parser.add_argument('-log_dir', default='logs/eqa/') parser.add_argument('-log', default=False, action='store_true') parser.add_argument('-cache', default=False, action='store_true') parser.add_argument('-max_controller_actions', type=int, default=5) parser.add_argument('-max_actions', type=int) args = parser.parse_args() args.time_id = time.strftime("%m_%d_%H:%M") try: args.gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') args.gpus = [int(x) for x in args.gpus] except KeyError: print("CPU not supported") exit() # Load navigation model if args.nav_checkpoint_path != False: print('Loading navigation checkpoint from %s' % args.nav_checkpoint_path) checkpoint = torch.load( args.nav_checkpoint_path, map_location={ 'cuda:0': 'cpu' }) args_to_keep = ['model_type'] for i in args.__dict__: if i not in args_to_keep: checkpoint['args'][i] = args.__dict__[i] args = type('new_dict', (object, ), checkpoint['args']) args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.time_id + '_' + args.identifier) args.log_dir = os.path.join(args.log_dir, args.time_id + '_' + args.identifier) print(args.__dict__) if not os.path.exists(args.checkpoint_dir) and args.log == True: os.makedirs(args.checkpoint_dir) os.makedirs(args.log_dir) if args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} shared_nav_model = NavPlannerControllerModel(**model_kwargs) else: exit() shared_nav_model.share_memory() if args.nav_checkpoint_path != False: print('Loading navigation params from checkpoint: %s' % args.nav_checkpoint_path) shared_nav_model.load_state_dict(checkpoint['state']) # Load answering model if args.ans_checkpoint_path != False: print('Loading answering checkpoint from %s' % args.ans_checkpoint_path) ans_checkpoint = torch.load( args.ans_checkpoint_path, map_location={ 'cuda:0': 'cpu' }) ans_model_kwargs = {'vocab': load_vocab(args.vocab_json)} shared_ans_model = VqaLstmCnnAttentionModel(**ans_model_kwargs) shared_ans_model.share_memory() if args.ans_checkpoint_path != False: print('Loading params from checkpoint: %s' % args.ans_checkpoint_path) shared_ans_model.load_state_dict(ans_checkpoint['state']) if args.mode == 'eval': eval(0, args, shared_nav_model, shared_ans_model) elif args.mode == 'train': train(0, args, shared_nav_model, shared_ans_model) else: processes = [] p = mp.Process( target=eval, args=(0, args, shared_nav_model, shared_ans_model)) p.start() processes.append(p) for rank in range(1, args.num_processes + 1): p = mp.Process( target=train, args=(rank, args, shared_nav_model, shared_ans_model)) p.start() processes.append(p) for p in processes: p.join() ================================================ FILE: training/train_nav.py ================================================ import time import argparse from datetime import datetime import logging import numpy as np import os import torch import torch.nn.functional as F import torch.multiprocessing as mp from models import NavCnnModel, NavCnnRnnModel, NavCnnRnnMultModel, NavPlannerControllerModel from data import EqaDataLoader from metrics import NavMetric from models import MaskedNLLCriterion from models import get_state, ensure_shared_grads from data import load_vocab from torch.autograd import Variable from tqdm import tqdm import time torch.backends.cudnn.enabled = False ################################################################################################ #make models trained in pytorch 4 compatible with earlier pytorch versions import torch._utils try: torch._utils._rebuild_tensor_v2 except AttributeError: def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks): tensor = torch._utils._rebuild_tensor(storage, storage_offset, size, stride) tensor.requires_grad = requires_grad tensor._backward_hooks = backward_hooks return tensor torch._utils._rebuild_tensor_v2 = _rebuild_tensor_v2 ################################################################################################ def eval(rank, args, shared_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.model_type == 'cnn': model_kwargs = {} model = NavCnnModel(**model_kwargs) elif args.model_type == 'cnn+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnModel(**model_kwargs) elif args.model_type == 'lstm': model_kwargs = {} model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'lstm+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'lstm-mult+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnRnnMultModel(**model_kwargs) elif args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} model = NavPlannerControllerModel(**model_kwargs) else: exit() eval_loader_kwargs = { 'questions_h5': getattr(args, args.eval_split + '_h5'), 'data_json': args.data_json, 'vocab': args.vocab_json, 'target_obj_conn_map_dir': args.target_obj_conn_map_dir, 'map_resolution': args.map_resolution, 'batch_size': 1, 'input_type': args.model_type, 'num_frames': 5, 'split': args.eval_split, 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank % len(args.gpus)], 'to_cache': False, 'overfit': args.overfit, 'max_controller_actions': args.max_controller_actions, } eval_loader = EqaDataLoader(**eval_loader_kwargs) print('eval_loader has %d samples' % len(eval_loader.dataset)) logging.info("EVAL: eval_loader has {} samples".format(len(eval_loader.dataset))) args.output_log_path = os.path.join(args.log_dir, 'eval_' + str(rank) + '.json') t, epoch, best_eval_acc = 0, 0, 0.0 max_epochs = args.max_epochs if args.mode == 'eval': max_epochs = 1 while epoch < int(max_epochs): invalids = [] model.load_state_dict(shared_model.state_dict()) model.eval() # that's a lot of numbers metrics = NavMetric( info={'split': args.eval_split, 'thread': rank}, metric_names=[ 'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50', 'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30', 'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30', 'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10', 'ep_len_30', 'ep_len_50' ], log_json=args.output_log_path) if 'cnn' in args.model_type: done = False while done == False: for batch in tqdm(eval_loader): model.load_state_dict(shared_model.state_dict()) model.cuda() idx, questions, _, img_feats, actions_in, actions_out, action_length = batch metrics_slug = {} # evaluate at multiple initializations for i in [10, 30, 50]: t += 1 if action_length[0] + 1 - i - 5 < 0: invalids.append(idx[0]) continue ep_inds = [ x for x in range(action_length[0] + 1 - i - 5, action_length[0] + 1 - i) ] sub_img_feats = torch.index_select( img_feats, 1, torch.LongTensor(ep_inds)) init_pos = eval_loader.dataset.episode_pos_queue[ ep_inds[-1]] h3d = eval_loader.dataset.episode_house h3d.env.reset( x=init_pos[0], y=init_pos[2], yaw=init_pos[3]) init_dist_to_target = h3d.get_dist_to_target( h3d.env.cam.pos) if init_dist_to_target < 0: # unreachable invalids.append(idx[0]) continue sub_img_feats_var = Variable(sub_img_feats.cuda()) if '+q' in args.model_type: questions_var = Variable(questions.cuda()) # sample actions till max steps or # max no. of actions = 100 episode_length = 0 episode_done = True dists_to_target, pos_queue, actions = [ init_dist_to_target ], [init_pos], [] for step in range(args.max_episode_length): episode_length += 1 if '+q' in args.model_type: scores = model(sub_img_feats_var, questions_var) else: scores = model(sub_img_feats_var) prob = F.softmax(scores, dim=1) action = int(prob.max(1)[1].data.cpu().numpy()[0]) actions.append(action) img, _, episode_done = h3d.step(action) episode_done = episode_done or episode_length >= args.max_episode_length img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224) .cuda())).view(1, 1, 3200) sub_img_feats_var = torch.cat( [sub_img_feats_var, img_feat_var], dim=1) sub_img_feats_var = sub_img_feats_var[:, -5:, :] dists_to_target.append( h3d.get_dist_to_target(h3d.env.cam.pos)) pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.y, h3d.env.cam.pos.z, h3d.env.cam.yaw ]) if episode_done == True: break # compute stats metrics_slug['d_0_' + str(i)] = dists_to_target[0] metrics_slug['d_T_' + str(i)] = dists_to_target[-1] metrics_slug['d_D_' + str( i)] = dists_to_target[0] - dists_to_target[-1] metrics_slug['d_min_' + str(i)] = np.array( dists_to_target).min() metrics_slug['ep_len_' + str(i)] = episode_length if action == 3: metrics_slug['stop_' + str(i)] = 1 else: metrics_slug['stop_' + str(i)] = 0 inside_room = [] for p in pos_queue: inside_room.append( h3d.is_inside_room( p, eval_loader.dataset.target_room)) if inside_room[-1] == True: metrics_slug['r_T_' + str(i)] = 1 else: metrics_slug['r_T_' + str(i)] = 0 if any([x == True for x in inside_room]) == True: metrics_slug['r_e_' + str(i)] = 1 else: metrics_slug['r_e_' + str(i)] = 0 # collate and update metrics metrics_list = [] for i in metrics.metric_names: if i not in metrics_slug: metrics_list.append(metrics.metrics[ metrics.metric_names.index(i)][0]) else: metrics_list.append(metrics_slug[i]) # update metrics metrics.update(metrics_list) print(metrics.get_stat_string(mode=0)) print('invalids', len(invalids)) logging.info("EVAL: metrics: {}".format(metrics.get_stat_string(mode=0))) logging.info("EVAL: invalids: {}".format(len(invalids))) # del h3d eval_loader.dataset._load_envs() if len(eval_loader.dataset.pruned_env_set) == 0: done = True elif 'lstm' in args.model_type: done = False while done == False: if args.overfit: metrics = NavMetric( info={'split': args.eval_split, 'thread': rank}, metric_names=[ 'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50', 'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30', 'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30', 'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10', 'ep_len_30', 'ep_len_50' ], log_json=args.output_log_path) for batch in tqdm(eval_loader): model.load_state_dict(shared_model.state_dict()) model.cuda() idx, questions, answer, _, actions_in, actions_out, action_lengths, _ = batch question_var = Variable(questions.cuda()) metrics_slug = {} # evaluate at multiple initializations for i in [10, 30, 50]: t += 1 if action_lengths[0] - 1 - i < 0: invalids.append([idx[0], i]) continue h3d = eval_loader.dataset.episode_house # forward through lstm till spawn if len(eval_loader.dataset.episode_pos_queue[:-i] ) > 0: images = eval_loader.dataset.get_frames( h3d, eval_loader.dataset.episode_pos_queue[:-i], preprocess=True) raw_img_feats = eval_loader.dataset.cnn( Variable(torch.FloatTensor(images).cuda())) actions_in_pruned = actions_in[:, : action_lengths[0] - i] actions_in_var = Variable(actions_in_pruned.cuda()) action_lengths_pruned = action_lengths.clone( ).fill_(action_lengths[0] - i) img_feats_var = raw_img_feats.view(1, -1, 3200) if '+q' in args.model_type: scores, hidden = model( img_feats_var, question_var, actions_in_var, action_lengths_pruned.cpu().numpy()) else: scores, hidden = model( img_feats_var, False, actions_in_var, action_lengths_pruned.cpu().numpy()) try: init_pos = eval_loader.dataset.episode_pos_queue[ -i] except: invalids.append([idx[0], i]) continue action_in = torch.LongTensor(1, 1).fill_( actions_in[0, action_lengths[0] - i]).cuda() else: init_pos = eval_loader.dataset.episode_pos_queue[ -i] hidden = model.nav_rnn.init_hidden(1) action_in = torch.LongTensor(1, 1).fill_(0).cuda() h3d.env.reset( x=init_pos[0], y=init_pos[2], yaw=init_pos[3]) init_dist_to_target = h3d.get_dist_to_target( h3d.env.cam.pos) if init_dist_to_target < 0: # unreachable invalids.append([idx[0], i]) continue img = h3d.env.render() img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224).cuda())).view( 1, 1, 3200) episode_length = 0 episode_done = True dists_to_target, pos_queue, actions = [ init_dist_to_target ], [init_pos], [] actual_pos_queue = [(h3d.env.cam.pos.x, h3d.env.cam.pos.z, h3d.env.cam.yaw)] for step in range(args.max_episode_length): episode_length += 1 if '+q' in args.model_type: scores, hidden = model( img_feat_var, question_var, Variable(action_in), False, hidden=hidden, step=True) else: scores, hidden = model( img_feat_var, False, Variable(action_in), False, hidden=hidden, step=True) prob = F.softmax(scores, dim=1) action = int(prob.max(1)[1].data.cpu().numpy()[0]) actions.append(action) img, _, episode_done = h3d.step(action) episode_done = episode_done or episode_length >= args.max_episode_length img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224) .cuda())).view(1, 1, 3200) action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() dists_to_target.append( h3d.get_dist_to_target(h3d.env.cam.pos)) pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.y, h3d.env.cam.pos.z, h3d.env.cam.yaw ]) if episode_done == True: break actual_pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.z, h3d.env.cam.yaw]) # compute stats metrics_slug['d_0_' + str(i)] = dists_to_target[0] metrics_slug['d_T_' + str(i)] = dists_to_target[-1] metrics_slug['d_D_' + str( i)] = dists_to_target[0] - dists_to_target[-1] metrics_slug['d_min_' + str(i)] = np.array( dists_to_target).min() metrics_slug['ep_len_' + str(i)] = episode_length if action == 3: metrics_slug['stop_' + str(i)] = 1 else: metrics_slug['stop_' + str(i)] = 0 inside_room = [] for p in pos_queue: inside_room.append( h3d.is_inside_room( p, eval_loader.dataset.target_room)) if inside_room[-1] == True: metrics_slug['r_T_' + str(i)] = 1 else: metrics_slug['r_T_' + str(i)] = 0 if any([x == True for x in inside_room]) == True: metrics_slug['r_e_' + str(i)] = 1 else: metrics_slug['r_e_' + str(i)] = 0 # collate and update metrics metrics_list = [] for i in metrics.metric_names: if i not in metrics_slug: metrics_list.append(metrics.metrics[ metrics.metric_names.index(i)][0]) else: metrics_list.append(metrics_slug[i]) # update metrics metrics.update(metrics_list) print(metrics.get_stat_string(mode=0)) print('invalids', len(invalids)) logging.info("EVAL: init_steps: {} metrics: {}".format(i, metrics.get_stat_string(mode=0))) logging.info("EVAL: init_steps: {} invalids: {}".format(i, len(invalids))) # del h3d eval_loader.dataset._load_envs() print("eval_loader pruned_env_set len: {}".format(len(eval_loader.dataset.pruned_env_set))) logging.info("eval_loader pruned_env_set len: {}".format(len(eval_loader.dataset.pruned_env_set))) assert len(eval_loader.dataset.pruned_env_set) > 0 if len(eval_loader.dataset.pruned_env_set) == 0: done = True elif 'pacman' in args.model_type: done = False while done == False: if args.overfit: metrics = NavMetric( info={'split': args.eval_split, 'thread': rank}, metric_names=[ 'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50', 'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30', 'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30', 'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10', 'ep_len_30', 'ep_len_50' ], log_json=args.output_log_path) for batch in tqdm(eval_loader): model.load_state_dict(shared_model.state_dict()) model.cuda() idx, question, answer, actions, action_length = batch metrics_slug = {} h3d = eval_loader.dataset.episode_house # evaluate at multiple initializations for i in [10, 30, 50]: t += 1 if i > action_length[0]: invalids.append([idx[0], i]) continue question_var = Variable(question.cuda()) controller_step = False planner_hidden = model.planner_nav_rnn.init_hidden(1) # get hierarchical action history ( planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feats, init_pos, controller_action_counter ) = eval_loader.dataset.get_hierarchical_features_till_spawn( actions[0, :action_length[0] + 1].numpy(), i, args.max_controller_actions ) planner_actions_in_var = Variable( planner_actions_in.cuda()) planner_img_feats_var = Variable( planner_img_feats.cuda()) # forward planner till spawn to update hidden state for step in range(planner_actions_in.size(0)): planner_scores, planner_hidden = model.planner_step( question_var, planner_img_feats_var[step] .unsqueeze(0).unsqueeze(0), planner_actions_in_var[step].view(1, 1), planner_hidden ) h3d.env.reset( x=init_pos[0], y=init_pos[2], yaw=init_pos[3]) init_dist_to_target = h3d.get_dist_to_target( h3d.env.cam.pos) if init_dist_to_target < 0: # unreachable invalids.append([idx[0], i]) continue dists_to_target, pos_queue, pred_actions = [ init_dist_to_target ], [init_pos], [] planner_actions, controller_actions = [], [] episode_length = 0 if args.max_controller_actions > 1: controller_action_counter = controller_action_counter % args.max_controller_actions controller_action_counter = max(controller_action_counter - 1, 0) else: controller_action_counter = 0 first_step = True first_step_is_controller = controller_step planner_step = True action = int(controller_action_in) for step in range(args.max_episode_length): if not first_step: img = torch.from_numpy(img.transpose( 2, 0, 1)).float() / 255.0 img_feat_var = eval_loader.dataset.cnn( Variable(img.view(1, 3, 224, 224).cuda())).view( 1, 1, 3200) else: img_feat_var = Variable(controller_img_feats.cuda()).view(1, 1, 3200) if not first_step or first_step_is_controller: # query controller to continue or not controller_action_in = Variable( torch.LongTensor(1, 1).fill_(action).cuda()) controller_scores = model.controller_step( img_feat_var, controller_action_in, planner_hidden[0]) prob = F.softmax(controller_scores, dim=1) controller_action = int( prob.max(1)[1].data.cpu().numpy()[0]) if controller_action == 1 and controller_action_counter < args.max_controller_actions - 1: controller_action_counter += 1 planner_step = False else: controller_action_counter = 0 planner_step = True controller_action = 0 controller_actions.append(controller_action) first_step = False if planner_step: if not first_step: action_in = torch.LongTensor( 1, 1).fill_(action + 1).cuda() planner_scores, planner_hidden = model.planner_step( question_var, img_feat_var, Variable(action_in), planner_hidden) prob = F.softmax(planner_scores, dim=1) action = int( prob.max(1)[1].data.cpu().numpy()[0]) planner_actions.append(action) episode_done = action == 3 or episode_length >= args.max_episode_length episode_length += 1 dists_to_target.append( h3d.get_dist_to_target(h3d.env.cam.pos)) pos_queue.append([ h3d.env.cam.pos.x, h3d.env.cam.pos.y, h3d.env.cam.pos.z, h3d.env.cam.yaw ]) if episode_done: break img, _, _ = h3d.step(action) first_step = False # compute stats metrics_slug['d_0_' + str(i)] = dists_to_target[0] metrics_slug['d_T_' + str(i)] = dists_to_target[-1] metrics_slug['d_D_' + str( i)] = dists_to_target[0] - dists_to_target[-1] metrics_slug['d_min_' + str(i)] = np.array( dists_to_target).min() metrics_slug['ep_len_' + str(i)] = episode_length if action == 3: metrics_slug['stop_' + str(i)] = 1 else: metrics_slug['stop_' + str(i)] = 0 inside_room = [] for p in pos_queue: inside_room.append( h3d.is_inside_room( p, eval_loader.dataset.target_room)) if inside_room[-1] == True: metrics_slug['r_T_' + str(i)] = 1 else: metrics_slug['r_T_' + str(i)] = 0 if any([x == True for x in inside_room]) == True: metrics_slug['r_e_' + str(i)] = 1 else: metrics_slug['r_e_' + str(i)] = 0 # collate and update metrics metrics_list = [] for i in metrics.metric_names: if i not in metrics_slug: metrics_list.append(metrics.metrics[ metrics.metric_names.index(i)][0]) else: metrics_list.append(metrics_slug[i]) # update metrics metrics.update(metrics_list) try: print(metrics.get_stat_string(mode=0)) logging.info("EVAL: metrics: {}".format(metrics.get_stat_string(mode=0))) except: pass print('epoch', epoch) print('invalids', len(invalids)) logging.info("EVAL: epoch {}".format(epoch)) logging.info("EVAL: invalids {}".format(invalids)) # del h3d eval_loader.dataset._load_envs() if len(eval_loader.dataset.pruned_env_set) == 0: done = True epoch += 1 # checkpoint if best val loss if metrics.metrics[8][0] > best_eval_acc: # d_D_50 best_eval_acc = metrics.metrics[8][0] if epoch % args.eval_every == 0 and args.log == True: metrics.dump_log() model_state = get_state(model) aad = dict(args.__dict__) ad = {} for i in aad: if i[0] != '_': ad[i] = aad[i] checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch} checkpoint_path = '%s/epoch_%d_d_D_50_%.04f.pt' % ( args.checkpoint_dir, epoch, best_eval_acc) print('Saving checkpoint to %s' % checkpoint_path) logging.info("EVAL: Saving checkpoint to {}".format(checkpoint_path)) torch.save(checkpoint, checkpoint_path) print('[best_eval_d_D_50:%.04f]' % best_eval_acc) logging.info("EVAL: [best_eval_d_D_50:{:.04f}]".format(best_eval_acc)) eval_loader.dataset._load_envs(start_idx=0, in_order=True) def train(rank, args, shared_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.model_type == 'cnn': model_kwargs = {} model = NavCnnModel(**model_kwargs) elif args.model_type == 'cnn+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnModel(**model_kwargs) elif args.model_type == 'lstm': model_kwargs = {} model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'lstm-mult+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnRnnMultModel(**model_kwargs) elif args.model_type == 'lstm+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} model = NavPlannerControllerModel(**model_kwargs) else: exit() lossFn = torch.nn.CrossEntropyLoss().cuda() optim = torch.optim.Adamax( filter(lambda p: p.requires_grad, shared_model.parameters()), lr=args.learning_rate) train_loader_kwargs = { 'questions_h5': args.train_h5, 'data_json': args.data_json, 'vocab': args.vocab_json, 'batch_size': args.batch_size, 'input_type': args.model_type, 'num_frames': 5, 'map_resolution': args.map_resolution, 'split': 'train', 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank % len(args.gpus)], 'to_cache': args.cache, 'overfit': args.overfit, 'max_controller_actions': args.max_controller_actions, 'max_actions': args.max_actions } args.output_log_path = os.path.join(args.log_dir, 'train_' + str(rank) + '.json') if 'pacman' in args.model_type: metrics = NavMetric( info={'split': 'train', 'thread': rank}, metric_names=['planner_loss', 'controller_loss'], log_json=args.output_log_path) else: metrics = NavMetric( info={'split': 'train', 'thread': rank}, metric_names=['loss'], log_json=args.output_log_path) train_loader = EqaDataLoader(**train_loader_kwargs) print('train_loader has %d samples' % len(train_loader.dataset)) logging.info('TRAIN: train loader has {} samples'.format(len(train_loader.dataset))) t, epoch = 0, 0 while epoch < int(args.max_epochs): if 'cnn' in args.model_type: done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() while done == False: for batch in train_loader: t += 1 model.load_state_dict(shared_model.state_dict()) model.train() model.cuda() idx, questions, _, img_feats, _, actions_out, _ = batch img_feats_var = Variable(img_feats.cuda()) if '+q' in args.model_type: questions_var = Variable(questions.cuda()) actions_out_var = Variable(actions_out.cuda()) if '+q' in args.model_type: scores = model(img_feats_var, questions_var) else: scores = model(img_feats_var) loss = lossFn(scores, actions_out_var) # zero grad optim.zero_grad() # update metrics metrics.update([loss.data[0]]) # backprop and update loss.backward() ensure_shared_grads(model.cpu(), shared_model) optim.step() if t % args.print_every == 0: print(metrics.get_stat_string()) logging.info("TRAIN: metrics: {}".format(metrics.get_stat_string())) if args.log == True: metrics.dump_log() print('[CHECK][Cache:%d][Total:%d]' % (len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) logging.info('TRAIN: [CHECK][Cache:{}][Total:{}]'.format( len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) if all_envs_loaded == False: train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True if args.cache == False: train_loader.dataset._load_envs( start_idx=0, in_order=True) else: done = True elif 'lstm' in args.model_type: lossFn = MaskedNLLCriterion().cuda() done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() total_times = [] while done == False: start_time = time.time() for batch in train_loader: t += 1 model.load_state_dict(shared_model.state_dict()) model.train() model.cuda() idx, questions, _, img_feats, actions_in, actions_out, action_lengths, masks = batch img_feats_var = Variable(img_feats.cuda()) if '+q' in args.model_type: questions_var = Variable(questions.cuda()) actions_in_var = Variable(actions_in.cuda()) actions_out_var = Variable(actions_out.cuda()) action_lengths = action_lengths.cuda() masks_var = Variable(masks.cuda()) action_lengths, perm_idx = action_lengths.sort( 0, descending=True) img_feats_var = img_feats_var[perm_idx] if '+q' in args.model_type: questions_var = questions_var[perm_idx] actions_in_var = actions_in_var[perm_idx] actions_out_var = actions_out_var[perm_idx] masks_var = masks_var[perm_idx] if '+q' in args.model_type: scores, hidden = model(img_feats_var, questions_var, actions_in_var, action_lengths.cpu().numpy()) else: scores, hidden = model(img_feats_var, False, actions_in_var, action_lengths.cpu().numpy()) #block out masks if args.curriculum: curriculum_length = (epoch+1)*5 for i, action_length in enumerate(action_lengths): if action_length - curriculum_length > 0: masks_var[i, :action_length-curriculum_length] = 0 logprob = F.log_softmax(scores, dim=1) loss = lossFn( logprob, actions_out_var[:, :action_lengths.max()] .contiguous().view(-1, 1), masks_var[:, :action_lengths.max()].contiguous().view( -1, 1)) # zero grad optim.zero_grad() # update metrics metrics.update([loss.data[0]]) logging.info("TRAIN LSTM loss: {:.6f}".format(loss.data[0])) # backprop and update loss.backward() ensure_shared_grads(model.cpu(), shared_model) optim.step() if t % args.print_every == 0: print(metrics.get_stat_string()) logging.info("TRAIN: metrics: {}".format(metrics.get_stat_string())) if args.log == True: metrics.dump_log() print('[CHECK][Cache:%d][Total:%d]' % (len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) logging.info('TRAIN: [CHECK][Cache:{}][Total:{}]'.format( len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) if all_envs_loaded == False: train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True if args.cache == False: train_loader.dataset._load_envs( start_idx=0, in_order=True) else: done = True elif 'pacman' in args.model_type: planner_lossFn = MaskedNLLCriterion().cuda() controller_lossFn = MaskedNLLCriterion().cuda() done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() while done == False: for batch in train_loader: t += 1 model.load_state_dict(shared_model.state_dict()) model.train() model.cuda() idx, questions, _, planner_img_feats, planner_actions_in, \ planner_actions_out, planner_action_lengths, planner_masks, \ controller_img_feats, controller_actions_in, planner_hidden_idx, \ controller_outs, controller_action_lengths, controller_masks = batch questions_var = Variable(questions.cuda()) planner_img_feats_var = Variable(planner_img_feats.cuda()) planner_actions_in_var = Variable( planner_actions_in.cuda()) planner_actions_out_var = Variable( planner_actions_out.cuda()) planner_action_lengths = planner_action_lengths.cuda() planner_masks_var = Variable(planner_masks.cuda()) controller_img_feats_var = Variable( controller_img_feats.cuda()) controller_actions_in_var = Variable( controller_actions_in.cuda()) planner_hidden_idx_var = Variable( planner_hidden_idx.cuda()) controller_outs_var = Variable(controller_outs.cuda()) controller_action_lengths = controller_action_lengths.cuda( ) controller_masks_var = Variable(controller_masks.cuda()) planner_action_lengths, perm_idx = planner_action_lengths.sort( 0, descending=True) questions_var = questions_var[perm_idx] planner_img_feats_var = planner_img_feats_var[perm_idx] planner_actions_in_var = planner_actions_in_var[perm_idx] planner_actions_out_var = planner_actions_out_var[perm_idx] planner_masks_var = planner_masks_var[perm_idx] controller_img_feats_var = controller_img_feats_var[ perm_idx] controller_actions_in_var = controller_actions_in_var[ perm_idx] controller_outs_var = controller_outs_var[perm_idx] planner_hidden_idx_var = planner_hidden_idx_var[perm_idx] controller_action_lengths = controller_action_lengths[ perm_idx] controller_masks_var = controller_masks_var[perm_idx] planner_scores, controller_scores, planner_hidden = model( questions_var, planner_img_feats_var, planner_actions_in_var, planner_action_lengths.cpu().numpy(), planner_hidden_idx_var, controller_img_feats_var, controller_actions_in_var, controller_action_lengths) planner_logprob = F.log_softmax(planner_scores, dim=1) controller_logprob = F.log_softmax( controller_scores, dim=1) planner_loss = planner_lossFn( planner_logprob, planner_actions_out_var[:, :planner_action_lengths.max( )].contiguous().view(-1, 1), planner_masks_var[:, :planner_action_lengths.max()] .contiguous().view(-1, 1)) controller_loss = controller_lossFn( controller_logprob, controller_outs_var[:, :controller_action_lengths.max( )].contiguous().view(-1, 1), controller_masks_var[:, :controller_action_lengths.max( )].contiguous().view(-1, 1)) # zero grad optim.zero_grad() # update metrics metrics.update( [planner_loss.data[0], controller_loss.data[0]]) logging.info("TRAINING PACMAN planner-loss: {:.6f} controller-loss: {:.6f}".format( planner_loss.data[0], controller_loss.data[0])) # backprop and update if args.max_controller_actions == 1: (planner_loss).backward() else: (planner_loss + controller_loss).backward() ensure_shared_grads(model.cpu(), shared_model) optim.step() if t % args.print_every == 0: print(metrics.get_stat_string()) logging.info("TRAIN: metrics: {}".format(metrics.get_stat_string())) if args.log == True: metrics.dump_log() print('[CHECK][Cache:%d][Total:%d]' % (len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) logging.info('TRAIN: [CHECK][Cache:{}][Total:{}]'.format( len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) if all_envs_loaded == False: train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True if args.cache == False: train_loader.dataset._load_envs( start_idx=0, in_order=True) else: done = True epoch += 1 if epoch % args.save_every == 0: model_state = get_state(model) optimizer_state = optim.state_dict() aad = dict(args.__dict__) ad = {} for i in aad: if i[0] != '_': ad[i] = aad[i] checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch, 'optimizer': optimizer_state} checkpoint_path = '%s/epoch_%d_thread_%d.pt' % ( args.checkpoint_dir, epoch, rank) print('Saving checkpoint to %s' % checkpoint_path) logging.info("TRAIN: Saving checkpoint to {}".format(checkpoint_path)) torch.save(checkpoint, checkpoint_path) if __name__ == '__main__': parser = argparse.ArgumentParser() # data params parser.add_argument('-train_h5', default='data/train.h5') parser.add_argument('-val_h5', default='data/val.h5') parser.add_argument('-test_h5', default='data/test.h5') parser.add_argument('-data_json', default='data/data.json') parser.add_argument('-vocab_json', default='data/vocab.json') parser.add_argument( '-target_obj_conn_map_dir', default='data/target-obj-conn-maps/500') parser.add_argument('-map_resolution', default=500, type=int) parser.add_argument( '-mode', default='train+eval', type=str, choices=['train', 'eval', 'train+eval']) parser.add_argument('-eval_split', default='val', type=str) # model details parser.add_argument( '-model_type', default='cnn', choices=['cnn', 'cnn+q', 'lstm', 'lstm+q', 'lstm-mult+q', 'pacman']) parser.add_argument('-max_episode_length', default=100, type=int) parser.add_argument('-curriculum', default=0, type=int) # optim params parser.add_argument('-batch_size', default=20, type=int) parser.add_argument('-learning_rate', default=1e-3, type=float) parser.add_argument('-max_epochs', default=1000, type=int) parser.add_argument('-overfit', default=False, action='store_true') # bookkeeping parser.add_argument('-print_every', default=5, type=int) parser.add_argument('-eval_every', default=1, type=int) parser.add_argument('-save_every', default=1000, type=int) #optional if you would like to save specific epochs as opposed to relying on the eval thread parser.add_argument('-identifier', default='cnn') parser.add_argument('-num_processes', default=1, type=int) parser.add_argument('-max_threads_per_gpu', default=10, type=int) # checkpointing parser.add_argument('-checkpoint_path', default=False) parser.add_argument('-checkpoint_dir', default='checkpoints/nav/') parser.add_argument('-log_dir', default='logs/nav/') parser.add_argument('-log', default=False, action='store_true') parser.add_argument('-cache', default=False, action='store_true') parser.add_argument('-max_controller_actions', type=int, default=5) parser.add_argument('-max_actions', type=int) args = parser.parse_args() args.time_id = time.strftime("%m_%d_%H:%M") #MAX_CONTROLLER_ACTIONS = args.max_controller_actions if not os.path.isdir(args.log_dir): os.makedirs(args.log_dir) if args.curriculum: assert 'lstm' in args.model_type #TODO: Finish implementing curriculum for other model types logging.basicConfig(filename=os.path.join(args.log_dir, "run_{}.log".format( str(datetime.now()).replace(' ', '_'))), level=logging.INFO, format='%(asctime)-15s %(message)s') try: args.gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') args.gpus = [int(x) for x in args.gpus] except KeyError: print("CPU not supported") logging.info("CPU not supported") exit() if args.checkpoint_path != False: print('Loading checkpoint from %s' % args.checkpoint_path) logging.info("Loading checkpoint from {}".format(args.checkpoint_path)) args_to_keep = ['model_type'] checkpoint = torch.load(args.checkpoint_path, map_location={ 'cuda:0': 'cpu' }) for i in args.__dict__: if i not in args_to_keep: checkpoint['args'][i] = args.__dict__[i] args = type('new_dict', (object, ), checkpoint['args']) args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.time_id + '_' + args.identifier) args.log_dir = os.path.join(args.log_dir, args.time_id + '_' + args.identifier) # if set to overfit; set eval_split to train if args.overfit == True: args.eval_split = 'train' print(args.__dict__) logging.info(args.__dict__) if not os.path.exists(args.checkpoint_dir): os.makedirs(args.checkpoint_dir) os.makedirs(args.log_dir) if args.model_type == 'cnn': model_kwargs = {} shared_model = NavCnnModel(**model_kwargs) elif args.model_type == 'cnn+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } shared_model = NavCnnModel(**model_kwargs) elif args.model_type == 'lstm': model_kwargs = {} shared_model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'lstm+q': model_kwargs = { 'question_input': True, 'question_vocab': load_vocab(args.vocab_json) } shared_model = NavCnnRnnModel(**model_kwargs) elif args.model_type == 'pacman': model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} shared_model = NavPlannerControllerModel(**model_kwargs) else: exit() shared_model.share_memory() if args.checkpoint_path != False: print('Loading params from checkpoint: %s' % args.checkpoint_path) logging.info("Loading params from checkpoint: {}".format(args.checkpoint_path)) shared_model.load_state_dict(checkpoint['state']) if args.mode == 'eval': eval(0, args, shared_model) elif args.mode == 'train': if args.num_processes > 1: processes = [] for rank in range(0, args.num_processes): # for rank in range(0, args.num_processes): p = mp.Process(target=train, args=(rank, args, shared_model)) p.start() processes.append(p) for p in processes: p.join() else: train(0, args, shared_model) else: processes = [] # Start the eval thread p = mp.Process(target=eval, args=(0, args, shared_model)) p.start() processes.append(p) # Start the training thread(s) for rank in range(1, args.num_processes + 1): # for rank in range(0, args.num_processes): p = mp.Process(target=train, args=(rank, args, shared_model)) p.start() processes.append(p) for p in processes: p.join() ================================================ FILE: training/train_vqa.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import h5py import time import argparse import numpy as np import os, sys, json import torch from torch.autograd import Variable torch.backends.cudnn.enabled = False import torch.multiprocessing as mp from models import VqaLstmModel, VqaLstmCnnAttentionModel from data import EqaDataset, EqaDataLoader from metrics import VqaMetric from models import get_state, repackage_hidden, ensure_shared_grads from data import load_vocab import pdb def eval(rank, args, shared_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.input_type == 'ques': model_kwargs = {'vocab': load_vocab(args.vocab_json)} model = VqaLstmModel(**model_kwargs) elif args.input_type == 'ques,image': model_kwargs = {'vocab': load_vocab(args.vocab_json)} model = VqaLstmCnnAttentionModel(**model_kwargs) lossFn = torch.nn.CrossEntropyLoss().cuda() eval_loader_kwargs = { 'questions_h5': getattr(args, args.eval_split + '_h5'), 'data_json': args.data_json, 'vocab': args.vocab_json, 'batch_size': 1, 'input_type': args.input_type, 'num_frames': args.num_frames, 'split': args.eval_split, 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank%len(args.gpus)], 'to_cache': args.cache } eval_loader = EqaDataLoader(**eval_loader_kwargs) print('eval_loader has %d samples' % len(eval_loader.dataset)) args.output_log_path = os.path.join(args.log_dir, 'eval_' + str(rank) + '.json') t, epoch, best_eval_acc = 0, 0, 0 while epoch < int(args.max_epochs): model.load_state_dict(shared_model.state_dict()) model.eval() metrics = VqaMetric( info={'split': args.eval_split}, metric_names=[ 'loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank' ], log_json=args.output_log_path) if args.input_type == 'ques': for batch in eval_loader: t += 1 model.cuda() idx, questions, answers = batch questions_var = Variable(questions.cuda()) answers_var = Variable(answers.cuda()) scores = model(questions_var) loss = lossFn(scores, answers_var) # update metrics accuracy, ranks = metrics.compute_ranks( scores.data.cpu(), answers) metrics.update([loss.data[0], accuracy, ranks, 1.0 / ranks]) print(metrics.get_stat_string(mode=0)) elif args.input_type == 'ques,image': done = False all_envs_loaded = eval_loader.dataset._check_if_all_envs_loaded() while done == False: for batch in eval_loader: t += 1 model.cuda() idx, questions, answers, images, _, _, _ = batch questions_var = Variable(questions.cuda()) answers_var = Variable(answers.cuda()) images_var = Variable(images.cuda()) scores, att_probs = model(images_var, questions_var) loss = lossFn(scores, answers_var) # update metrics accuracy, ranks = metrics.compute_ranks( scores.data.cpu(), answers) metrics.update( [loss.data[0], accuracy, ranks, 1.0 / ranks]) print(metrics.get_stat_string(mode=0)) if all_envs_loaded == False: eval_loader.dataset._load_envs() if len(eval_loader.dataset.pruned_env_set) == 0: done = True else: done = True epoch += 1 # checkpoint if best val accuracy if metrics.metrics[1][0] > best_eval_acc: best_eval_acc = metrics.metrics[1][0] if epoch % args.eval_every == 0 and args.log == True: metrics.dump_log() model_state = get_state(model) if args.checkpoint_path != False: ad = checkpoint['args'] else: ad = args.__dict__ checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch} checkpoint_path = '%s/epoch_%d_accuracy_%.04f.pt' % ( args.checkpoint_dir, epoch, best_eval_acc) print('Saving checkpoint to %s' % checkpoint_path) torch.save(checkpoint, checkpoint_path) print('[best_eval_accuracy:%.04f]' % best_eval_acc) def train(rank, args, shared_model): torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)])) if args.input_type == 'ques': model_kwargs = {'vocab': load_vocab(args.vocab_json)} model = VqaLstmModel(**model_kwargs) elif args.input_type == 'ques,image': model_kwargs = {'vocab': load_vocab(args.vocab_json)} model = VqaLstmCnnAttentionModel(**model_kwargs) lossFn = torch.nn.CrossEntropyLoss().cuda() optim = torch.optim.Adam( filter(lambda p: p.requires_grad, shared_model.parameters()), lr=args.learning_rate) train_loader_kwargs = { 'questions_h5': args.train_h5, 'data_json': args.data_json, 'vocab': args.vocab_json, 'batch_size': args.batch_size, 'input_type': args.input_type, 'num_frames': args.num_frames, 'split': 'train', 'max_threads_per_gpu': args.max_threads_per_gpu, 'gpu_id': args.gpus[rank%len(args.gpus)], 'to_cache': args.cache } args.output_log_path = os.path.join(args.log_dir, 'train_' + str(rank) + '.json') metrics = VqaMetric( info={'split': 'train', 'thread': rank}, metric_names=['loss', 'accuracy', 'mean_rank', 'mean_reciprocal_rank'], log_json=args.output_log_path) train_loader = EqaDataLoader(**train_loader_kwargs) if args.input_type == 'ques,image': train_loader.dataset._load_envs(start_idx=0, in_order=True) print('train_loader has %d samples' % len(train_loader.dataset)) t, epoch = 0, 0 while epoch < int(args.max_epochs): if args.input_type == 'ques': for batch in train_loader: t += 1 model.load_state_dict(shared_model.state_dict()) model.train() model.cuda() idx, questions, answers = batch questions_var = Variable(questions.cuda()) answers_var = Variable(answers.cuda()) scores = model(questions_var) loss = lossFn(scores, answers_var) # zero grad optim.zero_grad() # update metrics accuracy, ranks = metrics.compute_ranks(scores.data.cpu(), answers) metrics.update([loss.data[0], accuracy, ranks, 1.0 / ranks]) # backprop and update loss.backward() ensure_shared_grads(model.cpu(), shared_model) optim.step() if t % args.print_every == 0: print(metrics.get_stat_string()) if args.log == True: metrics.dump_log() elif args.input_type == 'ques,image': done = False all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded() while done == False: for batch in train_loader: t += 1 model.load_state_dict(shared_model.state_dict()) model.train() model.cnn.eval() model.cuda() idx, questions, answers, images, _, _, _ = batch questions_var = Variable(questions.cuda()) answers_var = Variable(answers.cuda()) images_var = Variable(images.cuda()) scores, att_probs = model(images_var, questions_var) loss = lossFn(scores, answers_var) # zero grad optim.zero_grad() # update metrics accuracy, ranks = metrics.compute_ranks(scores.data.cpu(), answers) metrics.update([loss.data[0], accuracy, ranks, 1.0 / ranks]) # backprop and update loss.backward() ensure_shared_grads(model.cpu(), shared_model) optim.step() if t % args.print_every == 0: print(metrics.get_stat_string()) if args.log == True: metrics.dump_log() if all_envs_loaded == False: print('[CHECK][Cache:%d][Total:%d]' % (len(train_loader.dataset.img_data_cache), len(train_loader.dataset.env_list))) train_loader.dataset._load_envs(in_order=True) if len(train_loader.dataset.pruned_env_set) == 0: done = True else: done = True epoch += 1 if __name__ == '__main__': parser = argparse.ArgumentParser() # data params parser.add_argument('-train_h5', default='data/train.h5') parser.add_argument('-val_h5', default='data/val.h5') parser.add_argument('-test_h5', default='data/test.h5') parser.add_argument('-data_json', default='data/data.json') parser.add_argument('-vocab_json', default='data/vocab.json') parser.add_argument('-train_cache_path', default=False) parser.add_argument('-val_cache_path', default=False) parser.add_argument('-mode', default='train', type=str, choices=['train','eval']) parser.add_argument('-eval_split', default='val', type=str) # model details parser.add_argument( '-input_type', default='ques,image', choices=['ques', 'ques,image']) parser.add_argument( '-num_frames', default=5, type=int) # -1 = all frames of navigation sequence # optim params parser.add_argument('-batch_size', default=20, type=int) parser.add_argument('-learning_rate', default=3e-4, type=float) parser.add_argument('-max_epochs', default=1000, type=int) # bookkeeping parser.add_argument('-print_every', default=50, type=int) parser.add_argument('-eval_every', default=1, type=int) parser.add_argument('-identifier', default='q-only') parser.add_argument('-num_processes', default=1, type=int) parser.add_argument('-max_threads_per_gpu', default=10, type=int) # checkpointing parser.add_argument('-checkpoint_path', default=False) parser.add_argument('-checkpoint_dir', default='checkpoints/vqa/') parser.add_argument('-log_dir', default='logs/vqa/') parser.add_argument('-log', default=False, action='store_true') parser.add_argument('-cache', default=False, action='store_true') args = parser.parse_args() args.time_id = time.strftime("%m_%d_%H:%M") try: args.gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',') args.gpus = [int(x) for x in args.gpus] except KeyError: print("CPU not supported") exit() if args.checkpoint_path != False: print('Loading checkpoint from %s' % args.checkpoint_path) args_to_keep = ['input_type', 'num_frames'] checkpoint = torch.load(args.checkpoint_path, map_location={'cuda:0': 'cpu'}) for i in args.__dict__: if i not in args_to_keep: checkpoint['args'][i] = args.__dict__[i] args = type('new_dict', (object, ), checkpoint['args']) args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.time_id + '_' + args.identifier) args.log_dir = os.path.join(args.log_dir, args.time_id + '_' + args.identifier) print(args.__dict__) if not os.path.exists(args.checkpoint_dir) and args.log == True: os.makedirs(args.checkpoint_dir) os.makedirs(args.log_dir) if args.input_type == 'ques': model_kwargs = {'vocab': load_vocab(args.vocab_json)} shared_model = VqaLstmModel(**model_kwargs) elif args.input_type == 'ques,image': model_kwargs = {'vocab': load_vocab(args.vocab_json)} shared_model = VqaLstmCnnAttentionModel(**model_kwargs) if args.checkpoint_path != False: print('Loading params from checkpoint: %s' % args.checkpoint_path) shared_model.load_state_dict(checkpoint['state']) shared_model.share_memory() if args.mode == 'eval': eval(0, args, shared_model) else: processes = [] # Start the eval thread p = mp.Process(target=eval, args=(0, args, shared_model)) p.start() processes.append(p) # Start the training thread(s) for rank in range(1, args.num_processes + 1): p = mp.Process(target=train, args=(rank, args, shared_model)) p.start() processes.append(p) for p in processes: p.join() ================================================ FILE: training/utils/preprocess_questions.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # adapted from https://github.com/facebookresearch/clevr-iep/blob/master/iep/preprocess.py import h5py import argparse import numpy as np from tqdm import tqdm import os, sys, json, random import pdb """ Tokenize a sequence, converting a string seq into a list of (string) tokens by splitting on the specified delimiter. Optionally add start and end tokens. """ def tokenize(seq, delim=' ', punctToRemove=None, addStartToken=True, addEndToken=True): if punctToRemove is not None: for p in punctToRemove: seq = str(seq).replace(p, '') tokens = str(seq).split(delim) if addStartToken: tokens.insert(0, '') if addEndToken: tokens.append('') return tokens def buildVocab(sequences, minTokenCount=1, delim=' ', punctToRemove=None, addSpecialTok=False): SPECIAL_TOKENS = { '': 0, '': 1, '': 2, '': 3, } tokenToCount = {} for seq in sequences: seqTokens = tokenize( seq, delim=delim, punctToRemove=punctToRemove, addStartToken=False, addEndToken=False) for token in seqTokens: if token not in tokenToCount: tokenToCount[token] = 0 tokenToCount[token] += 1 tokenToIdx = {} if addSpecialTok == True: for token, idx in SPECIAL_TOKENS.items(): tokenToIdx[token] = idx for token, count in sorted(tokenToCount.items()): if count >= minTokenCount: tokenToIdx[token] = len(tokenToIdx) return tokenToIdx def encode(seqTokens, tokenToIdx, allowUnk=False): seqIdx = [] for token in seqTokens: if token not in tokenToIdx: if allowUnk: token = '' else: raise KeyError('Token "%s" not in vocab' % token) seqIdx.append(tokenToIdx[token]) return seqIdx def decode(seqIdx, idxToToken, delim=None, stopAtEnd=True): tokens = [] for idx in seqIdx: tokens.append(idxToToken[idx]) if stopAtEnd and tokens[-1] == '': break if delim is None: return tokens else: return delim.join(tokens) def preprocessImages(obj, render_dir=False): working_dir = os.path.join(render_dir, 'working') path_id = obj['path_id'] image_paths = [] for i in range(len(obj['pos_queue']) - 1): image_paths.append('%s/%s_%05d.jpg' % (working_dir, path_id, i + 1)) image_frames = [] for i in image_paths: if os.path.isfile(i) == False: print(i) return False img = imread(i, mode='RGB') img = imresize(img, (224, 224), interp='bicubic') img = img.transpose(2, 0, 1) img = img / 255.0 image_frames.append(img) # TODO: mean subtraction return image_frames def processActions(actions): # from shortest-path-gen format # 0: forward # 1: left # 2: right # 3: stop # # to # 0: null # 1: start # 2: forward # 3: left # 4: right # 5: stop # for model training action_translations = {0: 2, 1: 3, 2: 4, 3: 5} action_ids = [1] for i in actions: action_ids.append(action_translations[i]) return action_ids if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-input_json', required=True) parser.add_argument('-input_vocab', default=None) parser.add_argument('-output_train_h5', required=True) parser.add_argument('-output_val_h5', required=True) parser.add_argument('-output_test_h5', required=True) parser.add_argument('-output_data_json', required=True) parser.add_argument('-output_vocab', default=None) parser.add_argument('-num_ques', default=10000000, type=int) parser.add_argument('-shortest_path_dir', required=True, type=str) args = parser.parse_args() random.seed(123) np.random.seed(123) assert args.input_vocab != None or args.output_vocab != None, "Either input or output vocab required" data = json.load(open(args.input_json, 'r')) houses = data['questions'] questions = [] for h in tqdm(houses): print(h, len(houses[h])) for q in houses[h]: if len(str(q['answer']).split(' ')) > 1: q['answer'] = '_'.join(q['answer'].split(' ')) questions.append(q) print('Total questions: ', len(questions)) # build vocab if no vocab file provided if args.input_vocab == None: answerTokenToIdx = buildVocab((str(q['answer']) for q in questions if q['answer'] != 'NIL')) questionTokenToIdx = buildVocab( (q['question'] for q in questions if q['answer'] != 'NIL'), punctToRemove=['?'], addSpecialTok=True) vocab = { 'questionTokenToIdx': questionTokenToIdx, 'answerTokenToIdx': answerTokenToIdx, } else: vocab = json.load(open(args.input_vocab, 'r')) if args.output_vocab != None: json.dump(vocab, open(args.output_vocab, 'w')) # encode questions idx, encoded_questions, question_types, answers, action_labels, action_lengths, pos_queue, envs, boxes = [], [], [], [], [], [], [], [], [] for i, q in tqdm(enumerate(questions[:args.num_ques])): if os.path.exists( os.path.join(args.shortest_path_dir, q['house'] + '_' + str(q['id']) + '.json')) == False: continue nav = json.load( open( os.path.join(args.shortest_path_dir, q['house'] + '_' + str(q['id']) + '.json'), 'r')) idx.append(q['id']) questionTokens = tokenize( q['question'], punctToRemove=['?'], addStartToken=False) encoded_question = encode(questionTokens, vocab['questionTokenToIdx']) encoded_questions.append(encoded_question) question_types.append(q['type']) answers.append(vocab['answerTokenToIdx'][str(q['answer'])]) # if there are 3 positions, there will be 2 actions + actions = nav['actions'] positions = nav['positions'] action_labels.append(processActions(actions)) action_lengths.append(len(actions)) pos_queue.append(positions) boxes.append(q['bbox']) envs.append(q['house']) args.num_ques = len(idx) maxALength = max(action_lengths) + 1 action_labels_mat = np.zeros( (len(questions[:args.num_ques]), maxALength), dtype=np.int16) action_labels_mat.fill(0) # 0 = null for i in tqdm(range(len(questions[:args.num_ques]))): for j in range(len(action_labels[i])): action_labels_mat[i][j] = action_labels[i][j] # pad encoded questions maxQLength = max(len(x) for x in encoded_questions) for qe in encoded_questions: while len(qe) < maxQLength: qe.append(vocab['questionTokenToIdx']['']) # make train/test splits inds = list(range(0, len(idx))) random.shuffle(inds) train_envs = data['splits']['train'] val_envs = data['splits']['val'] test_envs = data['splits']['test'] assert any([x in train_envs for x in test_envs]) == False assert any([x in train_envs for x in val_envs]) == False train_inds = [i for i in inds if envs[i] in train_envs] val_inds = [i for i in inds if envs[i] in val_envs] test_inds = [i for i in inds if envs[i] in test_envs] # TRAIN train_idx = [idx[i] for i in train_inds] train_encoded_questions = [encoded_questions[i] for i in train_inds] train_question_types = [question_types[i] for i in train_inds] train_answers = [answers[i] for i in train_inds] train_envs = [envs[i] for i in train_inds] train_pos_queue = [pos_queue[i] for i in train_inds] train_boxes = [boxes[i] for i in train_inds] train_action_labels = action_labels_mat[train_inds] train_action_lengths = [action_lengths[i] for i in train_inds] # VAL val_idx = [idx[i] for i in val_inds] val_encoded_questions = [encoded_questions[i] for i in val_inds] val_question_types = [question_types[i] for i in val_inds] val_answers = [answers[i] for i in val_inds] val_envs = [envs[i] for i in val_inds] val_pos_queue = [pos_queue[i] for i in val_inds] val_boxes = [boxes[i] for i in val_inds] val_action_labels = action_labels_mat[val_inds] val_action_lengths = [action_lengths[i] for i in val_inds] # TEST test_idx = [idx[i] for i in test_inds] test_encoded_questions = [encoded_questions[i] for i in test_inds] test_question_types = [question_types[i] for i in test_inds] test_answers = [answers[i] for i in test_inds] test_envs = [envs[i] for i in test_inds] test_pos_queue = [pos_queue[i] for i in test_inds] test_boxes = [boxes[i] for i in test_inds] test_action_labels = action_labels_mat[test_inds] test_action_lengths = [action_lengths[i] for i in test_inds] # parse envs all_envs = list(set(envs)) train_env_idx = [all_envs.index(x) for x in train_envs] val_env_idx = [all_envs.index(x) for x in val_envs] test_env_idx = [all_envs.index(x) for x in test_envs] # write h5 files print('Writing hdf5') train_encoded_questions = np.asarray( train_encoded_questions, dtype=np.int16) print('Train', train_encoded_questions.shape) with h5py.File(args.output_train_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(train_idx)) f.create_dataset('questions', data=train_encoded_questions) f.create_dataset('answers', data=np.asarray(train_answers)) f.create_dataset( 'action_labels', data=np.asarray(train_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(train_action_lengths), dtype=np.int16) val_encoded_questions = np.asarray(val_encoded_questions, dtype=np.int16) print('Val', val_encoded_questions.shape) with h5py.File(args.output_val_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(val_idx)) f.create_dataset('questions', data=val_encoded_questions) f.create_dataset('answers', data=np.asarray(val_answers)) f.create_dataset( 'action_labels', data=np.asarray(val_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(val_action_lengths), dtype=np.int16) test_encoded_questions = np.asarray(test_encoded_questions, dtype=np.int16) print('Test', test_encoded_questions.shape) with h5py.File(args.output_test_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(test_idx)) f.create_dataset('questions', data=test_encoded_questions) f.create_dataset('answers', data=np.asarray(test_answers)) f.create_dataset( 'action_labels', data=np.asarray(test_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(test_action_lengths), dtype=np.int16) json.dump({ 'envs': all_envs, 'train_env_idx': train_env_idx, 'val_env_idx': val_env_idx, 'test_env_idx': test_env_idx, 'train_pos_queue': train_pos_queue, 'val_pos_queue': val_pos_queue, 'test_pos_queue': test_pos_queue, 'train_boxes': train_boxes, 'val_boxes': val_boxes, 'test_boxes': test_boxes }, open(args.output_data_json, 'w')) ================================================ FILE: training/utils/preprocess_questions_pkl.py ================================================ # adapted from https://github.com/facebookresearch/clevr-iep/blob/master/iep/preprocess.py import h5py import argparse import numpy as np #from tqdm import tqdm import os, sys, json, random import pickle as pkl import pdb """ Tokenize a sequence, converting a string seq into a list of (string) tokens by splitting on the specified delimiter. Optionally add start and end tokens. """ def tokenize(seq, delim=' ', punctToRemove=None, addStartToken=True, addEndToken=True): if punctToRemove is not None: for p in punctToRemove: seq = str(seq).replace(p, '') tokens = str(seq).split(delim) if addStartToken: tokens.insert(0, '') if addEndToken: tokens.append('') return tokens def buildVocab(sequences, minTokenCount=1, delim=' ', punctToRemove=None, addSpecialTok=False): SPECIAL_TOKENS = { '': 0, '': 1, '': 2, '': 3, } tokenToCount = {} for seq in sequences: seqTokens = tokenize( seq, delim=delim, punctToRemove=punctToRemove, addStartToken=False, addEndToken=False) for token in seqTokens: if token not in tokenToCount: tokenToCount[token] = 0 tokenToCount[token] += 1 tokenToIdx = {} if addSpecialTok == True: for token, idx in SPECIAL_TOKENS.items(): tokenToIdx[token] = idx for token, count in sorted(tokenToCount.items()): if count >= minTokenCount: tokenToIdx[token] = len(tokenToIdx) return tokenToIdx def encode(seqTokens, tokenToIdx, allowUnk=False): seqIdx = [] for token in seqTokens: if token not in tokenToIdx: if allowUnk: token = '' else: raise KeyError('Token "%s" not in vocab' % token) seqIdx.append(tokenToIdx[token]) return seqIdx def decode(seqIdx, idxToToken, delim=None, stopAtEnd=True): tokens = [] for idx in seqIdx: tokens.append(idxToToken[idx]) if stopAtEnd and tokens[-1] == '': break if delim is None: return tokens else: return delim.join(tokens) def preprocessImages(obj, render_dir=False): working_dir = os.path.join(render_dir, 'working') path_id = obj['path_id'] image_paths = [] for i in range(len(obj['pos_queue']) - 1): image_paths.append('%s/%s_%05d.jpg' % (working_dir, path_id, i + 1)) image_frames = [] for i in image_paths: if os.path.isfile(i) == False: print(i) return False img = imread(i, mode='RGB') img = imresize(img, (224, 224), interp='bicubic') img = img.transpose(2, 0, 1) img = img / 255.0 image_frames.append(img) # TODO: mean subtraction return image_frames def processActions(actions): # from shortest-path-gen format # 0: forward # 1: left # 2: right # 3: stop # # to # 0: null # 1: start # 2: forward # 3: left # 4: right # 5: stop # for model training action_translations = {0: 2, 1: 3, 2: 4, 3: 5} action_ids = [1] for i in actions: action_ids.append(action_translations[i]) return action_ids if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-input_json', required=True) parser.add_argument('-input_vocab', default=None) parser.add_argument('-output_train_h5', required=True) parser.add_argument('-output_val_h5', required=True) parser.add_argument('-output_test_h5', required=True) parser.add_argument('-output_data_json', required=True) parser.add_argument('-output_vocab', default=None) parser.add_argument('-num_ques', default=10000000, type=int) parser.add_argument('-shortest_path_dir', required=True, type=str) args = parser.parse_args() random.seed(123) np.random.seed(123) assert args.input_vocab != None or args.output_vocab != None, "Either input or output vocab required" data = json.load(open(args.input_json, 'r')) houses = data['questions'] questions = [] for h in houses: print(h, len(houses[h])) for q in houses[h]: if len(str(q['answer']).split(' ')) > 1: q['answer'] = '_'.join(q['answer'].split(' ')) questions.append(q) print('Total questions: ', len(questions)) # build vocab if no vocab file provided if args.input_vocab == None: answerTokenToIdx = buildVocab((str(q['answer']) for q in questions if q['answer'] != 'NIL')) questionTokenToIdx = buildVocab( (q['question'] for q in questions if q['answer'] != 'NIL'), punctToRemove=['?'], addSpecialTok=True) vocab = { 'questionTokenToIdx': questionTokenToIdx, 'answerTokenToIdx': answerTokenToIdx, } else: vocab = json.load(open(args.input_vocab, 'r')) if args.output_vocab != None: json.dump(vocab, open(args.output_vocab, 'w')) # encode questions idx, encoded_questions, question_types, answers, action_labels, action_lengths, pos_queue, envs, boxes = [], [], [], [], [], [], [], [], [] for i, q in enumerate(questions[:args.num_ques]): if os.path.exists( os.path.join(args.shortest_path_dir, q['house'] + '_' + str(q['id']) + '.pkl')) == False: continue try: nav = pkl.load( open( os.path.join(args.shortest_path_dir, q['house'] + '_' + str(q['id']) + '.pkl'), 'rb')) except: continue idx.append(q['id']) questionTokens = tokenize( q['question'], punctToRemove=['?'], addStartToken=False) encoded_question = encode(questionTokens, vocab['questionTokenToIdx']) encoded_questions.append(encoded_question) question_types.append(q['type']) answers.append(vocab['answerTokenToIdx'][str(q['answer'])]) # if there are 3 positions, there will be 2 actions + actions = nav['actions'] positions = nav['positions'] action_labels.append(processActions(actions)) action_lengths.append(len(actions)) pos_queue.append(positions) boxes.append(q['bbox']) envs.append(q['house']) assert q['question'] == nav['question'] args.num_ques = len(idx) maxALength = max(action_lengths) + 1 action_labels_mat = np.zeros( (len(questions[:args.num_ques]), maxALength), dtype=np.int16) action_labels_mat.fill(0) # 0 = null for i in range(len(questions[:args.num_ques])): for j in range(len(action_labels[i])): action_labels_mat[i][j] = action_labels[i][j] # pad encoded questions maxQLength = max(len(x) for x in encoded_questions) for qe in encoded_questions: while len(qe) < maxQLength: qe.append(vocab['questionTokenToIdx']['']) # make train/test splits inds = list(range(0, len(idx))) random.shuffle(inds) train_envs = data['splits']['train'] val_envs = data['splits']['val'] test_envs = data['splits']['test'] assert any([x in train_envs for x in test_envs]) == False assert any([x in train_envs for x in val_envs]) == False train_inds = [i for i in inds if envs[i] in train_envs] val_inds = [i for i in inds if envs[i] in val_envs] test_inds = [i for i in inds if envs[i] in test_envs] # TRAIN train_idx = [idx[i] for i in train_inds] train_encoded_questions = [encoded_questions[i] for i in train_inds] train_question_types = [question_types[i] for i in train_inds] train_answers = [answers[i] for i in train_inds] train_envs = [envs[i] for i in train_inds] train_pos_queue = [pos_queue[i] for i in train_inds] train_boxes = [boxes[i] for i in train_inds] train_action_labels = action_labels_mat[train_inds] train_action_lengths = [action_lengths[i] for i in train_inds] # VAL val_idx = [idx[i] for i in val_inds] val_encoded_questions = [encoded_questions[i] for i in val_inds] val_question_types = [question_types[i] for i in val_inds] val_answers = [answers[i] for i in val_inds] val_envs = [envs[i] for i in val_inds] val_pos_queue = [pos_queue[i] for i in val_inds] val_boxes = [boxes[i] for i in val_inds] val_action_labels = action_labels_mat[val_inds] val_action_lengths = [action_lengths[i] for i in val_inds] # TEST test_idx = [idx[i] for i in test_inds] test_encoded_questions = [encoded_questions[i] for i in test_inds] test_question_types = [question_types[i] for i in test_inds] test_answers = [answers[i] for i in test_inds] test_envs = [envs[i] for i in test_inds] test_pos_queue = [pos_queue[i] for i in test_inds] test_boxes = [boxes[i] for i in test_inds] test_action_labels = action_labels_mat[test_inds] test_action_lengths = [action_lengths[i] for i in test_inds] # parse envs all_envs = list(set(envs)) train_env_idx = [all_envs.index(x) for x in train_envs] val_env_idx = [all_envs.index(x) for x in val_envs] test_env_idx = [all_envs.index(x) for x in test_envs] # write h5 files print('Writing hdf5') train_encoded_questions = np.asarray( train_encoded_questions, dtype=np.int16) print('Train', train_encoded_questions.shape) with h5py.File(args.output_train_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(train_idx)) f.create_dataset('questions', data=train_encoded_questions) f.create_dataset('answers', data=np.asarray(train_answers)) f.create_dataset( 'action_labels', data=np.asarray(train_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(train_action_lengths), dtype=np.int16) val_encoded_questions = np.asarray(val_encoded_questions, dtype=np.int16) print('Val', val_encoded_questions.shape) with h5py.File(args.output_val_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(val_idx)) f.create_dataset('questions', data=val_encoded_questions) f.create_dataset('answers', data=np.asarray(val_answers)) f.create_dataset( 'action_labels', data=np.asarray(val_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(val_action_lengths), dtype=np.int16) test_encoded_questions = np.asarray(test_encoded_questions, dtype=np.int16) print('Test', test_encoded_questions.shape) with h5py.File(args.output_test_h5, 'w') as f: f.create_dataset('idx', data=np.asarray(test_idx)) f.create_dataset('questions', data=test_encoded_questions) f.create_dataset('answers', data=np.asarray(test_answers)) f.create_dataset( 'action_labels', data=np.asarray(test_action_labels), dtype=np.int16) f.create_dataset( 'action_lengths', data=np.asarray(test_action_lengths), dtype=np.int16) json.dump({ 'envs': all_envs, 'train_env_idx': train_env_idx, 'val_env_idx': val_env_idx, 'test_env_idx': test_env_idx, 'train_pos_queue': train_pos_queue, 'val_pos_queue': val_pos_queue, 'test_pos_queue': test_pos_queue, 'train_boxes': train_boxes, 'val_boxes': val_boxes, 'test_boxes': test_boxes }, open(args.output_data_json, 'w')) ================================================ FILE: utils/house3d.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import cv2 import csv import copy import os, sys import itertools import numpy as np from tqdm import tqdm from House3D.objrender import Vec3 import pdb class House3DUtils(): def __init__( self, env, rotation_sensitivity=9, move_sensitivity=0.5, build_graph=False, graph_dir='/path/to/3d-graphs', target_obj_conn_map_dir='/path/to/target_obj_connmaps', debug=True, load_semantic_classes=True, collision_reward=0.0, success_reward=1.0, dist_reward_scale=0.005, seeing_rwd=False): self.env = env self.debug = debug self.rotation_sensitivity = rotation_sensitivity self.move_sensitivity = move_sensitivity self.angles = [x for x in range(-180, 180, self.rotation_sensitivity)] self.angle_strings = {1: 'right', -1: 'left'} self.dirs, self.angle_map = self.calibrate_steps(reset=True) self.move_multiplier = self.move_sensitivity / np.array([np.abs(x).sum() for x in self.dirs]).mean() self.graph_dir = graph_dir self.graph = None self.target_obj_conn_map_dir = target_obj_conn_map_dir if build_graph == True: if os.path.exists( os.path.join(graph_dir, self.env.house.house['id'] + '.pkl')): self.load_graph( os.path.join(graph_dir, self.env.house.house['id'] + '.pkl')) else: self.build_graph( save_path=os.path.join( graph_dir, self.env.house.house['id'] + '.pkl')) self.rooms, self.objects = self._parse() self.collision_reward = collision_reward self.success_reward = success_reward self.dist_reward_scale = dist_reward_scale self.seeing_rwd = seeing_rwd if load_semantic_classes == True: self._load_semantic_classes() # Shortest paths are computed in 1000 x 1000 grid coordinates. # One step in the SUNCG continuous coordinate system however, can be # multiple grids in the grid coordinate system (since turns aren't 90 deg). # So even though the grid shortest path is fine-grained, # an equivalent best-fit path in SUNCG continuous coordinates # has to be computed by simulating steps. Sucks, but yeah. # # For now, we first explicitly calibrate how many steps in the gridworld # correspond to one step in continuous world, across all directions def calibrate_steps(self, reset=True): mults, angle_map = [], {} cx, cy = self.env.house.to_coor(50, 50) if reset == True: self.env.reset(x=cx, y=cy) for i in range(len(self.angles)): yaw = self.angles[i] self.env.cam.yaw = yaw self.env.cam.updateDirection() x1, y1 = self.env.house.to_grid(self.env.cam.pos.x, self.env.cam.pos.z) pos = self.env.cam.pos pos = pos + self.env.cam.front * self.move_sensitivity x2, y2 = self.env.house.to_grid(pos.x, pos.z) mult = np.array([x2, y2]) - np.array([x1, y1]) mult = (mult[0], mult[1]) angle_map[mult] = yaw mults.append(mult) return mults, angle_map # 0: forward # 1: left # 2: right # 3: stop # # returns observation, reward, done, info def step(self, action, step_reward=False): if action not in [0, 1, 2, 3]: raise IndexError if step_reward == True: pos = self.env.cam.pos x1, y1 = self.env.house.to_grid(self.env.cam.pos.x, self.env.cam.pos.z) init_target_dist = self.env.house.connMap[x1, y1] reward = 0 done = False if action == 0: mv = self.env.move_forward( dist_fwd=self.move_sensitivity, dist_hor=0) obs = self.env.render() if mv == False: # collision reward -= self.collision_reward elif mv != False and step_reward == True: # evaluate connMap dist here x2, y2 = self.env.house.to_grid(self.env.cam.pos.x, self.env.cam.pos.z) final_target_dist = self.env.house.connMap[x2, y2] reward += self.dist_reward_scale * ((init_target_dist - final_target_dist) / np.abs( self.dirs[self.angles.index(self.env.cam.yaw % 180)]).sum()) elif action == 1: self.env.rotate(-self.rotation_sensitivity) obs = self.env.render() elif action == 2: self.env.rotate(self.rotation_sensitivity) obs = self.env.render() elif action == 3: done = True obs = self.env.render() return obs, reward, done # pos: [x, y, z, yaw], or objrender.Vec3 def get_dist_to_target(self, pos): if isinstance(pos, Vec3) == True: x, y = self.env.house.to_grid(pos.x, pos.z) else: x, y = self.env.house.to_grid(pos[0], pos[2]) dist = self.env.house.connMap[x, y] return self.move_multiplier * dist def is_inside_room(self, pos, room): if isinstance(pos, Vec3) == True: x = pos.x y = pos.z else: x = pos[0] y = pos[2] if x >= room['bbox']['min'][0] and x <= room['bbox']['max'][0] and \ y >= room['bbox']['min'][2] and y <= room['bbox']['max'][2]: return True return False # takes 200-300 seconds(!) when rotation_sensitivity == 9 def build_graph(self, save_path=None): import time start_time = time.time() collide_res = self.env.house.n_row from dijkstar import Graph visit = dict() self.graph = Graph() self.mock_obs_map = np.zeros( (collide_res + 1, collide_res + 1), dtype=np.uint8) self.mock_obs_map[np.where(self.env.house.connMap == -1)] = 1 for x in range(collide_res + 1): for y in range(collide_res + 1): pos = (x, y) if self.env.house.canMove(x, y) and pos not in visit: que = [pos] visit[pos] = True ptr = 0 while ptr < len(que): cx, cy = que[ptr] ptr += 1 # add all angles for (cx, cy) here # connect first and last for ang in range(len(self.angles) - 1): self.graph.add_edge((cx, cy, self.angles[ang]), (cx, cy, self.angles[ang + 1]), { 'cost': 1 }) self.graph.add_edge((cx, cy, self.angles[ang + 1]), (cx, cy, self.angles[ang]), { 'cost': 1 }) self.graph.add_edge((cx, cy, self.angles[-1]), (cx, cy, self.angles[0]), { 'cost': 1 }) self.graph.add_edge((cx, cy, self.angles[0]), (cx, cy, self.angles[-1]), { 'cost': 1 }) for deti in range(len(self.dirs)): det = self.dirs[deti] tx, ty = cx + det[0], cy + det[1] if (self.env.house.inside(tx, ty) and self.mock_obs_map[min(cx, tx):max(cx, tx)+1, min(cy, ty):max(cy, ty)+1].sum() == 0): # make changes here to add edges for angle increments as well # # cost = 1 from one angle to the next, # and connect first and last # this would be for different angles for same tx, ty # # then there would be connections for same angle # and from (cx, cy) to (tx, ty) self.graph.add_edge( (cx, cy, self.angle_map[self.dirs[deti]]), (tx, ty, self.angle_map[self.dirs[deti]]), { 'cost': 1 }) tp = (tx, ty) if tp not in visit: visit[tp] = True que.append(tp) if self.debug == True: print("--- %s seconds to build the graph ---" % (time.time() - start_time)) if save_path != None: start_time = time.time() print("saving graph to %s" % (save_path)) self.graph.dump(save_path) if self.debug == True: print("--- %s seconds to save the graph ---" % (time.time() - start_time)) def load_graph(self, path): import time start_time = time.time() from dijkstar import Graph self.graph = Graph() self.graph.load(path) if self.debug == True: print("--- %s seconds to load the graph ---" % (time.time() - start_time)) # takes 1-5 seconds when rotation_sensitivity == 9 def compute_shortest_path(self, source, target, graph=None): from dijkstar import find_path if graph == None: if self.graph == None: if os.path.exists( os.path.join(self.graph_dir, self.env.house.house['id'] + '.pkl')): self.load_graph( os.path.join(self.graph_dir, self.env.house.house['id'] + '.pkl')) else: self.build_graph( save_path=os.path.join( graph_dir, self.env.house.house['id'] + '.pkl')) graph = self.graph cost_func = lambda u, v, e, prev_e: e['cost'] shortest_path = find_path(graph, source, target, cost_func=cost_func) return shortest_path def fit_grid_path_to_suncg(self, nodes, init_yaw=None, back_skip=2): # don't mess with the originals nodes = copy.deepcopy(nodes) # set initial position x, y = self.env.house.to_coor(nodes[0][0], nodes[0][1], True) x, y = x.astype(np.float32).item(), y.astype(np.float32).item() self.env.cam.pos.x, self.env.cam.pos.y, self.env.cam.pos.z = x, self.env.house.robotHei, y if init_yaw == None: self.env.cam.yaw = np.random.choice(self.angles) else: self.env.cam.yaw = init_yaw self.env.cam.updateDirection() pos_queue, action_queue = [], [] current_pos = self._vec_to_array(self.env.cam.pos, self.env.cam.yaw) pos_queue = pos_queue + [current_pos] ptr = 0 while ptr < len(nodes) - 1: turned = False # target rotation target_yaw = self.angle_map[tuple( np.array(nodes[ptr]) - np.array(nodes[ptr + 1]))] # turn if target_yaw != current_pos[3]: p_q, a_q = self.get_rotate_steps(current_pos, target_yaw) pos_queue = pos_queue + p_q action_queue = action_queue + a_q self.env.cam.yaw = target_yaw self.env.cam.updateDirection() turned = True current_pos = self._vec_to_array(self.env.cam.pos, self.env.cam.yaw) # move cx, cz = self.env.house.to_coor(nodes[ptr + 1][0], nodes[ptr + 1][1], True) # if collision, find another sub-path, and delete that edge if self.env.move(cx, cz) == False: if nodes[ptr + 1] in self.graph[nodes[ptr]]: del self.graph[nodes[ptr]][nodes[ptr + 1]] print('deleted', nodes[ptr], nodes[ptr + 1]) # delete the turns if turned == True: pos_queue = pos_queue[:-len(p_q)] action_queue = action_queue[:-len(a_q)] if back_skip != 0: pos_queue = pos_queue[:-back_skip] action_queue = action_queue[:-back_skip] dest_ptr = ptr + 1 ptr = ptr - back_skip sub_shortest_path = self.compute_shortest_path( nodes[ptr], nodes[dest_ptr]) nodes = nodes[:ptr] + sub_shortest_path.nodes + nodes[dest_ptr + 1:] current_pos = pos_queue[-1] else: # this is the new position the agent moved to current_pos = self._vec_to_array(self.env.cam.pos, self.env.cam.yaw) assert current_pos[3] == pos_queue[-1][3] and ( current_pos[0] != pos_queue[-1][0] or current_pos[2] != pos_queue[-1][2]) pos_queue = pos_queue + [current_pos] action_queue = action_queue + ['fwd'] ptr = ptr + 1 action_queue.append('stop') return pos_queue, action_queue # pos contains [x, y, z, yaw] # given a position and target yaw, this function # computes actions needed to turn there def get_rotate_steps(self, pos, target_yaw): direction = np.random.choice([1, -1]) cur_yaw = pos[-1] ptr = self.angles.index(cur_yaw) pos_queue, action_queue = [], [] while cur_yaw != target_yaw: if len(pos_queue) == len(self.angles) // 2: # reset direction = direction * -1 cur_yaw = pos[-1] ptr = self.angles.index(cur_yaw) pos_queue, action_queue = [], [] ptr = (ptr + direction) % len(self.angles) cur_yaw = self.angles[ptr] pos_queue.append([pos[0], pos[1], pos[2], self.angles[ptr]]) action_queue.append(self.angle_strings[direction]) return pos_queue, action_queue def _vec_to_array(self, pos, yaw): return [pos.x, pos.y, pos.z, yaw] # render images from camera position queue def render_images_from_pos_queue(self, pos_queue=[], img_dir='tmp/images', actions=None, values=None, rewards=None): if len(pos_queue) == 0: return False action_map = {0: 'FRWD', 1: 'LEFT', 2: 'RGHT', 3: 'STOP'} import scipy.misc sgx, sgy = self.env.house.to_grid(pos_queue[0][0], pos_queue[0][2]) tgx, tgy = self.env.house.to_grid(pos_queue[-1][0], pos_queue[-1][2]) for i in range(len(pos_queue)): # set position p = pos_queue[i] self.env.reset(x=p[0], y=p[2], yaw=p[3]) # save image image = np.array(self.env.render(), copy=False) # put some text text = "[%02d]" % (i + 1) if actions != None and i < len(actions): text += "[%s]" % action_map[actions[i]] if values != None and i < len(values): text += "[V%.03f]" % values[i] if rewards != None and i > 0 and i <= len(rewards): text += "[R%.03f]" % rewards[i - 1] image = cv2.putText( img=np.copy(image), text=text, org=(20, 30), fontFace=3, fontScale=0.4, color=(255, 255, 255), thickness=1) scipy.misc.toimage(image).save( '%s/%s_%04d_%04d_%04d_%04d_%05d_%05d.jpg' % (img_dir, self.env.house.house['id'], sgx, sgy, tgx, tgy, i + 1, len(pos_queue))) return True # render video from camera position queue # # NOTE: call `render_images_from_pos_queue` before calling this def render_video_from_pos_queue(self, pos_queue=[], img_dir='tmp/images', vid_dir='tmp/videos', fps=[5], tag_name='piano'): if len(pos_queue) == 0: return False import subprocess sgx, sgy = self.env.house.to_grid(pos_queue[0][0], pos_queue[0][2]) tgx, tgy = self.env.house.to_grid(pos_queue[-1][0], pos_queue[-1][2]) for fp in fps: subprocess.Popen([ '/srv/share/abhshkdz/local/bin/ffmpeg', '-f', 'image2', '-r', str(fp), '-i', '%s/%s_%04d_%04d_%04d_%04d' % (img_dir, self.env.house.house['id'], sgx, sgy, tgx, tgy) + '_%05d_' + '%05d.jpg' % (len(pos_queue)), '-vcodec', 'libx264', '-crf', '25', '-y', '%s/%s_%04d_%04d_%s_%04d_%04d_%d.mp4' % (vid_dir, self.env.house.house['id'], sgx, sgy, tag_name, tgx, tgy, fp) ]) if self.debug == True: print('Rendered video to ' + '%s/%s_%04d_%04d_%s_%04d_%04d_%d.mp4' % (vid_dir, self.env.house.house['id'], sgx, sgy, tag_name, tgx, tgy, fp)) return True # Go over all nodes of house environment and accumulate objects room-wise. def _parse(self, levelsToExplore=[0]): rooms, objects = [], {} data = self.env.house.house modelCategoryMapping = {} import csv csvFile = csv.reader(open(self.env.house.metaDataFile, 'r')) headers = next(csvFile) for row in csvFile: modelCategoryMapping[row[headers.index('model_id')]] = { headers[x]: row[x] for x in range(2, len(headers)) # 0 is index, 1 is model_id } for i in levelsToExplore: for j in range(len(data['levels'][i]['nodes'])): assert data['levels'][i]['nodes'][j]['type'] != 'Box' if 'valid' in data['levels'][i]['nodes'][j]: assert data['levels'][i]['nodes'][j]['valid'] == 1 # Rooms if data['levels'][i]['nodes'][j]['type'] == 'Room': if 'roomTypes' not in data['levels'][i]['nodes'][j]: continue # Can rooms have more than one type? # Yes, they can; just found ['Living_Room', 'Dining_Room', 'Kitchen'] # assert len(data['levels'][i]['nodes'][j]['roomTypes']) <= 3 roomType = [ # ' '.join(x.lower().split('_')) x.lower() for x in data['levels'][i]['nodes'][j]['roomTypes'] ] nodes = data['levels'][i]['nodes'][j][ 'nodeIndices'] if 'nodeIndices' in data['levels'][i][ 'nodes'][j] else [] rooms.append({ 'type': roomType, 'bbox': data['levels'][i]['nodes'][j]['bbox'], 'nodes': nodes, 'model_id': data['levels'][i]['nodes'][j]['modelId'] }) # Objects elif data['levels'][i]['nodes'][j]['type'] == 'Object': if 'materials' not in data['levels'][i]['nodes'][j]: material = [] else: material = data['levels'][i]['nodes'][j]['materials'] objects[data['levels'][i]['nodes'][j]['id']] = { 'id': data['levels'][i]['nodes'][j]['id'], 'model_id': data['levels'][i]['nodes'][j]['modelId'], 'fine_class': modelCategoryMapping[data['levels'][i]['nodes'][j][ 'modelId']]['fine_grained_class'], 'coarse_class': modelCategoryMapping[data['levels'][i]['nodes'][j][ 'modelId']]['coarse_grained_class'], 'bbox': data['levels'][i]['nodes'][j]['bbox'], 'mat': material } return rooms, objects # Spawn at a randomly selected point in a particular room def spawn_room(self, room=None): if room == None: return False, None target_room = '_'.join(room.lower().split(' ')) if self.env.house.hasRoomType(target_room) == False: return False, None rooms = self.env.house._getRooms(target_room) room = np.random.choice(rooms) gx1, gy1, gx2, gy2 = self.env.house._getRoomBounds(room) available_coords = [] for x in range(gx1, gx2 + 1): for y in range(gy1, gy2 + 1): if self.env.house.moveMap[x, y] > 0: available_coords.append((x, y)) # print(available_coords) spawn_coord_idx = np.random.choice(len(available_coords)) spawn_coord = available_coords[spawn_coord_idx] return spawn_coord, room # Spawn close to an object # If room given, look for object within room def spawn_object(self, obj=None, room=None): if object == None: return False, None if isinstance(obj, list) == False: obj = [obj] is_door = False if 'door' in obj: is_door = True target_obj = ['_'.join(x.lower().split(' ')) for x in obj] if room != None: if 'nodeIndices' in room: objs = [ self.objects['0_' + str(x)] for x in room['nodeIndices'] if self.objects['0_' + str(x)]['fine_class'] in target_obj ] else: objs = [ self.objects['0_' + str(x)] for x in room['nodes'] if self.objects['0_' + str(x)]['fine_class'] in target_obj ] else: obj_id_list = list( itertools.chain.from_iterable( [x['nodes'] for x in self.rooms if x['type'] != []])) objs = [ self.objects['0_' + str(x)] for x in obj_id_list if self.objects['0_' + str(x)]['fine_class'] in target_obj ] if len(objs) == 0: return False, None, None obj_idx = np.random.choice(len(objs)) obj = objs[obj_idx] self.target_obj_class = obj['fine_class'].lower() gx1, gy1, gx2, gy2 = self.env.house._getRoomBounds(obj) if room == None: obj_node_idx = int(obj['id'][2:]) room = [ x for x in self.env.house.all_rooms if 'nodeIndices' in x and obj_node_idx in x['nodeIndices'] ][0] self.set_target_object(obj, room) available_x, available_y = np.where(self.env.house.connMap == 0) if len(available_x) == 0: return False, None, None spawn_coords = [] for i in range(len(available_x)): spawn_coords.append((available_x[i], available_y[i])) return spawn_coords, obj, room # analogous to `setTargetRoom` in the House3D API def set_target_object(self, obj, room): object_tp = room['id'] + '_' + obj['id'] + '_' + obj['fine_class'].lower( ) # Caching if object_tp in self.env.house.connMapDict: self.env.house.connMap, self.env.house.connectedCoors, self.env.house.inroomDist, self.env.house.maxConnDist = self.env.house.connMapDict[ object_tp] return True # object changed! elif os.path.exists( os.path.join( self.target_obj_conn_map_dir, self.env.house.house['id'] + '_' + object_tp + '.npy')): self.env.house.connMap = np.load( os.path.join( self.target_obj_conn_map_dir, self.env.house.house['id'] + '_' + object_tp + '.npy')) if self.env.house.connMap.shape[0] == self.env.house.n_row+1: self.env.house.connectedCoors, self.env.house.inroomDist, self.env.house.maxConnDist = None, None, None return True self.env.house.connMap = connMap = np.ones( (self.env.house.n_row + 1, self.env.house.n_row + 1), dtype=np.int32) * -1 self.env.house.inroomDist = inroomDist = np.ones( (self.env.house.n_row + 1, self.env.house.n_row + 1), dtype=np.float32) * -1 dirs = [[0, 1], [1, 0], [-1, 0], [0, -1]] que = [] flag_find_open_components = True _ox1, _, _oy1 = obj['bbox']['min'] _ox2, _, _oy2 = obj['bbox']['max'] ocx, ocy = (_ox1 + _ox2) / 2, (_oy1 + _oy2) / 2 ox1, oy1, ox2, oy2 = self.env.house.rescale(_ox1, _oy1, _ox2, _oy2) for _ in range(2): _x1, _, _y1 = room['bbox']['min'] _x2, _, _y2 = room['bbox']['max'] cx, cy = (_x1 + _x2) / 2, (_y1 + _y2) / 2 x1, y1, x2, y2 = self.env.house.rescale(_x1, _y1, _x2, _y2) curr_components = self.env.house._find_components( x1, y1, x2, y2, dirs=dirs, return_open=flag_find_open_components ) # find all the open components if len(curr_components) == 0: print('No space found! =(') raise ValueError('no space') if isinstance(curr_components[0], list): # join all the coors in the open components curr_major_coors = list(itertools.chain(*curr_components)) else: curr_major_coors = curr_components min_dist_to_center, min_dist_to_edge = 1e50, 1e50 for x, y in curr_major_coors: ### # Compute minimum dist to edge here if x in range(ox1, ox2): dx = 0 elif x < ox1: dx = ox1 - x else: dx = x - ox2 if y in range(oy1, oy2): dy = 0 elif y < oy1: dy = oy1 - y else: dy = y - oy2 assert dx >= 0 and dy >= 0 if dx != 0 or dy != 0: dd = np.sqrt(dx**2 + dy**2) elif dx == 0: dd = dy else: dd = dx if dd < min_dist_to_edge: min_dist_to_edge = int(np.ceil(dd)) ### tx, ty = self.env.house.to_coor(x, y) tdist = np.sqrt((tx - ocx)**2 + (ty - ocy)**2) if tdist < min_dist_to_center: min_dist_to_center = tdist inroomDist[x, y] = tdist margin = min_dist_to_edge + 1 for x, y in curr_major_coors: inroomDist[x, y] -= min_dist_to_center for x, y in curr_major_coors: if x in range(ox1 - margin, ox2 + margin) and y in range( oy1 - margin, oy2 + margin): connMap[x, y] = 0 que.append((x, y)) if len(que) > 0: break if flag_find_open_components: flag_find_open_components = False else: break raise ValueError ptr = 0 self.env.house.maxConnDist = 1 while ptr < len(que): x, y = que[ptr] cur_dist = connMap[x, y] ptr += 1 for dx, dy in dirs: tx, ty = x + dx, y + dy if self.env.house.inside(tx, ty) and self.env.house.canMove( tx, ty) and not self.env.house.isConnect(tx, ty): que.append((tx, ty)) connMap[tx, ty] = cur_dist + 1 if cur_dist + 1 > self.env.house.maxConnDist: self.env.house.maxConnDist = cur_dist + 1 self.env.house.connMapDict[object_tp] = (connMap, que, inroomDist, self.env.house.maxConnDist) np.save( os.path.join( self.target_obj_conn_map_dir, self.env.house.house['id'] + '_' + object_tp + '.npy'), connMap) self.connectedCoors = que print(' >>>> ConnMap Cached!') return True # room changed! def _load_semantic_classes(self, color_file=None): if color_file == None: color_file = self.env.config['colorFile'] self.semantic_classes = {} with open(color_file) as csv_file: reader = csv.DictReader(csv_file) for row in reader: c = np.array((row['r'], row['g'], row['b']), dtype=np.uint8) fine_cat = row['name'].lower() self.semantic_classes[fine_cat] = c return self.semantic_classes def _get_best_yaw_obj_from_pos(self, obj_id, grid_pos, height=1.0): obj = self.objects[obj_id] obj_fine_class = obj['fine_class'] cx, cy = self.env.house.to_coor(grid_pos[0], grid_pos[1]) self.env.cam.pos.x = cx self.env.cam.pos.y = height self.env.cam.pos.z = cy best_yaw, best_coverage = None, 0 for yaw in self.angles: self.env.cam.yaw = yaw self.env.cam.updateDirection() seg = self.env.render(mode='semantic') c = self.semantic_classes[obj_fine_class.lower()] mask = np.all(seg == c, axis=2) coverage = np.sum(mask) / (seg.shape[0] * seg.shape[1]) if best_yaw == None: best_yaw = yaw best_coverage = coverage else: if coverage > best_coverage: best_yaw = yaw best_coverage = coverage return best_yaw, best_coverage def _get_best_view_obj(self, obj, coverage_thres=0.5, dist_add=0.5, robot_height=False): bbox = obj['bbox'] obj_fine_class = obj['fine_class'] obj_max = np.asarray(bbox['max']) obj_min = np.asarray(bbox['min']) obj_center = (obj_min + obj_max) / 2 c_x, c_y, c_z = obj_center max_radius = np.sqrt( (obj_max[0] - obj_min[0]) * (obj_max[0] - obj_min[0]) + (obj_max[2] - obj_min[2]) * (obj_max[2] - obj_min[2])) / 2.0 max_radius += dist_add best_pos = None best_coverage = 0 returned_pos_cov = [] for yaw in self.angles: pos = [ c_x - max_radius * np.cos(yaw * (2 * np.pi) / 360.0), c_y, c_z - max_radius * np.sin(yaw * (2 * np.pi) / 360.0), yaw ] if robot_height == True: pos[1] = min(max(0.75, c_y), 2.00) self.env.cam.pos.x = pos[0] self.env.cam.pos.y = pos[1] self.env.cam.pos.z = pos[2] self.env.cam.yaw = pos[3] self.env.cam.updateDirection() seg = self.env.render(mode='semantic') c = self.semantic_classes[obj_fine_class.lower()] mask = np.all(seg == c, axis=2) coverage = np.sum(mask) / (seg.shape[0] * seg.shape[1]) returned_pos_cov.append([pos, coverage]) if coverage > coverage_thres: return pos, coverage, returned_pos_cov elif coverage > best_coverage: best_coverage = coverage best_pos = pos return best_pos, best_coverage, returned_pos_cov ================================================ FILE: utils/make_houses.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import json import argparse import subprocess import shlex import os import multiprocessing parser = argparse.ArgumentParser( description='Create obj+mtl files for the houses in the dataset.') parser.add_argument('-eqa_path', help='/path/to/eqa.json', required=True) parser.add_argument( '-suncg_toolbox_path', help='/path/to/SUNCGtoolbox', required=True) parser.add_argument( '-suncg_data_path', help='/path/to/suncg/data_root', required=True) parser.add_argument( '-num_processes', help='number of threads to use', default=multiprocessing.cpu_count()) args = parser.parse_args() eqa_data = json.load(open(args.eqa_path, 'r')) houses = list(eqa_data['questions'].keys()) start_dir = os.getcwd() def extract_threaded(house): os.chdir(os.path.join(args.suncg_data_path, 'house', house)) subprocess.call( shlex.split('%s house.json house.obj' % (os.path.join( args.suncg_toolbox_path, 'gaps', 'bin', 'x86_64', 'scn2scn'), ))) print('extracted', house) pool = multiprocessing.Pool(args.num_processes) pool.map(extract_threaded, houses)