Repository: rohitgirdhar/AttentionalPoolingAction Branch: master Commit: 9ab0acd9360f Files: 98 Total size: 622.1 KB Directory structure: gitextract_4chdi2im/ ├── .gitignore ├── LICENSE ├── README.md ├── experiments/ │ ├── 001_MPII_ResNet.yaml │ ├── 001_MPII_ResNet_pretrained.yaml │ ├── 002_MPII_ResNet_withAttention.yaml │ ├── 002_MPII_ResNet_withAttention_pretrained.yaml │ ├── 003_MPII_ResNet_withPoseAttention.yaml │ ├── 003_MPII_ResNet_withPoseAttention_pretrained.yaml │ └── 004_MPII_ResNet_withAttention_train+val.yaml ├── models/ │ ├── .github/ │ │ └── ISSUE_TEMPLATE.md │ ├── .gitignore │ ├── .gitmodules │ ├── LICENSE │ └── slim/ │ ├── __init__.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── cifar10.py │ │ ├── dataset_factory.py │ │ ├── dataset_utils.py │ │ ├── download_and_convert_cifar10.py │ │ ├── download_and_convert_flowers.py │ │ ├── download_and_convert_mnist.py │ │ ├── flowers.py │ │ ├── imagenet.py │ │ └── mnist.py │ ├── deployment/ │ │ ├── __init__.py │ │ ├── model_deploy.py │ │ └── model_deploy_test.py │ ├── nets/ │ │ ├── __init__.py │ │ ├── alexnet.py │ │ ├── alexnet_test.py │ │ ├── cifarnet.py │ │ ├── inception.py │ │ ├── inception_resnet_v2.py │ │ ├── inception_resnet_v2_test.py │ │ ├── inception_utils.py │ │ ├── inception_v1.py │ │ ├── inception_v1_test.py │ │ ├── inception_v2.py │ │ ├── inception_v2_test.py │ │ ├── inception_v2_tsn.py │ │ ├── inception_v3.py │ │ ├── inception_v3_test.py │ │ ├── inception_v4.py │ │ ├── inception_v4_test.py │ │ ├── lenet.py │ │ ├── nets_factory.py │ │ ├── nets_factory_test.py │ │ ├── overfeat.py │ │ ├── overfeat_test.py │ │ ├── resnet_utils.py │ │ ├── resnet_v1.py │ │ ├── resnet_v1_test.py │ │ ├── resnet_v2.py │ │ ├── resnet_v2_test.py │ │ ├── vgg.py │ │ └── vgg_test.py │ └── preprocessing/ │ ├── __init__.py │ ├── cifarnet_preprocessing.py │ ├── inception_preprocessing.py │ ├── lenet_preprocessing.py │ ├── preprocessing_factory.py │ └── vgg_preprocessing.py ├── src/ │ ├── config.py │ ├── custom_ops/ │ │ ├── Makefile 
│ │ ├── __init__.py │ │ ├── custom_ops_factory.py │ │ ├── pose_to_heatmap.cc │ │ ├── pose_utils.hpp │ │ ├── render_objects.cc │ │ ├── render_pose.cc │ │ ├── test/ │ │ │ ├── pose_to_heatmap_op_test.py │ │ │ ├── render_objects_op_test.py │ │ │ └── zero_out_channels_op_test.py │ │ └── zero_out_channels.cc │ ├── datasets/ │ │ ├── __init__.py │ │ ├── charades.py │ │ ├── dataset_factory.py │ │ ├── dataset_utils.py │ │ ├── hico.py │ │ ├── hmdb51.py │ │ ├── image_read_utils.py │ │ ├── jhmdb21.py │ │ ├── mpii.py │ │ └── video_data_utils.py │ ├── eval/ │ │ ├── __init__.py │ │ ├── cap_eval_utils.py │ │ └── utils.py │ ├── eval.py │ ├── loss.py │ ├── preprocess_pipeline.py │ ├── restore/ │ │ ├── __init__.py │ │ ├── model_restorer.py │ │ └── var_name_mapper.py │ └── train.py └── utils/ ├── convert_mpii_result_for_eval.m ├── convert_mpii_result_for_eval.sh └── dataset_utils/ └── gen_tfrecord_mpii.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc *.swp *.so *.jpg *.html \#*\# .\#* *~ *.h5 src/expt_outputs/* src/data/* # Custom stuff webpages/002_VisAtt2/hmdb_frames webpages/002_VisAtt2/linAtt webpages/002_VisAtt2/poseAtt ================================================ FILE: LICENSE ================================================ Copyright (c) 2017 Rohit Girdhar and Deva Ramanan. All rights reserved. This code is copyrighted by the authors and Carnegie Mellon University, and is for non-commercial research purposes only. Please contact the authors and Carnegie Mellon University if you are interested in licensing for commercial purposes. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Attentional Pooling for Action Recognition [[project page](https://rohitgirdhar.github.io/AttentionalPoolingAction/)] [[paper](https://arxiv.org/abs/1711.01467)] If this code helps with your work/research, please consider citing Rohit Girdhar and Deva Ramanan. **Attentional Pooling for Action Recognition**. Advances in Neural Information Processing Systems (NIPS), 2017. ```txt @inproceedings{Girdhar_17b_AttentionalPoolingAction, title = {Attentional Pooling for Action Recognition}, author = {Girdhar, Rohit and Ramanan, Deva}, booktitle = {NIPS}, year = 2017 } ``` ## Pre-requisites This code was trained and tested with 1. CentOS 6.5 2. Python 2.7 3. 
TensorFlow 1.1.0-rc2 ([6a1825e2](https://github.com/tensorflow/tensorflow/tree/6a1825e2369d2537e15dc585705c53c4b763f3f6)) ## Getting started Clone the code and create some directories for outputs ```bash $ git clone --recursive https://github.com/rohitgirdhar/AttentionalPoolingAction.git $ export ROOT=`pwd`/AttentionalPoolingAction $ cd $ROOT/src/ $ mkdir -p expt_outputs data $ # compile some custom ops $ cd custom_ops; make; cd .. ``` ## Data setup You can download the `tfrecord` files for MPII I used from [here](https://cmu.box.com/shared/static/xb7esevyl6uzmra2eehnkbt2ud7awld9.tar) and uncompress them onto a fast local disk. If you want to create your own tfrecords, you can use the following steps, which is what I used to create the linked tfrecord files. Convert the MPII data into tfrecords. The system can also read from individual JPEG files, but that needs a slightly different initial setup. First download the MPII [images](http://datasets.d2.mpi-inf.mpg.de/andriluka14cvpr/mpii_human_pose_v1.tar.gz) and [annotations](http://datasets.d2.mpi-inf.mpg.de/andriluka14cvpr/mpii_human_pose_v1_u12_2.zip), and un-compress the files. ```bash $ cd $ROOT/utils/dataset_utils $ # Set the paths for MPII images and annotations file in gen_tfrecord_mpii.py $ python gen_tfrecord_mpii.py # Will generate the tfrecord files ``` ### Keypoint labels for other datasets While the MPII dataset comes with pose labels, I also experiment with HMDB-51 and HICO, for which pose was computed using an initial version of [OpenPose](https://github.com/CMU-Perceptual-Computing-Lab/openpose). I provide the extracted keypoints here: [HMDB51](https://cmu.box.com/shared/static/gt8lhpafu7zwexf1wdwwmsufoktg94rg.tar) and [HICO](https://cmu.box.com/shared/static/42xizpt0w3almdgwczjxawvc1pvpesoa.tar). ## Testing pre-trained models First download and unzip the [pretrained models](https://cmu.box.com/shared/static/s72scgtjj3lm60hsufi25rfjs2dk3a7i.zip) to `$ROOT/src/pretrained_models/`. 
The models can be run with ```bash # Baseline model (no attention) $ python eval.py --cfg ../experiments/001_MPII_ResNet_pretrained.yaml # With attention $ python eval.py --cfg ../experiments/002_MPII_ResNet_withAttention_pretrained.yaml # With pose regularized attention $ python eval.py --cfg ../experiments/003_MPII_ResNet_withPoseAttention_pretrained.yaml ``` ### Expected performance on MPII Validation set | Method | mAP | Accuracy | |--------|-----|------| | Baseline (no attention) | 26.2 | 33.5 | | With attention | 30.3 | 37.2 | | With pose regularized attention | 30.6 | 37.8 | ## Training Train an attentional pooled model on the MPII dataset, using `python train.py --cfg /path/to/config.yaml`. ```bash $ cd $ROOT/src $ python train.py --cfg ../experiments/002_MPII_ResNet_withAttention.yaml # To train the model with pose regularized attention, use the following config $ python train.py --cfg ../experiments/003_MPII_ResNet_withPoseAttention.yaml # To train the baseline without attention, use the following config $ python train.py --cfg ../experiments/001_MPII_ResNet.yaml ``` ## Testing and evaluation Test the model trained above on the validation set, using `python eval.py --cfg /path/to/config.yaml`. ```bash $ python eval.py --cfg ../experiments/002_MPII_ResNet_withAttention.yaml # To evaluate the model with pose regularized attention $ python eval.py --cfg ../experiments/003_MPII_ResNet_withPoseAttention.yaml # To evaluate the model without attention $ python eval.py --cfg ../experiments/001_MPII_ResNet.yaml ``` The performance of these models should be similar to the above released pre-trained models. ## Train + test on the final test set This is for getting the final number on the MPII test set. 
```bash # Train on the train + val set $ python train.py --cfg ../experiments/004_MPII_ResNet_withAttention_train+val.yaml # Test on the test set $ python eval.py --cfg ../experiments/004_MPII_ResNet_withAttention_train+val.yaml --save # Convert the output into the MAT files as expected by MPII authors (requires matlab/octave) $ cd ../utils; $ bash convert_mpii_result_for_eval.sh ../src/expt_outputs/004_MPII_ResNet_withAttention_train+val.yaml/ # Now the generated mat file can be emailed to MPII authors for test evaluation ``` ================================================ FILE: experiments/001_MPII_ResNet.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: '' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 ================================================ FILE: experiments/001_MPII_ResNet_pretrained.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: '' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 CHECKPOINT_PATH: pretrained_models/mpii_baseline/model.ckpt-12000 ================================================ FILE: experiments/002_MPII_ResNet_withAttention.yaml 
================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' NET: USE_POSE_PRELOGITS_BASED_ATTENTION: True USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT: True TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: '' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 ================================================ FILE: experiments/002_MPII_ResNet_withAttention_pretrained.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' NET: USE_POSE_PRELOGITS_BASED_ATTENTION: True USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT: True TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: '' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 CHECKPOINT_PATH: pretrained_models/mpii_attention/model.ckpt-12000 ================================================ FILE: experiments/003_MPII_ResNet_withPoseAttention.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' HEATMAP_MARKER_WD_RATIO: 0.05 NET: USE_POSE_PRELOGITS_BASED_ATTENTION: True TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: 
data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: 'l2' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 ================================================ FILE: experiments/003_MPII_ResNet_withPoseAttention_pretrained.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' HEATMAP_MARKER_WD_RATIO: 0.05 NET: USE_POSE_PRELOGITS_BASED_ATTENTION: True TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: 'l2' TEST: EVAL_METRIC: mAP BATCH_SIZE: 1 CHECKPOINT_PATH: pretrained_models/mpii_poseAttention/model.ckpt-12000 ================================================ FILE: experiments/004_MPII_ResNet_withAttention_train+val.yaml ================================================ GPUS: '0,1,2,3' NUM_READERS: 4 NUM_PREPROCESSING_THREADS: 12 MODEL_NAME: 'resnet_v1_101' NET: USE_POSE_PRELOGITS_BASED_ATTENTION: True USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT: True TRAIN: ITER_SIZE: 2 LEARNING_RATE: 0.001 BATCH_SIZE: 16 FINAL_POSE_HMAP_SIDE: 15 LEARNING_RATE_DECAY_RATE: 0.33 NUM_STEPS_PER_DECAY: 5000 MAX_NUMBER_OF_STEPS: 12000 LOSS_FN_ACTION: softmax-xentropy CHECKPOINT_PATH: data/pretrained_models/resnet_v1_101.ckpt CHECKPOINT_EXCLUDE_SCOPES: resnet_v1_101/logits LOSS_FN_ACTION: 'softmax-xentropy' LOSS_FN_POSE: '' DATASET_SPLIT_NAME: 'trainval' TEST: DATASET_SPLIT_NAME: 'test' EVAL_METRIC: mAP BATCH_SIZE: 1 ================================================ FILE: models/.github/ISSUE_TEMPLATE.md ================================================ ## Please let us know which model this issue is about (specify the top-level 
directory) ================================================ FILE: models/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject # editor *.swp ================================================ FILE: models/.gitmodules ================================================ [submodule "tensorflow"] path = syntaxnet/tensorflow url = https://github.com/tensorflow/tensorflow.git ================================================ FILE: models/LICENSE ================================================ Copyright 2016 The TensorFlow Authors. All rights reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2016, The Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: models/slim/__init__.py ================================================ ================================================ FILE: models/slim/datasets/__init__.py ================================================ ================================================ FILE: models/slim/datasets/cifar10.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides data for the Cifar10 dataset. The dataset scripts used to create the dataset can be found at: tensorflow/models/slim/data/create_cifar10_dataset.py """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import tensorflow as tf from datasets import dataset_utils slim = tf.contrib.slim _FILE_PATTERN = 'cifar10_%s.tfrecord' SPLITS_TO_SIZES = {'train': 50000, 'test': 10000} _NUM_CLASSES = 10 _ITEMS_TO_DESCRIPTIONS = { 'image': 'A [32 x 32 x 3] color image.', 'label': 'A single integer between 0 and 9', } def get_split(split_name, dataset_dir, file_pattern=None, reader=None): """Gets a dataset tuple with instructions for reading cifar10. Args: split_name: A train/test split name. dataset_dir: The base directory of the dataset sources. file_pattern: The file pattern to use when matching the dataset sources. It is assumed that the pattern contains a '%s' string so that the split name can be inserted. reader: The TensorFlow reader type. Returns: A `Dataset` namedtuple. Raises: ValueError: if `split_name` is not a valid train/test split. """ if split_name not in SPLITS_TO_SIZES: raise ValueError('split name %s was not recognized.' % split_name) if not file_pattern: file_pattern = _FILE_PATTERN file_pattern = os.path.join(dataset_dir, file_pattern % split_name) # Allowing None in the signature so that dataset_factory can use the default. 
if not reader: reader = tf.TFRecordReader keys_to_features = { 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), 'image/format': tf.FixedLenFeature((), tf.string, default_value='png'), 'image/class/label': tf.FixedLenFeature( [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)), } items_to_handlers = { 'image': slim.tfexample_decoder.Image(shape=[32, 32, 3]), 'label': slim.tfexample_decoder.Tensor('image/class/label'), } decoder = slim.tfexample_decoder.TFExampleDecoder( keys_to_features, items_to_handlers) labels_to_names = None if dataset_utils.has_labels(dataset_dir): labels_to_names = dataset_utils.read_label_file(dataset_dir) return slim.dataset.Dataset( data_sources=file_pattern, reader=reader, decoder=decoder, num_samples=SPLITS_TO_SIZES[split_name], items_to_descriptions=_ITEMS_TO_DESCRIPTIONS, num_classes=_NUM_CLASSES, labels_to_names=labels_to_names) ================================================ FILE: models/slim/datasets/dataset_factory.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """A factory-pattern class which returns classification image/label pairs.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from datasets import cifar10 from datasets import flowers from datasets import imagenet from datasets import mnist datasets_map = { 'cifar10': cifar10, 'flowers': flowers, 'imagenet': imagenet, 'mnist': mnist, } def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None): """Given a dataset name and a split_name returns a Dataset. Args: name: String, the name of the dataset. split_name: A train/test split name. dataset_dir: The directory where the dataset files are stored. file_pattern: The file pattern to use for matching the dataset source files. reader: The subclass of tf.ReaderBase. If left as `None`, then the default reader defined by each dataset is used. Returns: A `Dataset` class. Raises: ValueError: If the dataset `name` is unknown. """ if name not in datasets_map: raise ValueError('Name of dataset unknown %s' % name) return datasets_map[name].get_split( split_name, dataset_dir, file_pattern, reader) ================================================ FILE: models/slim/datasets/dataset_utils.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """Contains utilities for downloading and converting datasets.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import tarfile from six.moves import urllib import tensorflow as tf LABELS_FILENAME = 'labels.txt' def int64_feature(values): """Returns a TF-Feature of int64s. Args: values: A scalar or list of values. Returns: a TF-Feature. """ if not isinstance(values, (tuple, list)): values = [values] return tf.train.Feature(int64_list=tf.train.Int64List(value=values)) def bytes_feature(values): """Returns a TF-Feature of bytes. Args: values: A string. Returns: a TF-Feature. """ return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values])) def image_to_tfexample(image_data, image_format, height, width, class_id): return tf.train.Example(features=tf.train.Features(feature={ 'image/encoded': bytes_feature(image_data), 'image/format': bytes_feature(image_format), 'image/class/label': int64_feature(class_id), 'image/height': int64_feature(height), 'image/width': int64_feature(width), })) def download_and_uncompress_tarball(tarball_url, dataset_dir): """Downloads the `tarball_url` and uncompresses it locally. Args: tarball_url: The URL of a tarball file. dataset_dir: The directory where the temporary files are stored. 
""" filename = tarball_url.split('/')[-1] filepath = os.path.join(dataset_dir, filename) def _progress(count, block_size, total_size): sys.stdout.write('\r>> Downloading %s %.1f%%' % ( filename, float(count * block_size) / float(total_size) * 100.0)) sys.stdout.flush() filepath, _ = urllib.request.urlretrieve(tarball_url, filepath, _progress) print() statinfo = os.stat(filepath) print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') tarfile.open(filepath, 'r:gz').extractall(dataset_dir) def write_label_file(labels_to_class_names, dataset_dir, filename=LABELS_FILENAME): """Writes a file with the list of class names. Args: labels_to_class_names: A map of (integer) labels to class names. dataset_dir: The directory in which the labels file should be written. filename: The filename where the class names are written. """ labels_filename = os.path.join(dataset_dir, filename) with tf.gfile.Open(labels_filename, 'w') as f: for label in labels_to_class_names: class_name = labels_to_class_names[label] f.write('%d:%s\n' % (label, class_name)) def has_labels(dataset_dir, filename=LABELS_FILENAME): """Specifies whether or not the dataset directory contains a label map file. Args: dataset_dir: The directory in which the labels file is found. filename: The filename where the class names are written. Returns: `True` if the labels file exists and `False` otherwise. """ return tf.gfile.Exists(os.path.join(dataset_dir, filename)) def read_label_file(dataset_dir, filename=LABELS_FILENAME): """Reads the labels file and returns a mapping from ID to class name. Args: dataset_dir: The directory in which the labels file is found. filename: The filename where the class names are written. Returns: A map from a label (integer) to class name. 
""" labels_filename = os.path.join(dataset_dir, filename) with tf.gfile.Open(labels_filename, 'r') as f: lines = f.read().decode() lines = lines.split('\n') lines = filter(None, lines) labels_to_class_names = {} for line in lines: index = line.index(':') labels_to_class_names[int(line[:index])] = line[index+1:] return labels_to_class_names ================================================ FILE: models/slim/datasets/download_and_convert_cifar10.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== r"""Downloads and converts cifar10 data to TFRecords of TF-Example protos. This module downloads the cifar10 data, uncompresses it, reads the files that make up the cifar10 data and creates two TFRecord datasets: one for train and one for test. Each TFRecord dataset is comprised of a set of TF-Example protocol buffers, each of which contain a single image and label. The script should take several minutes to run. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import cPickle import os import sys import tarfile import numpy as np from six.moves import urllib import tensorflow as tf from datasets import dataset_utils # The URL where the CIFAR data can be downloaded. 
_DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' # The number of training files. _NUM_TRAIN_FILES = 5 # The height and width of each image. _IMAGE_SIZE = 32 # The names of the classes. _CLASS_NAMES = [ 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck', ] def _add_to_tfrecord(filename, tfrecord_writer, offset=0): """Loads data from the cifar10 pickle files and writes files to a TFRecord. Args: filename: The filename of the cifar10 pickle file. tfrecord_writer: The TFRecord writer to use for writing. offset: An offset into the absolute number of images previously written. Returns: The new offset. """ with tf.gfile.Open(filename, 'r') as f: data = cPickle.load(f) images = data['data'] num_images = images.shape[0] images = images.reshape((num_images, 3, 32, 32)) labels = data['labels'] with tf.Graph().as_default(): image_placeholder = tf.placeholder(dtype=tf.uint8) encoded_image = tf.image.encode_png(image_placeholder) with tf.Session('') as sess: for j in range(num_images): sys.stdout.write('\r>> Reading file [%s] image %d/%d' % ( filename, offset + j + 1, offset + num_images)) sys.stdout.flush() image = np.squeeze(images[j]).transpose((1, 2, 0)) label = labels[j] png_string = sess.run(encoded_image, feed_dict={image_placeholder: image}) example = dataset_utils.image_to_tfexample( png_string, 'png', _IMAGE_SIZE, _IMAGE_SIZE, label) tfrecord_writer.write(example.SerializeToString()) return offset + num_images def _get_output_filename(dataset_dir, split_name): """Creates the output filename. Args: dataset_dir: The dataset directory where the dataset is stored. split_name: The name of the train/test split. Returns: An absolute file path. """ return '%s/cifar10_%s.tfrecord' % (dataset_dir, split_name) def _download_and_uncompress_dataset(dataset_dir): """Downloads cifar10 and uncompresses it locally. Args: dataset_dir: The directory where the temporary files are stored. 
""" filename = _DATA_URL.split('/')[-1] filepath = os.path.join(dataset_dir, filename) if not os.path.exists(filepath): def _progress(count, block_size, total_size): sys.stdout.write('\r>> Downloading %s %.1f%%' % ( filename, float(count * block_size) / float(total_size) * 100.0)) sys.stdout.flush() filepath, _ = urllib.request.urlretrieve(_DATA_URL, filepath, _progress) print() statinfo = os.stat(filepath) print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') tarfile.open(filepath, 'r:gz').extractall(dataset_dir) def _clean_up_temporary_files(dataset_dir): """Removes temporary files used to create the dataset. Args: dataset_dir: The directory where the temporary files are stored. """ filename = _DATA_URL.split('/')[-1] filepath = os.path.join(dataset_dir, filename) tf.gfile.Remove(filepath) tmp_dir = os.path.join(dataset_dir, 'cifar-10-batches-py') tf.gfile.DeleteRecursively(tmp_dir) def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: offset = 0 for i in range(_NUM_TRAIN_FILES): filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'data_batch_%d' % (i + 1)) # 1-indexed. 
offset = _add_to_tfrecord(filename, tfrecord_writer, offset) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch') _add_to_tfrecord(filename, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Cifar10 dataset!') ================================================ FILE: models/slim/datasets/download_and_convert_flowers.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== r"""Downloads and converts Flowers data to TFRecords of TF-Example protos. This module downloads the Flowers data, uncompresses it, reads the files that make up the Flowers data and creates two TFRecord datasets: one for train and one for test. Each TFRecord dataset is comprised of a set of TF-Example protocol buffers, each of which contain a single image and label. The script should take about a minute to run. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import os import random import sys import tensorflow as tf from datasets import dataset_utils # The URL where the Flowers data can be downloaded. _DATA_URL = 'http://download.tensorflow.org/example_images/flower_photos.tgz' # The number of images in the validation set. _NUM_VALIDATION = 350 # Seed for repeatability. _RANDOM_SEED = 0 # The number of shards per dataset split. _NUM_SHARDS = 5 class ImageReader(object): """Helper class that provides TensorFlow image coding utilities.""" def __init__(self): # Initializes function that decodes RGB JPEG data. self._decode_jpeg_data = tf.placeholder(dtype=tf.string) self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) def read_image_dims(self, sess, image_data): image = self.decode_jpeg(sess, image_data) return image.shape[0], image.shape[1] def decode_jpeg(self, sess, image_data): image = sess.run(self._decode_jpeg, feed_dict={self._decode_jpeg_data: image_data}) assert len(image.shape) == 3 assert image.shape[2] == 3 return image def _get_filenames_and_classes(dataset_dir): """Returns a list of filenames and inferred class names. Args: dataset_dir: A directory containing a set of subdirectories representing class names. Each subdirectory should contain PNG or JPG encoded images. Returns: A list of image file paths, relative to `dataset_dir` and the list of subdirectories, representing class names. 
""" flower_root = os.path.join(dataset_dir, 'flower_photos') directories = [] class_names = [] for filename in os.listdir(flower_root): path = os.path.join(flower_root, filename) if os.path.isdir(path): directories.append(path) class_names.append(filename) photo_filenames = [] for directory in directories: for filename in os.listdir(directory): path = os.path.join(directory, filename) photo_filenames.append(path) return photo_filenames, sorted(class_names) def _get_dataset_filename(dataset_dir, split_name, shard_id): output_filename = 'flowers_%s_%05d-of-%05d.tfrecord' % ( split_name, shard_id, _NUM_SHARDS) return os.path.join(dataset_dir, output_filename) def _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir): """Converts the given filenames to a TFRecord dataset. Args: split_name: The name of the dataset, either 'train' or 'validation'. filenames: A list of absolute paths to png or jpg images. class_names_to_ids: A dictionary from class names (strings) to ids (integers). dataset_dir: The directory where the converted datasets are stored. 
""" assert split_name in ['train', 'validation'] num_per_shard = int(math.ceil(len(filenames) / float(_NUM_SHARDS))) with tf.Graph().as_default(): image_reader = ImageReader() with tf.Session('') as sess: for shard_id in range(_NUM_SHARDS): output_filename = _get_dataset_filename( dataset_dir, split_name, shard_id) with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer: start_ndx = shard_id * num_per_shard end_ndx = min((shard_id+1) * num_per_shard, len(filenames)) for i in range(start_ndx, end_ndx): sys.stdout.write('\r>> Converting image %d/%d shard %d' % ( i+1, len(filenames), shard_id)) sys.stdout.flush() # Read the filename: image_data = tf.gfile.FastGFile(filenames[i], 'r').read() height, width = image_reader.read_image_dims(sess, image_data) class_name = os.path.basename(os.path.dirname(filenames[i])) class_id = class_names_to_ids[class_name] example = dataset_utils.image_to_tfexample( image_data, 'jpg', height, width, class_id) tfrecord_writer.write(example.SerializeToString()) sys.stdout.write('\n') sys.stdout.flush() def _clean_up_temporary_files(dataset_dir): """Removes temporary files used to create the dataset. Args: dataset_dir: The directory where the temporary files are stored. """ filename = _DATA_URL.split('/')[-1] filepath = os.path.join(dataset_dir, filename) tf.gfile.Remove(filepath) tmp_dir = os.path.join(dataset_dir, 'flower_photos') tf.gfile.DeleteRecursively(tmp_dir) def _dataset_exists(dataset_dir): for split_name in ['train', 'validation']: for shard_id in range(_NUM_SHARDS): output_filename = _get_dataset_filename( dataset_dir, split_name, shard_id) if not tf.gfile.Exists(output_filename): return False return True def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) if _dataset_exists(dataset_dir): print('Dataset files already exist. 
Exiting without re-creating them.') return dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir) photo_filenames, class_names = _get_filenames_and_classes(dataset_dir) class_names_to_ids = dict(zip(class_names, range(len(class_names)))) # Divide into train and test: random.seed(_RANDOM_SEED) random.shuffle(photo_filenames) training_filenames = photo_filenames[_NUM_VALIDATION:] validation_filenames = photo_filenames[:_NUM_VALIDATION] # First, convert the training and validation sets. _convert_dataset('train', training_filenames, class_names_to_ids, dataset_dir) _convert_dataset('validation', validation_filenames, class_names_to_ids, dataset_dir) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(class_names)), class_names)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the Flowers dataset!') ================================================ FILE: models/slim/datasets/download_and_convert_mnist.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== r"""Downloads and converts MNIST data to TFRecords of TF-Example protos. 
This module downloads the MNIST data, uncompresses it, reads the files that make up the MNIST data and creates two TFRecord datasets: one for train and one for test. Each TFRecord dataset is comprised of a set of TF-Example protocol buffers, each of which contain a single image and label. The script should take about a minute to run. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import gzip import os import sys import numpy as np from six.moves import urllib import tensorflow as tf from datasets import dataset_utils # The URLs where the MNIST data can be downloaded. _DATA_URL = 'http://yann.lecun.com/exdb/mnist/' _TRAIN_DATA_FILENAME = 'train-images-idx3-ubyte.gz' _TRAIN_LABELS_FILENAME = 'train-labels-idx1-ubyte.gz' _TEST_DATA_FILENAME = 't10k-images-idx3-ubyte.gz' _TEST_LABELS_FILENAME = 't10k-labels-idx1-ubyte.gz' _IMAGE_SIZE = 28 _NUM_CHANNELS = 1 # The names of the classes. _CLASS_NAMES = [ 'zero', 'one', 'two', 'three', 'four', 'five', 'size', 'seven', 'eight', 'nine', ] def _extract_images(filename, num_images): """Extract the images into a numpy array. Args: filename: The path to an MNIST images file. num_images: The number of images in the file. Returns: A numpy array of shape [number_of_images, height, width, channels]. """ print('Extracting images from: ', filename) with gzip.open(filename) as bytestream: bytestream.read(16) buf = bytestream.read( _IMAGE_SIZE * _IMAGE_SIZE * num_images * _NUM_CHANNELS) data = np.frombuffer(buf, dtype=np.uint8) data = data.reshape(num_images, _IMAGE_SIZE, _IMAGE_SIZE, _NUM_CHANNELS) return data def _extract_labels(filename, num_labels): """Extract the labels into a vector of int64 label IDs. Args: filename: The path to an MNIST labels file. num_labels: The number of labels in the file. 
Returns: A numpy array of shape [number_of_labels] """ print('Extracting labels from: ', filename) with gzip.open(filename) as bytestream: bytestream.read(8) buf = bytestream.read(1 * num_labels) labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64) return labels def _add_to_tfrecord(data_filename, labels_filename, num_images, tfrecord_writer): """Loads data from the binary MNIST files and writes files to a TFRecord. Args: data_filename: The filename of the MNIST images. labels_filename: The filename of the MNIST labels. num_images: The number of images in the dataset. tfrecord_writer: The TFRecord writer to use for writing. """ images = _extract_images(data_filename, num_images) labels = _extract_labels(labels_filename, num_images) shape = (_IMAGE_SIZE, _IMAGE_SIZE, _NUM_CHANNELS) with tf.Graph().as_default(): image = tf.placeholder(dtype=tf.uint8, shape=shape) encoded_png = tf.image.encode_png(image) with tf.Session('') as sess: for j in range(num_images): sys.stdout.write('\r>> Converting image %d/%d' % (j + 1, num_images)) sys.stdout.flush() png_string = sess.run(encoded_png, feed_dict={image: images[j]}) example = dataset_utils.image_to_tfexample( png_string, 'png', _IMAGE_SIZE, _IMAGE_SIZE, labels[j]) tfrecord_writer.write(example.SerializeToString()) def _get_output_filename(dataset_dir, split_name): """Creates the output filename. Args: dataset_dir: The directory where the temporary files are stored. split_name: The name of the train/test split. Returns: An absolute file path. """ return '%s/mnist_%s.tfrecord' % (dataset_dir, split_name) def _download_dataset(dataset_dir): """Downloads MNIST locally. Args: dataset_dir: The directory where the temporary files are stored. """ for filename in [_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME, _TEST_DATA_FILENAME, _TEST_LABELS_FILENAME]: filepath = os.path.join(dataset_dir, filename) if not os.path.exists(filepath): print('Downloading file %s...' 
% filename) def _progress(count, block_size, total_size): sys.stdout.write('\r>> Downloading %.1f%%' % ( float(count * block_size) / float(total_size) * 100.0)) sys.stdout.flush() filepath, _ = urllib.request.urlretrieve(_DATA_URL + filename, filepath, _progress) print() with tf.gfile.GFile(filepath) as f: size = f.Size() print('Successfully downloaded', filename, size, 'bytes.') def _clean_up_temporary_files(dataset_dir): """Removes temporary files used to create the dataset. Args: dataset_dir: The directory where the temporary files are stored. """ for filename in [_TRAIN_DATA_FILENAME, _TRAIN_LABELS_FILENAME, _TEST_DATA_FILENAME, _TEST_LABELS_FILENAME]: filepath = os.path.join(dataset_dir, filename) tf.gfile.Remove(filepath) def run(dataset_dir): """Runs the download and conversion operation. Args: dataset_dir: The dataset directory where the dataset is stored. """ if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) training_filename = _get_output_filename(dataset_dir, 'train') testing_filename = _get_output_filename(dataset_dir, 'test') if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename): print('Dataset files already exist. 
Exiting without re-creating them.') return _download_dataset(dataset_dir) # First, process the training data: with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer) # Next, process the testing data: with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer: data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME) labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME) _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer) # Finally, write the labels file: labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES)) dataset_utils.write_label_file(labels_to_class_names, dataset_dir) _clean_up_temporary_files(dataset_dir) print('\nFinished converting the MNIST dataset!') ================================================ FILE: models/slim/datasets/flowers.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides data for the flowers dataset. 
_FILE_PATTERN = 'flowers_%s_*.tfrecord'

SPLITS_TO_SIZES = {'train': 3320, 'validation': 350}

_NUM_CLASSES = 5

_ITEMS_TO_DESCRIPTIONS = {
    'image': 'A color image of varying size.',
    'label': 'A single integer between 0 and 4',
}


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Builds the `Dataset` description used to read one flowers split.

  Args:
    split_name: A train/validation split name (a key of `SPLITS_TO_SIZES`).
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted. Falls back to `_FILE_PATTERN` when falsy.
    reader: The TensorFlow reader type. `tf.TFRecordReader` when None.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/validation split.
  """
  if split_name not in SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  pattern_template = file_pattern or _FILE_PATTERN
  data_sources = os.path.join(dataset_dir, pattern_template % split_name)

  # None is accepted in the signature so that dataset_factory can rely on the
  # default reader.
  record_reader = tf.TFRecordReader if reader is None else reader

  encoded_feature = tf.FixedLenFeature((), tf.string, default_value='')
  format_feature = tf.FixedLenFeature((), tf.string, default_value='png')
  label_feature = tf.FixedLenFeature(
      [], tf.int64, default_value=tf.zeros([], dtype=tf.int64))
  keys_to_features = {
      'image/encoded': encoded_feature,
      'image/format': format_feature,
      'image/class/label': label_feature,
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image(),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
  }

  example_decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  # Label names are optional: they are only read when a labels file exists.
  labels_to_names = (dataset_utils.read_label_file(dataset_dir)
                     if dataset_utils.has_labels(dataset_dir) else None)

  return slim.dataset.Dataset(
      data_sources=data_sources,
      reader=record_reader,
      decoder=example_decoder,
      num_samples=SPLITS_TO_SIZES[split_name],
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=_NUM_CLASSES,
      labels_to_names=labels_to_names)
# TODO(nsilberman): Add tfrecord file type once the script is updated.
_FILE_PATTERN = '%s-*'

_SPLITS_TO_SIZES = {
    'train': 1281167,
    'validation': 50000,
}

_ITEMS_TO_DESCRIPTIONS = {
    'image': 'A color image of varying height and width.',
    'label': 'The label id of the image, integer between 0 and 999',
    'label_text': 'The text of the label.',
    'object/bbox': 'A list of bounding boxes.',
    'object/label': 'A list of labels, one per each object.',
}

_NUM_CLASSES = 1001


def create_readable_names_for_imagenet_labels():
  """Create a dict mapping label id to human readable string.

  Returns:
      labels_to_names: dictionary where keys are integers from 0 to 1000
      and values are human-readable names.

  We retrieve a synset file, which contains a list of valid synset labels used
  by ILSVRC competition. There is one synset one per line, eg.
          #   n01440764
          #   n01443537
  We also retrieve a synset_to_human_file, which contains a mapping from synsets
  to human-readable names for every synset in Imagenet. These are stored in a
  tsv format, as follows:
          #   n02119247    black fox
          #   n02119359    silver fox
  We assign each synset (in the order listed in the synset file) an integer,
  starting from 1 (since 0 is reserved for the background class).

  Both files are downloaded with `urlretrieve`, so calling this function
  requires network access.

  Code is based on
  https://github.com/tensorflow/models/blob/master/inception/inception/data/build_imagenet_data.py#L463
  """

  # pylint: disable=g-line-too-long
  base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/inception/inception/data/'
  synset_url = '{}/imagenet_lsvrc_2015_synsets.txt'.format(base_url)
  synset_to_human_url = '{}/imagenet_metadata.txt'.format(base_url)

  filename, _ = urllib.request.urlretrieve(synset_url)
  # Read through a context manager so the handle is closed deterministically.
  with open(filename) as synset_file:
    synset_list = [s.strip() for s in synset_file]
  num_synsets_in_ilsvrc = len(synset_list)
  assert num_synsets_in_ilsvrc == 1000

  filename, _ = urllib.request.urlretrieve(synset_to_human_url)
  with open(filename) as synset_to_human_file:
    synset_to_human_list = synset_to_human_file.readlines()
  num_synsets_in_all_imagenet = len(synset_to_human_list)
  assert num_synsets_in_all_imagenet == 21842

  synset_to_human = {}
  for s in synset_to_human_list:
    parts = s.strip().split('\t')
    assert len(parts) == 2
    synset = parts[0]
    human = parts[1]
    synset_to_human[synset] = human

  # Label 0 is the background class; real synsets are numbered from 1.
  labels_to_names = {0: 'background'}
  for label_index, synset in enumerate(synset_list, 1):
    labels_to_names[label_index] = synset_to_human[synset]

  return labels_to_names


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Gets a dataset tuple with instructions for reading ImageNet.

  When `dataset_dir` has no labels file, the label names are built with
  `create_readable_names_for_imagenet_labels()` and written to `dataset_dir`.

  Args:
    split_name: A train/validation split name (a key of `_SPLITS_TO_SIZES`).
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/validation split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

  # Allowing None in the signature so that dataset_factory can use the default.
  if reader is None:
    reader = tf.TFRecordReader

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature(
          (), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature(
          (), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature(
          [], dtype=tf.int64, default_value=-1),
      'image/class/text': tf.FixedLenFeature(
          [], dtype=tf.string, default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(
          dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  if dataset_utils.has_labels(dataset_dir):
    labels_to_names = dataset_utils.read_label_file(dataset_dir)
  else:
    # No labels file yet: build the mapping once and store it in dataset_dir.
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, dataset_dir)

  return slim.dataset.Dataset(
      data_sources=file_pattern,
      reader=reader,
      decoder=decoder,
      num_samples=_SPLITS_TO_SIZES[split_name],
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=_NUM_CLASSES,
      labels_to_names=labels_to_names)
_FILE_PATTERN = 'mnist_%s.tfrecord'

_SPLITS_TO_SIZES = {'train': 60000, 'test': 10000}

_NUM_CLASSES = 10

_ITEMS_TO_DESCRIPTIONS = {
    'image': 'A [28 x 28 x 1] grayscale image.',
    'label': 'A single integer between 0 and 9',
}


def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Builds the `Dataset` description used to read one MNIST split.

  Args:
    split_name: A train/test split name (a key of `_SPLITS_TO_SIZES`).
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted. Falls back to `_FILE_PATTERN` when falsy.
    reader: The TensorFlow reader type. `tf.TFRecordReader` when None.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  pattern_template = file_pattern or _FILE_PATTERN
  data_sources = os.path.join(dataset_dir, pattern_template % split_name)

  # None is accepted in the signature so that dataset_factory can rely on the
  # default reader.
  record_reader = tf.TFRecordReader if reader is None else reader

  encoded_feature = tf.FixedLenFeature((), tf.string, default_value='')
  format_feature = tf.FixedLenFeature((), tf.string, default_value='raw')
  label_feature = tf.FixedLenFeature(
      [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64))
  keys_to_features = {
      'image/encoded': encoded_feature,
      'image/format': format_feature,
      'image/class/label': label_feature,
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image(shape=[28, 28, 1], channels=1),
      'label': slim.tfexample_decoder.Tensor('image/class/label', shape=[]),
  }

  example_decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  # Label names are optional: they are only read when a labels file exists.
  labels_to_names = (dataset_utils.read_label_file(dataset_dir)
                     if dataset_utils.has_labels(dataset_dir) else None)

  return slim.dataset.Dataset(
      data_sources=data_sources,
      reader=record_reader,
      decoder=example_decoder,
      num_samples=_SPLITS_TO_SIZES[split_name],
      num_classes=_NUM_CLASSES,
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      labels_to_names=labels_to_names)
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Deploy Slim models across multiple clones and replicas.

# TODO(sguada) docstring paragraph by (a) motivating the need for the file and
# (b) defining clones.

# TODO(sguada) describe the high-level components of model deployment.
# E.g. "each model deployment is composed of several parts: a DeploymentConfig,
# which captures A, B and C, an input_fn which loads data, etc."

To easily train a model on multiple GPUs or across multiple machines this
module provides a set of helper functions: `create_clones`,
`optimize_clones` and `deploy`.

Usage:

  g = tf.Graph()

  # Set up DeploymentConfig
  config = model_deploy.DeploymentConfig(num_clones=2, clone_on_cpu=True)

  # Create the global step on the device storing the variables.
  with tf.device(config.variables_device()):
    global_step = slim.create_global_step()

  # Define the inputs
  with tf.device(config.inputs_device()):
    images, labels = LoadData(...)
    inputs_queue = slim.data.prefetch_queue((images, labels))

  # Define the optimizer.
  with tf.device(config.optimizer_device()):
    optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)

  # Define the model including the loss.
  def model_fn(inputs_queue):
    images, labels = inputs_queue.dequeue()
    predictions = CreateNetwork(images)
    slim.losses.log_loss(predictions, labels)

  model_dp = model_deploy.deploy(config, model_fn, [inputs_queue],
                                 optimizer=optimizer)

  # Run training.
  slim.learning.train(model_dp.train_op, my_log_dir,
                      summary_op=model_dp.summary_op)

The Clone namedtuple holds together the values associated with each call to
model_fn:
  * outputs: The return values of the calls to `model_fn()`.
  * scope: The scope used to create the clone.
  * device: The device used to create the clone.

The DeployedModel namedtuple holds together the values needed to train
multiple clones:
  * train_op: An operation that runs the optimizer training op and includes
    all the update ops created by `model_fn`. Present only if an optimizer was
    specified.
  * summary_op: An operation that runs the summaries created by `model_fn`
    and process_gradients.
  * total_loss: A `Tensor` that contains the sum of all losses created by
    `model_fn` plus the regularization losses.
  * clones: List of `Clone` tuples returned by `create_clones()`.

DeploymentConfig parameters:
  * num_clones: Number of model clones to deploy in each replica.
  * clone_on_cpu: True if clones should be placed on CPU.
  * replica_id: Integer.  Index of the replica for which the model is
      deployed.  Usually 0 for the chief replica.
  * num_replicas: Number of replicas to use.
  * num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
  * worker_job_name: A name for the worker job.
  * ps_job_name: A name for the parameter server job.

TODO(sguada):
  - describe side effect to the graph.
  - what happens to summaries and update_ops.
  - which graph collections are altered.
  - write a tutorial on how to use this.
  - analyze the possibility of calling deploy more than once.


"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import tensorflow as tf

from tensorflow.python.ops import control_flow_ops

slim = tf.contrib.slim


__all__ = ['create_clones',
           'deploy',
           'optimize_clones',
           'DeployedModel',
           'DeploymentConfig',
           'Clone',
          ]


# Namedtuple used to represent a clone during deployment.
Clone = collections.namedtuple('Clone',
                               ['outputs',  # Whatever model_fn() returned.
                                'scope',  # The scope used to create it.
                                'device',  # The device used to create.
                               ])

# Namedtuple used to represent a DeployedModel, returned by deploy().
DeployedModel = collections.namedtuple('DeployedModel',
                                       ['train_op',  # The `train_op`
                                        'summary_op',  # The `summary_op`
                                        'total_loss',  # The loss `Tensor`
                                        'clones',  # A list of `Clones` tuples.
                                       ])

# Default parameters for DeploymentConfig
_deployment_params = {'num_clones': 1,
                      'clone_on_cpu': False,
                      'replica_id': 0,
                      'num_replicas': 1,
                      'num_ps_tasks': 0,
                      'worker_job_name': 'worker',
                      'ps_job_name': 'ps'}


def create_clones(config, model_fn, args=None, kwargs=None):
  """Creates multiple clones according to config using a `model_fn`.

  The returned values of `model_fn(*args, **kwargs)` are collected along with
  the scope and device used to created it in a namedtuple
  `Clone(outputs, scope, device)`

  Note: it is assumed that any loss created by `model_fn` is collected at
  the tf.GraphKeys.LOSSES collection.

  To recover the losses, summaries or update_ops created by the clone use:
  ```python
    losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, clone.scope)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
  ```

  The deployment options are specified by the config object and support
  deploying one or several clones on different GPUs and one or several replicas
  of such clones.

  The argument `model_fn` is called `config.num_clones` times to create the
  model clones as `model_fn(*args, **kwargs)`.

  If `config` specifies deployment on multiple replicas then the default
  tensorflow device is set appropriately for each call to `model_fn` and for
  the slim variable creation functions: model and global variables will be
  created on the `ps` device, the clone operations will be on the `worker`
  device.

  Args:
    config: A DeploymentConfig object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`
    args: Optional list of arguments to pass to `model_fn`.
    kwargs: Optional list of keyword arguments to pass to `model_fn`.

  Returns:
    A list of namedtuples `Clone`.
  """
  clones = []
  args = args or []
  kwargs = kwargs or {}
  with slim.arg_scope([slim.model_variable, slim.variable],
                      device=config.variables_device()):
    # Create clones.
    for i in range(0, config.num_clones):
      with tf.name_scope(config.clone_scope(i)) as clone_scope:
        clone_device = config.clone_device(i)
        with tf.device(clone_device):
          # Every clone after the first one reuses the variables created by
          # the first call to model_fn.
          with tf.variable_scope(tf.get_variable_scope(),
                                 reuse=True if i > 0 else None):
            outputs = model_fn(*args, **kwargs)
          clones.append(Clone(outputs, clone_scope, clone_device))
  return clones


def _gather_clone_loss(clone, num_clones, regularization_losses):
  """Gather the loss for a single clone.

  Args:
    clone: A Clone namedtuple.
    num_clones: The number of clones being deployed.
    regularization_losses: Possibly empty list of regularization_losses
      to add to the clone losses.

  Returns:
    A tensor for the total loss for the clone.  Can be None.
  """
  # The return value.
  sum_loss = None
  # Individual components of the loss that will need summaries.
  clone_loss = None
  regularization_loss = None
  # Compute and aggregate losses on the clone device.
  with tf.device(clone.device):
    all_losses = []
    clone_losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
    if clone_losses:
      clone_loss = tf.add_n(clone_losses, name='clone_loss')
      if num_clones > 1:
        # Scale by 1 / num_clones so that summing over clones gives a mean.
        clone_loss = tf.div(clone_loss, 1.0 * num_clones,
                            name='scaled_clone_loss')
      all_losses.append(clone_loss)
    if regularization_losses:
      regularization_loss = tf.add_n(regularization_losses,
                                     name='regularization_loss')
      all_losses.append(regularization_loss)
    if all_losses:
      sum_loss = tf.add_n(all_losses)
  # Add the summaries out of the clone device block.
  if clone_loss is not None:
    tf.summary.scalar(clone.scope + '/clone_loss', clone_loss)
  if regularization_loss is not None:
    tf.summary.scalar('regularization_loss', regularization_loss)
  return sum_loss


def _optimize_clone(optimizer, clone, num_clones, regularization_losses,
                    **kwargs):
  """Compute losses and gradients for a single clone.

  Args:
    optimizer: A tf.Optimizer  object.
    clone: A Clone namedtuple.
    num_clones: The number of clones being deployed.
    regularization_losses: Possibly empty list of regularization_losses
      to add to the clone losses.
    **kwargs: Dict of kwarg to pass to compute_gradients().

  Returns:
    A tuple (clone_loss, clone_grads_and_vars).
      - clone_loss: A tensor for the total loss for the clone.  Can be None.
      - clone_grads_and_vars: List of (gradient, variable) for the clone.
        None when the clone has no loss.
  """
  sum_loss = _gather_clone_loss(clone, num_clones, regularization_losses)
  clone_grad = None
  if sum_loss is not None:
    with tf.device(clone.device):
      clone_grad = optimizer.compute_gradients(sum_loss, **kwargs)
  return sum_loss, clone_grad


def optimize_clones(clones, optimizer,
                    regularization_losses=None,
                    clip_gradients=-1.0,
                    **kwargs):
  """Compute clone losses and gradients for the given list of `Clones`.

  Note: The regularization_losses are added to the first clone losses.

  Args:
   clones: List of `Clones` created by `create_clones()`.
   optimizer: An `Optimizer` object.
   regularization_losses: Optional list of regularization losses. If None it
     will gather them from tf.GraphKeys.REGULARIZATION_LOSSES. Pass `[]` to
     exclude them.
   clip_gradients: When positive, the gradients of each clone are passed
     through `slim.learning.clip_gradient_norms` with this value before being
     summed. Non-positive values (the default) disable clipping.
   **kwargs: Optional list of keyword arguments to pass to `compute_gradients`.

  Returns:
   A tuple (total_loss, grads_and_vars).
     - total_loss: A Tensor containing the average of the clone losses including
       the regularization loss.
     - grads_and_vars: A List of tuples (gradient, variable) containing the sum
       of the gradients for each variable.

  """
  grads_and_vars = []
  clones_losses = []
  num_clones = len(clones)
  if regularization_losses is None:
    regularization_losses = tf.get_collection(
        tf.GraphKeys.REGULARIZATION_LOSSES)
  for clone in clones:
    with tf.name_scope(clone.scope):
      clone_loss, clone_grad = _optimize_clone(
          optimizer, clone, num_clones, regularization_losses, **kwargs)
      if clone_loss is not None:
        # A clone without a loss has no gradients (clone_grad is None), so
        # clipping is only attempted once a loss is known to exist.
        if clip_gradients > 0:
          tf.logging.info('Clipping gradient by norm {}'.format(clip_gradients))
          clone_grad = slim.learning.clip_gradient_norms(
              clone_grad, clip_gradients)
        clones_losses.append(clone_loss)
        grads_and_vars.append(clone_grad)
      # Only use regularization_losses for the first clone
      regularization_losses = None
  # Compute the total_loss summing all the clones_losses.
  total_loss = tf.add_n(clones_losses, name='total_loss')
  # Sum the gradients across clones.
  grads_and_vars = _sum_clones_gradients(grads_and_vars)
  return total_loss, grads_and_vars


def deploy(config,
           model_fn,
           args=None,
           kwargs=None,
           optimizer=None,
           summarize_gradients=False):
  """Deploys a Slim-constructed model across multiple clones.

  The deployment options are specified by the config object and support
  deploying one or several clones on different GPUs and one or several replicas
  of such clones.

  The argument `model_fn` is called `config.num_clones` times to create the
  model clones as `model_fn(*args, **kwargs)`.

  The optional argument `optimizer` is an `Optimizer` object.  If not `None`,
  the deployed model is configured for training with that optimizer.

  If `config` specifies deployment on multiple replicas then the default
  tensorflow device is set appropriately for each call to `model_fn` and for
  the slim variable creation functions: model and global variables will be
  created on the `ps` device, the clone operations will be on the `worker`
  device.

  Args:
    config: A `DeploymentConfig` object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`
    args: Optional list of arguments to pass to `model_fn`.
    kwargs: Optional list of keyword arguments to pass to `model_fn`.
    optimizer: Optional `Optimizer` object.  If passed the model is deployed
      for training with that optimizer.
    summarize_gradients: Whether or not add summaries to the gradients.

  Returns:
    A `DeployedModel` namedtuple.

  """
  # Gather initial summaries.
  summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

  # Create Clones.
  clones = create_clones(config, model_fn, args, kwargs)
  first_clone = clones[0]

  # Gather update_ops from the first clone. These contain, for example,
  # the updates for the batch_norm variables created by model_fn.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone.scope)

  train_op = None
  total_loss = None
  with tf.device(config.optimizer_device()):
    if optimizer:
      # Place the global step on the device storing the variables.
      with tf.device(config.variables_device()):
        global_step = slim.get_or_create_global_step()

      # Compute the gradients for the clones.
      total_loss, clones_gradients = optimize_clones(clones, optimizer)

      if clones_gradients:
        if summarize_gradients:
          # Add summaries to the gradients.
          summaries |= set(_add_gradients_summaries(clones_gradients))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_op = control_flow_ops.with_dependencies([update_op], total_loss,
                                                      name='train_op')
    else:
      clones_losses = []
      regularization_losses = tf.get_collection(
          tf.GraphKeys.REGULARIZATION_LOSSES)
      for clone in clones:
        with tf.name_scope(clone.scope):
          clone_loss = _gather_clone_loss(clone, len(clones),
                                          regularization_losses)
          if clone_loss is not None:
            clones_losses.append(clone_loss)
          # Only use regularization_losses for the first clone
          regularization_losses = None
      if clones_losses:
        total_loss = tf.add_n(clones_losses, name='total_loss')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone.scope))

    if total_loss is not None:
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

    if summaries:
      # Merge all summaries together. Uses the tf.summary API, like the
      # scalar summaries above, instead of the removed tf.merge_summary.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')
    else:
      summary_op = None

  return DeployedModel(train_op, summary_op, total_loss, clones)


def _sum_clones_gradients(clone_grads):
  """Calculate the sum gradient for each shared variable across all clones.

  This function assumes that the clone_grads has been scaled appropriately by
  1 / num_clones.

  Args:
    clone_grads: A List of List of tuples (gradient, variable), one list per
    `Clone`.

  Returns:
     List of tuples of (gradient, variable) where the gradient has been summed
     across all clones.
  """
  sum_grads = []
  for grad_and_vars in zip(*clone_grads):
    # Note that each grad_and_vars looks like the following:
    #   ((grad_var0_clone0, var0), ... (grad_varN_cloneN, varN))
    grads = []
    var = grad_and_vars[0][1]
    for g, v in grad_and_vars:
      assert v == var
      if g is not None:
        grads.append(g)
    if grads:
      if len(grads) > 1:
        sum_grad = tf.add_n(grads, name=var.op.name + '/sum_grads')
      else:
        sum_grad = grads[0]
      sum_grads.append((sum_grad, var))
  return sum_grads


def _add_gradients_summaries(grads_and_vars):
  """Add histogram summaries to gradients.

  Note: The summaries are also added to the SUMMARIES collection.

  Args:
    grads_and_vars: A list of gradient to variable pairs (tuples).

  Returns:
    The _list_ of the added summaries for grads_and_vars.
  """
  summaries = []
  for grad, var in grads_and_vars:
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad
      # tf.summary.histogram replaces the removed tf.histogram_summary and
      # matches the tf.summary.scalar calls used elsewhere in this module.
      summaries.append(tf.summary.histogram(var.op.name + ':gradient',
                                            grad_values))
      summaries.append(tf.summary.histogram(var.op.name + ':gradient_norm',
                                            tf.global_norm([grad_values])))
    else:
      tf.logging.info('Var %s has no gradient', var.op.name)
  return summaries


class DeploymentConfig(object):
  """Configuration for deploying a model with `deploy()`.

  You can pass an instance of this class to `deploy()` to specify exactly
  how to deploy the model to build.  If you do not pass one, an instance built
  from the default deployment_hparams will be used.
  """

  def __init__(self,
               num_clones=1,
               clone_on_cpu=False,
               replica_id=0,
               num_replicas=1,
               num_ps_tasks=0,
               worker_job_name='worker',
               ps_job_name='ps'):
    """Create a DeploymentConfig.

    The config describes how to deploy a model across multiple clones and
    replicas.  The model will be replicated `num_clones` times in each replica.
    If `clone_on_cpu` is True, each clone will placed on CPU.

    If `num_ps_tasks` is 0, `worker_job_name` and `ps_job_name` are not used
    to build device strings: the worker and ps devices are empty.

    If `num_replicas` is greater than 1, then `num_ps_tasks` must be positive
    and both `worker_job_name` and `ps_job_name` must be non-empty.

    Args:
      num_clones: Number of model clones to deploy in each replica.
      clone_on_cpu: If True clones would be placed on CPU.
      replica_id: Integer.  Index of the replica for which the model is
        deployed.  Usually 0 for the chief replica.
      num_replicas: Number of replicas to use.
      num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
      worker_job_name: A name for the worker job.
      ps_job_name: A name for the parameter server job.

    Raises:
      ValueError: If the arguments are invalid.
    """
    if num_replicas > 1:
      if num_ps_tasks < 1:
        raise ValueError('When using replicas num_ps_tasks must be positive')
    if num_replicas > 1 or num_ps_tasks > 0:
      if not worker_job_name:
        raise ValueError('Must specify worker_job_name when using replicas')
      if not ps_job_name:
        raise ValueError('Must specify ps_job_name when using parameter server')
    if replica_id >= num_replicas:
      raise ValueError('replica_id must be less than num_replicas')
    self._num_clones = num_clones
    self._clone_on_cpu = clone_on_cpu
    self._replica_id = replica_id
    self._num_replicas = num_replicas
    self._num_ps_tasks = num_ps_tasks
    # Job prefixes are only meaningful when parameter servers are in use.
    self._ps_device = '/job:' + ps_job_name if num_ps_tasks > 0 else ''
    self._worker_device = '/job:' + worker_job_name if num_ps_tasks > 0 else ''

  @property
  def num_clones(self):
    return self._num_clones

  @property
  def clone_on_cpu(self):
    return self._clone_on_cpu

  @property
  def replica_id(self):
    return self._replica_id

  @property
  def num_replicas(self):
    return self._num_replicas

  @property
  def num_ps_tasks(self):
    return self._num_ps_tasks

  @property
  def ps_device(self):
    return self._ps_device

  @property
  def worker_device(self):
    return self._worker_device

  def caching_device(self):
    """Returns the device to use for caching variables.

    Variables are cached on the worker CPU when using replicas.

    Returns:
      A callable mapping an op to its own device when parameter servers are
      used, or None if the variables do not need to be cached.
    """
    if self._num_ps_tasks > 0:
      return lambda op: op.device
    else:
      return None

  def clone_device(self, clone_index):
    """Device used to create the clone and all the ops inside the clone.

    Args:
      clone_index: Int, representing the clone_index.

    Returns:
      A value suitable for `tf.device()`.

    Raises:
      ValueError: if `clone_index` is greater or equal to the number of clones.
    """
    if clone_index >= self._num_clones:
      raise ValueError('clone_index must be less than num_clones')
    device = ''
    if self._num_ps_tasks > 0:
      device += self._worker_device
    if self._clone_on_cpu:
      device += '/cpu:0'
    else:
      # With a single clone no GPU is pinned.
      if self._num_clones > 1:
        device += '/gpu:%d' % clone_index
    return device

  def clone_scope(self, clone_index):
    """Name scope to create the clone.

    Args:
      clone_index: Int, representing the clone_index.

    Returns:
      A name_scope suitable for `tf.name_scope()`.

    Raises:
      ValueError: if `clone_index` is greater or equal to the number of clones.
    """
    if clone_index >= self._num_clones:
      raise ValueError('clone_index must be less than num_clones')
    scope = ''
    if self._num_clones > 1:
      scope = 'clone_%d' % clone_index
    return scope

  def optimizer_device(self):
    """Device to use with the optimizer.

    Returns:
      A value suitable for `tf.device()`.
    """
    if self._num_ps_tasks > 0 or self._num_clones > 0:
      return self._worker_device + '/cpu:0'
    else:
      return ''

  def inputs_device(self):
    """Device to use to build the inputs.

    Returns:
      A value suitable for `tf.device()`.
    """
    device = ''
    if self._num_ps_tasks > 0:
      device += self._worker_device
    device += '/cpu:0'
    return device

  def variables_device(self):
    """Returns the device to use for variables created inside the clone.

    Returns:
      A value suitable for `tf.device()`: a device string when no parameter
      servers are used, otherwise a callable that assigns variable ops to the
      ps tasks in round-robin order.
    """
    device = ''
    if self._num_ps_tasks > 0:
      device += self._ps_device
    device += '/cpu:0'

    class _PSDeviceChooser(object):
      """Slim device chooser for variables when using PS."""

      def __init__(self, device, tasks):
        self._device = device
        self._tasks = tasks
        self._task = 0

      def choose(self, op):
        # An explicitly placed op keeps its device.
        if op.device:
          return op.device
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        # Prefix match so that every variable op type ('Variable',
        # 'VariableV2', ...) is distributed, not just the original one.
        if node_def.op.startswith('Variable'):
          t = self._task
          self._task = (self._task + 1) % self._tasks
          d = '%s/task:%d' % (self._device, t)
          return d
        else:
          return op.device

    if not self._num_ps_tasks:
      return device
    else:
      chooser = _PSDeviceChooser(device, self._num_ps_tasks)
      return chooser.choose
class DeploymentConfigTest(tf.test.TestCase):
  """Checks the devices and scopes reported by `DeploymentConfig`."""

  def testDefaults(self):
    deploy_config = model_deploy.DeploymentConfig()

    self.assertEqual(slim.get_variables(), [])
    self.assertEqual(deploy_config.caching_device(), None)
    self.assertDeviceEqual(deploy_config.clone_device(0), '')
    self.assertEqual(deploy_config.clone_scope(0), '')
    self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')

  def testCPUonly(self):
    deploy_config = model_deploy.DeploymentConfig(clone_on_cpu=True)

    self.assertEqual(deploy_config.caching_device(), None)
    self.assertDeviceEqual(deploy_config.clone_device(0), 'CPU:0')
    self.assertEqual(deploy_config.clone_scope(0), '')
    self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')

  def testMultiGPU(self):
    deploy_config = model_deploy.DeploymentConfig(num_clones=2)

    self.assertEqual(deploy_config.caching_device(), None)
    self.assertDeviceEqual(deploy_config.clone_device(0), 'GPU:0')
    self.assertDeviceEqual(deploy_config.clone_device(1), 'GPU:1')
    self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
    self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
    self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
    self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')

  def testPS(self):
    deploy_config = model_deploy.DeploymentConfig(num_clones=1, num_ps_tasks=1)

    self.assertDeviceEqual(deploy_config.clone_device(0),
                           '/job:worker')
    self.assertEqual(deploy_config.clone_scope(0), '')
    self.assertDeviceEqual(deploy_config.optimizer_device(),
                           '/job:worker/device:CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(),
                           '/job:worker/device:CPU:0')
    with tf.device(deploy_config.variables_device()):
      a = tf.Variable(0)
      b = tf.Variable(0)
      c = tf.no_op()
      d = slim.variable('a', [],
                        caching_device=deploy_config.caching_device())
    self.assertDeviceEqual(a.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(a.device, a.value().device)
    self.assertDeviceEqual(b.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(b.device, b.value().device)
    self.assertDeviceEqual(c.device, '')
    self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(d.value().device, '')

  def testMultiGPUPS(self):
    deploy_config = model_deploy.DeploymentConfig(num_clones=2, num_ps_tasks=1)

    self.assertEqual(deploy_config.caching_device()(tf.no_op()), '')
    self.assertDeviceEqual(deploy_config.clone_device(0),
                           '/job:worker/device:GPU:0')
    self.assertDeviceEqual(deploy_config.clone_device(1),
                           '/job:worker/device:GPU:1')
    self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
    self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
    self.assertDeviceEqual(deploy_config.optimizer_device(),
                           '/job:worker/device:CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(),
                           '/job:worker/device:CPU:0')

  def testReplicasPS(self):
    deploy_config = model_deploy.DeploymentConfig(num_replicas=2,
                                                  num_ps_tasks=2)

    self.assertDeviceEqual(deploy_config.clone_device(0),
                           '/job:worker')
    self.assertEqual(deploy_config.clone_scope(0), '')
    self.assertDeviceEqual(deploy_config.optimizer_device(),
                           '/job:worker/device:CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(),
                           '/job:worker/device:CPU:0')

  def testReplicasMultiGPUPS(self):
    deploy_config = model_deploy.DeploymentConfig(num_replicas=2,
                                                  num_clones=2,
                                                  num_ps_tasks=2)
    self.assertDeviceEqual(deploy_config.clone_device(0),
                           '/job:worker/device:GPU:0')
    self.assertDeviceEqual(deploy_config.clone_device(1),
                           '/job:worker/device:GPU:1')
    self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
    self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
    self.assertDeviceEqual(deploy_config.optimizer_device(),
                           '/job:worker/device:CPU:0')
    self.assertDeviceEqual(deploy_config.inputs_device(),
                           '/job:worker/device:CPU:0')

  def testVariablesPS(self):
    deploy_config = model_deploy.DeploymentConfig(num_ps_tasks=2)

    with tf.device(deploy_config.variables_device()):
      a = tf.Variable(0)
      b = tf.Variable(0)
      c = tf.no_op()
      d = slim.variable('a', [],
                        caching_device=deploy_config.caching_device())

    # The two variables are expected on alternating PS tasks.
    self.assertDeviceEqual(a.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(a.device, a.value().device)
    self.assertDeviceEqual(b.device, '/job:ps/task:1/device:CPU:0')
    self.assertDeviceEqual(b.device, b.value().device)
    self.assertDeviceEqual(c.device, '')
    self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(d.value().device, '')


def LogisticClassifier(inputs, labels, scope=None, reuse=None):
  """Builds a one-unit sigmoid classifier and registers a log loss for it."""
  with tf.variable_scope(scope, 'LogisticClassifier', [inputs, labels],
                         reuse=reuse):
    predictions = slim.fully_connected(inputs, 1, activation_fn=tf.sigmoid,
                                       scope='fully_connected')
    slim.losses.log_loss(predictions, labels)
    return predictions


def BatchNormClassifier(inputs, labels, scope=None, reuse=None):
  """Same as `LogisticClassifier` but batch-normalizes the inputs first."""
  with tf.variable_scope(scope, 'BatchNormClassifier', [inputs, labels],
                         reuse=reuse):
    inputs = slim.batch_norm(inputs, decay=0.1)
    predictions = slim.fully_connected(inputs, 1, activation_fn=tf.sigmoid,
                                       scope='fully_connected')
    slim.losses.log_loss(predictions, labels)
    return predictions


def _make_easy_training_set():
  """Creates the small, deterministic training set shared by the test cases.

  NumPy's global RNG is seeded with 0, 16 binary labels are drawn and each
  example gets a one-hot input whose hot column is `2 * label + {0, 1}`.

  Returns:
    A tuple `(inputs, labels)`: a (16, 4) array of one-hot rows and a (16, 1)
    float32 array of labels.
  """
  np.random.seed(0)
  inputs = np.zeros((16, 4))
  labels = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)
  for i in range(16):
    # Index the scalar explicitly: calling int() on a 1-element array is
    # deprecated in recent NumPy releases.
    j = int(2 * labels[i, 0] + np.random.randint(0, 2))
    inputs[i, j] = 1
  return inputs, labels


class CreatecloneTest(tf.test.TestCase):
  """Tests for `model_deploy.create_clones`."""

  def setUp(self):
    # Create an easy training set:
    self._inputs, self._labels = _make_easy_training_set()
    self._logdir = self.get_temp_dir()

  def testCreateLogisticClassifier(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = LogisticClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      clone = clones[0]
      self.assertEqual(len(slim.get_variables()), 2)
      for v in slim.get_variables():
        self.assertDeviceEqual(v.device, 'CPU:0')
        self.assertDeviceEqual(v.value().device, 'CPU:0')
      self.assertEqual(clone.outputs.op.name,
                       'LogisticClassifier/fully_connected/Sigmoid')
      self.assertEqual(clone.scope, '')
      self.assertDeviceEqual(clone.device, '')
      self.assertEqual(len(slim.losses.get_losses()), 1)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(update_ops, [])

  def testCreateSingleclone(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      clone = clones[0]
      self.assertEqual(len(slim.get_variables()), 5)
      for v in slim.get_variables():
        self.assertDeviceEqual(v.device, 'CPU:0')
        self.assertDeviceEqual(v.value().device, 'CPU:0')
      self.assertEqual(clone.outputs.op.name,
                       'BatchNormClassifier/fully_connected/Sigmoid')
      self.assertEqual(clone.scope, '')
      self.assertDeviceEqual(clone.device, '')
      self.assertEqual(len(slim.losses.get_losses()), 1)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), 2)

  def testCreateMulticlone(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      num_clones = 4
      deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      # Variables are shared between the clones, hence still only five.
      self.assertEqual(len(slim.get_variables()), 5)
      for v in slim.get_variables():
        self.assertDeviceEqual(v.device, 'CPU:0')
        self.assertDeviceEqual(v.value().device, 'CPU:0')
      self.assertEqual(len(clones), num_clones)
      for i, clone in enumerate(clones):
        self.assertEqual(
            clone.outputs.op.name,
            'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
        self.assertEqual(len(update_ops), 2)
        self.assertEqual(clone.scope, 'clone_%d/' % i)
        self.assertDeviceEqual(clone.device, 'GPU:%d' % i)

  def testCreateOnecloneWithPS(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                    num_ps_tasks=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      self.assertEqual(len(clones), 1)
      clone = clones[0]
      self.assertEqual(clone.outputs.op.name,
                       'BatchNormClassifier/fully_connected/Sigmoid')
      self.assertDeviceEqual(clone.device, '/job:worker')
      self.assertEqual(clone.scope, '')
      self.assertEqual(len(slim.get_variables()), 5)
      for v in slim.get_variables():
        self.assertDeviceEqual(v.device, '/job:ps/task:0/CPU:0')
        self.assertDeviceEqual(v.device, v.value().device)

  def testCreateMulticloneWithPS(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=2,
                                                    num_ps_tasks=2)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      self.assertEqual(len(slim.get_variables()), 5)
      for i, v in enumerate(slim.get_variables()):
        # Variables alternate between the two PS tasks.
        t = i % 2
        self.assertDeviceEqual(v.device, '/job:ps/task:%d/device:CPU:0' % t)
        self.assertDeviceEqual(v.device, v.value().device)
      self.assertEqual(len(clones), 2)
      for i, clone in enumerate(clones):
        self.assertEqual(
            clone.outputs.op.name,
            'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
        self.assertEqual(clone.scope, 'clone_%d/' % i)
        self.assertDeviceEqual(clone.device, '/job:worker/device:GPU:%d' % i)


class OptimizeclonesTest(tf.test.TestCase):
  """Tests for `model_deploy.optimize_clones`."""

  def setUp(self):
    # Create an easy training set:
    self._inputs, self._labels = _make_easy_training_set()
    self._logdir = self.get_temp_dir()

  def testCreateLogisticClassifier(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = LogisticClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      self.assertEqual(len(slim.get_variables()), 2)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(update_ops, [])

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                                optimizer)
      self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
      self.assertEqual(total_loss.op.name, 'total_loss')
      for grad, var in grads_and_vars:
        self.assertDeviceEqual(grad.device, '')
        self.assertDeviceEqual(var.device, 'CPU:0')

  def testCreateSingleclone(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      self.assertEqual(len(slim.get_variables()), 5)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), 2)

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                                optimizer)
      self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
      self.assertEqual(total_loss.op.name, 'total_loss')
      for grad, var in grads_and_vars:
        self.assertDeviceEqual(grad.device, '')
        self.assertDeviceEqual(var.device, 'CPU:0')

  def testCreateMulticlone(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      clone_args = (tf_inputs, tf_labels)
      num_clones = 4
      deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
      self.assertEqual(len(slim.get_variables()), 5)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), num_clones * 2)

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                                optimizer)
      self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
      self.assertEqual(total_loss.op.name, 'total_loss')
      for grad, var in grads_and_vars:
        self.assertDeviceEqual(grad.device, '')
        self.assertDeviceEqual(var.device, 'CPU:0')

  def testCreateMulticloneCPU(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      model_args = (tf_inputs, tf_labels)
      num_clones = 4
      deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones,
                                                    clone_on_cpu=True)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, model_args)
      self.assertEqual(len(slim.get_variables()), 5)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), num_clones * 2)

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                                optimizer)
      self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
      self.assertEqual(total_loss.op.name, 'total_loss')
      for grad, var in grads_and_vars:
        self.assertDeviceEqual(grad.device, '')
        self.assertDeviceEqual(var.device, 'CPU:0')

  def testCreateOnecloneWithPS(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      model_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                    num_ps_tasks=1)

      self.assertEqual(slim.get_variables(), [])
      clones = model_deploy.create_clones(deploy_config, model_fn, model_args)
      self.assertEqual(len(slim.get_variables()), 5)
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), 2)

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
                                                                optimizer)
      self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
      self.assertEqual(total_loss.op.name, 'total_loss')
      for grad, var in grads_and_vars:
        self.assertDeviceEqual(grad.device, '/job:worker')
        self.assertDeviceEqual(var.device, '/job:ps/task:0/CPU:0')


class DeployTest(tf.test.TestCase):
  """Tests for `model_deploy.deploy`."""

  def setUp(self):
    # Create an easy training set:
    self._inputs, self._labels = _make_easy_training_set()
    self._logdir = self.get_temp_dir()

  def testLocalTrainOp(self):
    g = tf.Graph()
    with g.as_default():
      tf.set_random_seed(0)
      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
      tf_labels = tf.constant(self._labels, dtype=tf.float32)

      model_fn = BatchNormClassifier
      model_args = (tf_inputs, tf_labels)
      deploy_config = model_deploy.DeploymentConfig(num_clones=2,
                                                    clone_on_cpu=True)

      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)

      self.assertEqual(slim.get_variables(), [])
      model = model_deploy.deploy(deploy_config, model_fn, model_args,
                                  optimizer=optimizer)

      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      self.assertEqual(len(update_ops), 4)
      self.assertEqual(len(model.clones), 2)
      self.assertEqual(model.total_loss.op.name, 'total_loss')
      self.assertEqual(model.summary_op.op.name, 'summary_op/summary_op')
      self.assertEqual(model.train_op.op.name, 'train_op')

      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        moving_mean = tf.contrib.framework.get_variables_by_name(
            'moving_mean')[0]
        moving_variance = tf.contrib.framework.get_variables_by_name(
            'moving_variance')[0]
        initial_loss = sess.run(model.total_loss)
        initial_mean, initial_variance = sess.run([moving_mean,
                                                   moving_variance])
        self.assertAllClose(initial_mean, [0.0, 0.0, 0.0, 0.0])
        self.assertAllClose(initial_variance, [1.0, 1.0, 1.0, 1.0])
        for _ in range(10):
          sess.run(model.train_op)
        final_loss = sess.run(model.total_loss)
        self.assertLess(final_loss, initial_loss / 10.0)

        final_mean, final_variance = sess.run([moving_mean,
                                               moving_variance])
        self.assertAllClose(final_mean, [0.125, 0.25, 0.375, 0.25])
        self.assertAllClose(final_variance, [0.109375, 0.1875,
                                             0.234375, 0.1875])

  def testNoSummariesOnGPU(self):
    with tf.Graph().as_default():
      deploy_config = model_deploy.DeploymentConfig(num_clones=2)

      # clone function creates a fully_connected layer with a regularizer loss.
      def ModelFn():
        inputs = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
        reg = tf.contrib.layers.l2_regularizer(0.001)
        tf.contrib.layers.fully_connected(inputs, 30, weights_regularizer=reg)

      model = model_deploy.deploy(
          deploy_config, ModelFn,
          optimizer=tf.train.GradientDescentOptimizer(1.0))
      # The model summary op should have a few summary inputs and all of them
      # should be on the CPU.
      self.assertTrue(model.summary_op.op.inputs)
      for inp in model.summary_op.op.inputs:
        self.assertEqual('/device:CPU:0', inp.device)

  def testNoSummariesOnGPUForEvals(self):
    with tf.Graph().as_default():
      deploy_config = model_deploy.DeploymentConfig(num_clones=2)

      # clone function creates a fully_connected layer with a regularizer loss.
      def ModelFn():
        inputs = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
        reg = tf.contrib.layers.l2_regularizer(0.001)
        tf.contrib.layers.fully_connected(inputs, 30, weights_regularizer=reg)

      # No optimizer here, it's an eval.
      model = model_deploy.deploy(deploy_config, ModelFn)
      # The model summary op should have a few summary inputs and all of them
      # should be on the CPU.
      self.assertTrue(model.summary_op.op.inputs)
      for inp in model.summary_op.op.inputs:
        self.assertEqual('/device:CPU:0', inp.device)


if __name__ == '__main__':
  tf.test.main()
def trunc_normal(stddev):
  """Returns a truncated-normal initializer with mean 0.0 and `stddev`."""
  return tf.truncated_normal_initializer(0.0, stddev)


def alexnet_v2_arg_scope(weight_decay=0.0005):
  """Builds the default argument scope for `alexnet_v2`.

  Args:
    weight_decay: factor of the L2 weight regularizer applied to the conv2d
      and fully_connected layers.

  Returns:
    An arg_scope in which conv2d / fully_connected use ReLU, biases
    initialized to 0.1 and L2 regularization, conv2d uses 'SAME' padding and
    max_pool2d uses 'VALID' padding.
  """
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      activation_fn=tf.nn.relu,
                      biases_initializer=tf.constant_initializer(0.1),
                      weights_regularizer=slim.l2_regularizer(weight_decay)), \
       slim.arg_scope([slim.conv2d], padding='SAME'), \
       slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
    return arg_sc


def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='alexnet_v2'):
  """AlexNet version 2.

  Described in: http://arxiv.org/pdf/1404.5997v2.pdf
  Parameters from:
  github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
  layers-imagenet-1gpu.cfg

  Note: every fully connected layer is expressed as a conv2d layer.  For
        classification, resize the input to 224x224.  For fully convolutional
        use, set spatial_squeeze to false.
        The LRN layers are omitted.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    A tuple `(net, end_points)`: the output of the final 'fc8' layer (squeezed
    over the two spatial axes when `spatial_squeeze` is set) and a dict of the
    collected layer outputs.
  """
  with tf.variable_scope(scope, 'alexnet_v2', [inputs]) as var_scope:
    collection_name = var_scope.name + '_end_points'
    collected_ops = [slim.conv2d, slim.fully_connected, slim.max_pool2d]
    # Every conv / fc / pool output is recorded in `collection_name`.
    with slim.arg_scope(collected_ops, outputs_collections=[collection_name]):
      net = slim.conv2d(inputs, 64, [11, 11], stride=4, padding='VALID',
                        scope='conv1')
      net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
      net = slim.conv2d(net, 192, [5, 5], scope='conv2')
      net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool2')
      net = slim.conv2d(net, 384, [3, 3], scope='conv3')
      net = slim.conv2d(net, 384, [3, 3], scope='conv4')
      net = slim.conv2d(net, 256, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool5')

      # The classifier head: convolutions standing in for the dense layers.
      with slim.arg_scope([slim.conv2d],
                          weights_initializer=trunc_normal(0.005),
                          biases_initializer=tf.constant_initializer(0.1)):
        net = slim.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout6')
        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout7')
        net = slim.conv2d(net, num_classes, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          biases_initializer=tf.zeros_initializer,
                          scope='fc8')

      # Turn the collection of recorded outputs into a dictionary.
      end_points = slim.utils.convert_collection_to_dict(collection_name)
      if spatial_squeeze:
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[var_scope.name + '/fc8'] = net
      return net, end_points


alexnet_v2.default_image_size = 224
class AlexnetV2Test(tf.test.TestCase):
  """Shape, naming and smoke tests for `alexnet.alexnet_v2`."""

  def testBuild(self):
    """Classification mode yields squeezed [batch, classes] logits."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = alexnet.alexnet_v2(inputs, num_classes)
      self.assertEqual(logits.op.name, 'alexnet_v2/fc8/squeezed')
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])

  def testFullyConvolutional(self):
    """Without squeezing, a 300x400 input gives a 4x7 grid of logits."""
    batch_size = 1
    height, width = 300, 400
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = alexnet.alexnet_v2(inputs, num_classes,
                                     spatial_squeeze=False)
      self.assertEqual(logits.op.name, 'alexnet_v2/fc8/BiasAdd')
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, 4, 7, num_classes])

  def testEndPoints(self):
    """The end_points dict exposes exactly the expected layer names."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      _, end_points = alexnet.alexnet_v2(inputs, num_classes)
      expected_names = ['alexnet_v2/conv1',
                        'alexnet_v2/pool1',
                        'alexnet_v2/conv2',
                        'alexnet_v2/pool2',
                        'alexnet_v2/conv3',
                        'alexnet_v2/conv4',
                        'alexnet_v2/conv5',
                        'alexnet_v2/pool5',
                        'alexnet_v2/fc6',
                        'alexnet_v2/fc7',
                        'alexnet_v2/fc8'
                       ]
      self.assertSetEqual(set(end_points.keys()), set(expected_names))

  def testModelVariables(self):
    """Each conv / fc layer owns one weights and one biases variable."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      alexnet.alexnet_v2(inputs, num_classes)
      expected_names = ['alexnet_v2/conv1/weights',
                        'alexnet_v2/conv1/biases',
                        'alexnet_v2/conv2/weights',
                        'alexnet_v2/conv2/biases',
                        'alexnet_v2/conv3/weights',
                        'alexnet_v2/conv3/biases',
                        'alexnet_v2/conv4/weights',
                        'alexnet_v2/conv4/biases',
                        'alexnet_v2/conv5/weights',
                        'alexnet_v2/conv5/biases',
                        'alexnet_v2/fc6/weights',
                        'alexnet_v2/fc6/biases',
                        'alexnet_v2/fc7/weights',
                        'alexnet_v2/fc7/biases',
                        'alexnet_v2/fc8/weights',
                        'alexnet_v2/fc8/biases',
                       ]
      model_variables = [v.op.name for v in slim.get_model_variables()]
      self.assertSetEqual(set(model_variables), set(expected_names))

  def testEvaluation(self):
    """Evaluation mode keeps the [batch, classes] logits shape."""
    batch_size = 2
    height, width = 224, 224
    num_classes = 1000
    with self.test_session():
      eval_inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False)
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])
      predictions = tf.argmax(logits, 1)
      self.assertListEqual(predictions.get_shape().as_list(), [batch_size])

  def testTrainEvalWithReuse(self):
    """A second, larger-input network can reuse the training variables."""
    train_batch_size = 2
    eval_batch_size = 1
    train_height, train_width = 224, 224
    eval_height, eval_width = 300, 400
    num_classes = 1000
    with self.test_session():
      train_inputs = tf.random_uniform(
          (train_batch_size, train_height, train_width, 3))
      logits, _ = alexnet.alexnet_v2(train_inputs)
      self.assertListEqual(logits.get_shape().as_list(),
                           [train_batch_size, num_classes])
      tf.get_variable_scope().reuse_variables()
      eval_inputs = tf.random_uniform(
          (eval_batch_size, eval_height, eval_width, 3))
      logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False,
                                     spatial_squeeze=False)
      self.assertListEqual(logits.get_shape().as_list(),
                           [eval_batch_size, 4, 7, num_classes])
      # Average the spatial grid of logits before taking the arg-max.
      logits = tf.reduce_mean(logits, [1, 2])
      predictions = tf.argmax(logits, 1)
      self.assertEqual(predictions.get_shape().as_list(), [eval_batch_size])

  def testForward(self):
    """A forward pass runs and produces at least one non-zero logit."""
    batch_size = 1
    height, width = 224, 224
    with self.test_session() as sess:
      inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = alexnet.alexnet_v2(inputs)
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits)
      self.assertTrue(output.any())
def cifarnet(images, num_classes=10, is_training=False,
             dropout_keep_prob=0.5,
             prediction_fn=slim.softmax,
             scope='CifarNet'):
  """Creates a variant of the CifarNet model.

  Note that since the output is a set of 'logits', the values fall in the
  interval of (-infinity, infinity). Consequently, to convert the outputs to a
  probability distribution over the classes, one will need to convert them
  using the softmax function:

        logits, end_points = cifarnet.cifarnet(images, is_training=False)
        probabilities = tf.nn.softmax(logits)
        predictions = tf.argmax(logits, 1)

  Args:
    images: A batch of `Tensors` of size [batch_size, height, width, channels].
    num_classes: the number of classes in the dataset.
    is_training: specifies whether or not we're currently training the model.
      This variable will determine the behaviour of the dropout layer.
    dropout_keep_prob: the percentage of activation values that are retained.
    prediction_fn: a function to get predictions out of logits.
    scope: Optional variable_scope.

  Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, `num_classes`]
    end_points: a dictionary from components of the network to the
      corresponding activation.
  """
  end_points = {}

  with tf.variable_scope(scope, 'CifarNet', [images, num_classes]):
    net = slim.conv2d(images, 64, [5, 5], scope='conv1')
    end_points['conv1'] = net
    net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
    end_points['pool1'] = net
    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm1')
    net = slim.conv2d(net, 64, [5, 5], scope='conv2')
    end_points['conv2'] = net
    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm2')
    net = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
    end_points['pool2'] = net
    net = slim.flatten(net)
    end_points['Flatten'] = net
    net = slim.fully_connected(net, 384, scope='fc3')
    end_points['fc3'] = net
    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                       scope='dropout3')
    net = slim.fully_connected(net, 192, scope='fc4')
    end_points['fc4'] = net
    # The final layer is linear (no activation, no weight decay). The bias
    # initializer must be an *instance*: since TensorFlow 1.0
    # `tf.zeros_initializer` is a class that has to be called.
    logits = slim.fully_connected(net, num_classes,
                                  biases_initializer=tf.zeros_initializer(),
                                  weights_initializer=trunc_normal(1/192.0),
                                  weights_regularizer=None,
                                  activation_fn=None,
                                  scope='logits')

    end_points['Logits'] = logits
    end_points['Predictions'] = prediction_fn(logits, scope='Predictions')

  return logits, end_points
cifarnet.default_image_size = 32


def cifarnet_arg_scope(weight_decay=0.004):
  """Defines the default cifarnet argument scope.

  Convolutions get a truncated-normal weight initializer and ReLU; fully
  connected layers additionally get a constant 0.1 bias initializer and L2
  weight regularization.

  Args:
    weight_decay: The weight decay to use for regularizing the fully connected
      layers of the model.

  Returns:
    An `arg_scope` to use for the cifarnet model.
  """
  with slim.arg_scope(
      [slim.conv2d],
      weights_initializer=tf.truncated_normal_initializer(stddev=5e-2),
      activation_fn=tf.nn.relu):
    with slim.arg_scope(
        [slim.fully_connected],
        biases_initializer=tf.constant_initializer(0.1),
        weights_initializer=trunc_normal(0.04),
        weights_regularizer=slim.l2_regularizer(weight_decay),
        activation_fn=tf.nn.relu) as sc:
      return sc
# ============================================================================== """Brings all inception models under one namespace.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=unused-import from nets.inception_resnet_v2 import inception_resnet_v2 from nets.inception_resnet_v2 import inception_resnet_v2_arg_scope from nets.inception_v1 import inception_v1 from nets.inception_v1 import inception_v1_arg_scope from nets.inception_v1 import inception_v1_base from nets.inception_v2 import inception_v2 from nets.inception_v2 import inception_v2_arg_scope from nets.inception_v2 import inception_v2_base from nets.inception_v2_tsn import inception_v2_tsn from nets.inception_v2_tsn import inception_v2_tsn_arg_scope from nets.inception_v2_tsn import inception_v2_tsn_base from nets.inception_v3 import inception_v3 from nets.inception_v3 import inception_v3_arg_scope from nets.inception_v3 import inception_v3_base from nets.inception_v4 import inception_v4 from nets.inception_v4 import inception_v4_arg_scope from nets.inception_v4 import inception_v4_base # pylint: enable=unused-import ================================================ FILE: models/slim/nets/inception_resnet_v2.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
  """Builds the 35x35 resnet block.

  Three convolutional branches are concatenated along the channel axis,
  projected back to the input depth with a linear 1x1 convolution, scaled by
  `scale` and added to the input. `activation_fn`, when given, is applied to
  the sum.
  """
  with tf.variable_scope(scope, 'Block35', [net], reuse=reuse):
    with tf.variable_scope('Branch_0'):
      branch_0 = slim.conv2d(net, 32, 1, scope='Conv2d_1x1')
    with tf.variable_scope('Branch_1'):
      branch_1 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
      branch_1 = slim.conv2d(branch_1, 32, 3, scope='Conv2d_0b_3x3')
    with tf.variable_scope('Branch_2'):
      branch_2 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
      branch_2 = slim.conv2d(branch_2, 48, 3, scope='Conv2d_0b_3x3')
      branch_2 = slim.conv2d(branch_2, 64, 3, scope='Conv2d_0c_3x3')
    branches = [branch_0, branch_1, branch_2]
    merged = tf.concat(axis=3, values=branches)
    input_depth = net.get_shape()[3]
    # Linear projection back to the input depth so the residual can be added.
    residual = slim.conv2d(merged, input_depth, 1, normalizer_fn=None,
                           activation_fn=None, scope='Conv2d_1x1')
    net = net + scale * residual
    if activation_fn:
      net = activation_fn(net)
  return net
def inception_resnet_v2(inputs, num_classes=1001, is_training=True,
                        dropout_keep_prob=0.8,
                        reuse=None,
                        scope='InceptionResnetV2'):
  """Creates the Inception Resnet V2 model.

  The network is a convolutional stem, the `Mixed_5b` module, 10 `block35`
  residual blocks, the `Mixed_6a` reduction, 20 `block17` blocks (with an
  auxiliary classifier branching off after them), the `Mixed_7a` reduction,
  10 `block8` blocks (the last one without activation), a 1x1 convolution to
  1536 channels and finally global average pooling, dropout and a fully
  connected logits layer.

  The spatial sizes quoted in the inline comments hold for the default
  299x299 input.

  Args:
    inputs: a 4-D tensor of size [batch_size, height, width, 3].
    num_classes: number of predicted classes.
    is_training: whether is training or not.
    dropout_keep_prob: float, the fraction to keep before final layer.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    logits: the logits outputs of the model.
    end_points: the set of end_points from the inception model. Besides the
      intermediate activations it holds 'AuxLogits', 'PrePool',
      'PreLogitsFlatten', 'Logits' and 'Predictions'.
  """
  end_points = {}

  with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], reuse=reuse):
    # batch_norm and dropout behave differently at train and eval time.
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      # Defaults for every layer below unless overridden at the call site.
      with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                          stride=1, padding='SAME'):

        # 149 x 149 x 32
        net = slim.conv2d(inputs, 32, 3, stride=2, padding='VALID',
                          scope='Conv2d_1a_3x3')
        end_points['Conv2d_1a_3x3'] = net
        # 147 x 147 x 32
        net = slim.conv2d(net, 32, 3, padding='VALID',
                          scope='Conv2d_2a_3x3')
        end_points['Conv2d_2a_3x3'] = net
        # 147 x 147 x 64
        net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
        end_points['Conv2d_2b_3x3'] = net
        # 73 x 73 x 64
        net = slim.max_pool2d(net, 3, stride=2, padding='VALID',
                              scope='MaxPool_3a_3x3')
        end_points['MaxPool_3a_3x3'] = net
        # 73 x 73 x 80
        net = slim.conv2d(net, 80, 1, padding='VALID',
                          scope='Conv2d_3b_1x1')
        end_points['Conv2d_3b_1x1'] = net
        # 71 x 71 x 192
        net = slim.conv2d(net, 192, 3, padding='VALID',
                          scope='Conv2d_4a_3x3')
        end_points['Conv2d_4a_3x3'] = net
        # 35 x 35 x 192
        net = slim.max_pool2d(net, 3, stride=2, padding='VALID',
                              scope='MaxPool_5a_3x3')
        end_points['MaxPool_5a_3x3'] = net

        # 35 x 35 x 320 (96 + 64 + 96 + 64 channels)
        with tf.variable_scope('Mixed_5b'):
          with tf.variable_scope('Branch_0'):
            tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
          with tf.variable_scope('Branch_1'):
            tower_conv1_0 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1')
            tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5,
                                        scope='Conv2d_0b_5x5')
          with tf.variable_scope('Branch_2'):
            tower_conv2_0 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1')
            tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3,
                                        scope='Conv2d_0b_3x3')
            tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3,
                                        scope='Conv2d_0c_3x3')
          with tf.variable_scope('Branch_3'):
            tower_pool = slim.avg_pool2d(net, 3, stride=1, padding='SAME',
                                         scope='AvgPool_0a_3x3')
            tower_pool_1 = slim.conv2d(tower_pool, 64, 1,
                                       scope='Conv2d_0b_1x1')
          net = tf.concat(axis=3, values=[tower_conv, tower_conv1_1,
                              tower_conv2_2, tower_pool_1])

        end_points['Mixed_5b'] = net
        # 10 residual blocks; each keeps the shape of its input.
        net = slim.repeat(net, 10, block35, scale=0.17)

        # 17 x 17 x 1088 (384 + 384 + 320 channels)
        with tf.variable_scope('Mixed_6a'):
          with tf.variable_scope('Branch_0'):
            tower_conv = slim.conv2d(net, 384, 3, stride=2, padding='VALID',
                                     scope='Conv2d_1a_3x3')
          with tf.variable_scope('Branch_1'):
            tower_conv1_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
            tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3,
                                        scope='Conv2d_0b_3x3')
            tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3,
                                        stride=2, padding='VALID',
                                        scope='Conv2d_1a_3x3')
          with tf.variable_scope('Branch_2'):
            tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID',
                                         scope='MaxPool_1a_3x3')
          net = tf.concat(axis=3, values=[tower_conv, tower_conv1_2, tower_pool])

        end_points['Mixed_6a'] = net
        # 20 residual blocks; each keeps the shape of its input.
        net = slim.repeat(net, 20, block17, scale=0.10)

        # Auxiliary tower: a second classifier fed from this intermediate
        # activation. Its logits are only exposed through end_points.
        with tf.variable_scope('AuxLogits'):
          aux = slim.avg_pool2d(net, 5, stride=3, padding='VALID',
                                scope='Conv2d_1a_3x3')
          aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1')
          # The kernel spans the whole remaining feature map.
          aux = slim.conv2d(aux, 768, aux.get_shape()[1:3],
                            padding='VALID', scope='Conv2d_2a_5x5')
          aux = slim.flatten(aux)
          aux = slim.fully_connected(aux, num_classes, activation_fn=None,
                                     scope='Logits')
          end_points['AuxLogits'] = aux

        # Second reduction; the output has 384 + 288 + 320 channels plus the
        # channels of the max-pooled input.
        with tf.variable_scope('Mixed_7a'):
          with tf.variable_scope('Branch_0'):
            tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
            tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
                                       padding='VALID', scope='Conv2d_1a_3x3')
          with tf.variable_scope('Branch_1'):
            tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
            tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2,
                                        padding='VALID', scope='Conv2d_1a_3x3')
          with tf.variable_scope('Branch_2'):
            tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
            tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3,
                                        scope='Conv2d_0b_3x3')
            tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3, stride=2,
                                        padding='VALID', scope='Conv2d_1a_3x3')
          with tf.variable_scope('Branch_3'):
            tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID',
                                         scope='MaxPool_1a_3x3')
          net = tf.concat(axis=3, values=[tower_conv_1, tower_conv1_1,
                              tower_conv2_2, tower_pool])

        end_points['Mixed_7a'] = net

        # 9 activated residual blocks followed by one without activation.
        net = slim.repeat(net, 9, block8, scale=0.20)
        net = block8(net, activation_fn=None)

        net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
        end_points['Conv2d_7b_1x1'] = net

        with tf.variable_scope('Logits'):
          end_points['PrePool'] = net
          # Global average pooling: the kernel equals the spatial size of net,
          # so inputs other than 299x299 also work.
          net = slim.avg_pool2d(net, net.get_shape()[1:3], padding='VALID',
                                scope='AvgPool_1a_8x8')
          net = slim.flatten(net)

          net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                             scope='Dropout')

          end_points['PreLogitsFlatten'] = net
          logits = slim.fully_connected(net, num_classes, activation_fn=None,
                                        scope='Logits')
          end_points['Logits'] = logits
          end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions')

    return logits, end_points
inception_resnet_v2.default_image_size = 299
def inception_resnet_v2_arg_scope(weight_decay=0.00004,
                                  batch_norm_decay=0.9997,
                                  batch_norm_epsilon=0.001):
  """Returns the arg_scope holding the inception_resnet_v2 layer defaults.

  Args:
    weight_decay: L2 regularization strength applied to the weights and biases
      of conv2d and fully_connected layers.
    batch_norm_decay: decay for the batch_norm moving averages.
    batch_norm_epsilon: small float added to variance to avoid dividing by zero.

  Returns:
    an arg_scope with the parameters needed for inception_resnet_v2.
  """
  bn_params = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
  }
  regularized_layers = [slim.conv2d, slim.fully_connected]

  # Outer scope: weight decay for both layer types.
  with slim.arg_scope(
      regularized_layers,
      weights_regularizer=slim.l2_regularizer(weight_decay),
      biases_regularizer=slim.l2_regularizer(weight_decay)):
    # Inner scope: convolutions are followed by batch norm and ReLU.
    with slim.arg_scope(
        [slim.conv2d],
        activation_fn=tf.nn.relu,
        normalizer_fn=slim.batch_norm,
        normalizer_params=bn_params) as scope:
      return scope
class InceptionTest(tf.test.TestCase):
  """Graph-construction and smoke tests for `inception.inception_resnet_v2`."""

  def testBuildLogits(self):
    """Logits live under the 'Logits' scope and are [batch, classes]."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = inception.inception_resnet_v2(inputs, num_classes)
      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])

  def testBuildEndPoints(self):
    """The main, auxiliary and pre-pool end points have the expected shapes."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      _, end_points = inception.inception_resnet_v2(inputs, num_classes)
      self.assertIn('Logits', end_points)
      logits = end_points['Logits']
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])
      self.assertIn('AuxLogits', end_points)
      aux_logits = end_points['AuxLogits']
      self.assertListEqual(aux_logits.get_shape().as_list(),
                           [batch_size, num_classes])
      pre_pool = end_points['PrePool']
      self.assertListEqual(pre_pool.get_shape().as_list(),
                           [batch_size, 8, 8, 1536])

  def testVariablesSetDevice(self):
    """Variables are placed on the device active when the net is built."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      # Force all Variables to reside on the device.
      with tf.variable_scope('on_cpu'), tf.device('/cpu:0'):
        inception.inception_resnet_v2(inputs, num_classes)
      with tf.variable_scope('on_gpu'), tf.device('/gpu:0'):
        inception.inception_resnet_v2(inputs, num_classes)
      # GLOBAL_VARIABLES is the current name of the deprecated VARIABLES
      # collection key.
      for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='on_cpu'):
        self.assertDeviceEqual(v.device, '/cpu:0')
      for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='on_gpu'):
        self.assertDeviceEqual(v.device, '/gpu:0')

  def testHalfSizeImages(self):
    """150x150 inputs still build; the pre-pool map shrinks to 3x3."""
    batch_size = 5
    height, width = 150, 150
    num_classes = 1000
    with self.test_session():
      inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, end_points = inception.inception_resnet_v2(inputs, num_classes)
      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])
      pre_pool = end_points['PrePool']
      self.assertListEqual(pre_pool.get_shape().as_list(),
                           [batch_size, 3, 3, 1536])

  def testUnknownBatchSize(self):
    """A placeholder with an unknown batch dimension can be built and run."""
    batch_size = 1
    height, width = 299, 299
    num_classes = 1000
    with self.test_session() as sess:
      inputs = tf.placeholder(tf.float32, (None, height, width, 3))
      logits, _ = inception.inception_resnet_v2(inputs, num_classes)
      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
      self.assertListEqual(logits.get_shape().as_list(),
                           [None, num_classes])
      images = tf.random_uniform((batch_size, height, width, 3))
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits, {inputs: images.eval()})
      self.assertEqual(output.shape, (batch_size, num_classes))

  def testEvaluation(self):
    """The network runs in inference mode and yields one class per image."""
    batch_size = 2
    height, width = 299, 299
    num_classes = 1000
    with self.test_session() as sess:
      eval_inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = inception.inception_resnet_v2(eval_inputs,
                                                num_classes,
                                                is_training=False)
      predictions = tf.argmax(logits, 1)
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (batch_size,))

  def testTrainEvalWithReuse(self):
    """An eval network can reuse the variables of a training network."""
    train_batch_size = 5
    eval_batch_size = 2
    height, width = 150, 150
    num_classes = 1000
    with self.test_session() as sess:
      train_inputs = tf.random_uniform((train_batch_size, height, width, 3))
      inception.inception_resnet_v2(train_inputs, num_classes)
      eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3))
      logits, _ = inception.inception_resnet_v2(eval_inputs,
                                                num_classes,
                                                is_training=False,
                                                reuse=True)
      predictions = tf.argmax(logits, 1)
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (eval_batch_size,))
def inception_arg_scope(weight_decay=0.00004,
                        use_batch_norm=True,
                        batch_norm_decay=0.9997,
                        batch_norm_epsilon=0.001):
  """Defines the default arg scope for inception models.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    use_batch_norm: If `True`, batch_norm is applied after each convolution.
    batch_norm_decay: Decay for batch norm moving average.
    batch_norm_epsilon: Small float added to variance to avoid dividing by zero
      in batch norm.

  Returns:
    An `arg_scope` to use for the inception models.
  """
  bn_settings = {
      # Decay for the moving averages.
      'decay': batch_norm_decay,
      # epsilon to prevent 0s in variance.
      'epsilon': batch_norm_epsilon,
      # collection containing update_ops.
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
  }
  # Without batch norm the convolutions get no normalizer at all.
  normalizer_fn, normalizer_params = (
      (slim.batch_norm, bn_settings) if use_batch_norm else (None, {}))

  # Weight decay applies to the weights of both Conv and FC layers.
  decayed_layers = [slim.conv2d, slim.fully_connected]
  with slim.arg_scope(decayed_layers,
                      weights_regularizer=slim.l2_regularizer(weight_decay)):
    with slim.arg_scope(
        [slim.conv2d],
        weights_initializer=slim.variance_scaling_initializer(),
        activation_fn=tf.nn.relu,
        normalizer_fn=normalizer_fn,
        normalizer_params=normalizer_params) as sc:
      return sc
# Inception V1 layers in construction order, as (kind, end_point, args):
#   'conv':  (depth, kernel, stride)
#   'pool':  (kernel, stride)
#   'mixed': (branch_0_depth, branch_1_depths, branch_2_depths, branch_3_depth
#             [, branch_2_conv_scope])
# Mixed_5b names its second Branch_2 convolution 'Conv2d_0a_3x3' instead of
# 'Conv2d_0b_3x3'; the name is kept as is because variable names depend on it.
_INCEPTION_V1_LAYERS = (
    ('conv', 'Conv2d_1a_7x7', (64, [7, 7], 2)),
    ('pool', 'MaxPool_2a_3x3', ([3, 3], 2)),
    ('conv', 'Conv2d_2b_1x1', (64, [1, 1], 1)),
    ('conv', 'Conv2d_2c_3x3', (192, [3, 3], 1)),
    ('pool', 'MaxPool_3a_3x3', ([3, 3], 2)),
    ('mixed', 'Mixed_3b', (64, (96, 128), (16, 32), 32)),
    ('mixed', 'Mixed_3c', (128, (128, 192), (32, 96), 64)),
    ('pool', 'MaxPool_4a_3x3', ([3, 3], 2)),
    ('mixed', 'Mixed_4b', (192, (96, 208), (16, 48), 64)),
    ('mixed', 'Mixed_4c', (160, (112, 224), (24, 64), 64)),
    ('mixed', 'Mixed_4d', (128, (128, 256), (24, 64), 64)),
    ('mixed', 'Mixed_4e', (112, (144, 288), (32, 64), 64)),
    ('mixed', 'Mixed_4f', (256, (160, 320), (32, 128), 128)),
    ('pool', 'MaxPool_5a_2x2', ([2, 2], 2)),
    ('mixed', 'Mixed_5b', (256, (160, 320), (32, 128), 128, 'Conv2d_0a_3x3')),
    ('mixed', 'Mixed_5c', (384, (192, 384), (48, 128), 128)),
)


def _inception_v1_module(net, end_point, branch_0_depth, branch_1_depths,
                         branch_2_depths, branch_3_depth,
                         branch_2_conv_scope='Conv2d_0b_3x3'):
  """Builds one four-branch Inception V1 module under scope `end_point`.

  Args:
    net: input tensor of the module.
    end_point: name of the variable scope wrapping the module.
    branch_0_depth: output depth of the 1x1 convolution in Branch_0.
    branch_1_depths: (1x1 depth, 3x3 depth) of the two Branch_1 convolutions.
    branch_2_depths: (1x1 depth, 3x3 depth) of the two Branch_2 convolutions.
    branch_3_depth: output depth of the 1x1 convolution after the Branch_3
      max pooling.
    branch_2_conv_scope: scope name of the second Branch_2 convolution.

  Returns:
    The four branch outputs concatenated along the channel axis.
  """
  with tf.variable_scope(end_point):
    with tf.variable_scope('Branch_0'):
      branch_0 = slim.conv2d(net, branch_0_depth, [1, 1],
                             scope='Conv2d_0a_1x1')
    with tf.variable_scope('Branch_1'):
      branch_1 = slim.conv2d(net, branch_1_depths[0], [1, 1],
                             scope='Conv2d_0a_1x1')
      branch_1 = slim.conv2d(branch_1, branch_1_depths[1], [3, 3],
                             scope='Conv2d_0b_3x3')
    with tf.variable_scope('Branch_2'):
      branch_2 = slim.conv2d(net, branch_2_depths[0], [1, 1],
                             scope='Conv2d_0a_1x1')
      branch_2 = slim.conv2d(branch_2, branch_2_depths[1], [3, 3],
                             scope=branch_2_conv_scope)
    with tf.variable_scope('Branch_3'):
      branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
      branch_3 = slim.conv2d(branch_3, branch_3_depth, [1, 1],
                             scope='Conv2d_0b_1x1')
    # Keyword form: TensorFlow 1.0 swapped the positional order of
    # tf.concat from (axis, values) to (values, axis).
    return tf.concat(axis=3,
                     values=[branch_0, branch_1, branch_2, branch_3])


def inception_v1_base(inputs,
                      final_endpoint='Mixed_5c',
                      scope='InceptionV1'):
  """Defines the Inception V1 base architecture.

  This architecture is defined in:
    Going deeper with convolutions
    Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
    Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
    http://arxiv.org/pdf/1409.4842v1.pdf.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
      'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
      'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
      'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c']
    scope: Optional variable_scope.

  Returns:
    net: the output tensor of the layer named by `final_endpoint`.
    end_points: a dictionary from components of the network to the
      corresponding activation, for every layer up to `final_endpoint`.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values.
  """
  end_points = {}
  with tf.variable_scope(scope, 'InceptionV1', [inputs]):
    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected],
        weights_initializer=trunc_normal(0.01)):
      with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                          stride=1, padding='SAME'):
        net = inputs
        for kind, end_point, args in _INCEPTION_V1_LAYERS:
          if kind == 'conv':
            depth, kernel, stride = args
            net = slim.conv2d(net, depth, kernel, stride=stride,
                              scope=end_point)
          elif kind == 'pool':
            kernel, stride = args
            net = slim.max_pool2d(net, kernel, stride=stride,
                                  scope=end_point)
          else:
            net = _inception_v1_module(net, end_point, *args)
          end_points[end_point] = net
          # Stop as soon as the requested layer has been built.
          if final_endpoint == end_point:
            return net, end_points
    raise ValueError('Unknown final endpoint %s' % final_endpoint)
Returns: logits: the pre-softmax activations, a tensor of size [batch_size, num_classes] end_points: a dictionary from components of the network to the corresponding activation. """ # Final pooling and prediction with tf.variable_scope(scope, 'InceptionV1', [inputs, num_classes], reuse=reuse) as scope: with slim.arg_scope([slim.batch_norm, slim.dropout], is_training=is_training): net, end_points = inception_v1_base(inputs, scope=scope) with tf.variable_scope('Logits'): net = slim.avg_pool2d(net, [7, 7], stride=1, scope='MaxPool_0a_7x7') net = slim.dropout(net, dropout_keep_prob, scope='Dropout_0b') logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='Conv2d_0c_1x1') if spatial_squeeze: logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') end_points['Logits'] = logits end_points['Predictions'] = prediction_fn(logits, scope='Predictions') return logits, end_points inception_v1.default_image_size = 224 inception_v1_arg_scope = inception_utils.inception_arg_scope ================================================ FILE: models/slim/nets/inception_v1_test.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== """Tests for nets.inception_v1.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import tensorflow as tf from nets import inception slim = tf.contrib.slim class InceptionV1Test(tf.test.TestCase): def testBuildClassificationNetwork(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) logits, end_points = inception.inception_v1(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) self.assertTrue('Predictions' in end_points) self.assertListEqual(end_points['Predictions'].get_shape().as_list(), [batch_size, num_classes]) def testBuildBaseNetwork(self): batch_size = 5 height, width = 224, 224 inputs = tf.random_uniform((batch_size, height, width, 3)) mixed_6c, end_points = inception.inception_v1_base(inputs) self.assertTrue(mixed_6c.op.name.startswith('InceptionV1/Mixed_5c')) self.assertListEqual(mixed_6c.get_shape().as_list(), [batch_size, 7, 7, 1024]) expected_endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c'] self.assertItemsEqual(end_points.keys(), expected_endpoints) def testBuildOnlyUptoFinalEndpoint(self): batch_size = 5 height, width = 224, 224 endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c'] for index, endpoint in enumerate(endpoints): with tf.Graph().as_default(): inputs = tf.random_uniform((batch_size, height, width, 3)) out_tensor, 
end_points = inception.inception_v1_base( inputs, final_endpoint=endpoint) self.assertTrue(out_tensor.op.name.startswith( 'InceptionV1/' + endpoint)) self.assertItemsEqual(endpoints[:index+1], end_points) def testBuildAndCheckAllEndPointsUptoMixed5c(self): batch_size = 5 height, width = 224, 224 inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = inception.inception_v1_base(inputs, final_endpoint='Mixed_5c') endpoints_shapes = {'Conv2d_1a_7x7': [5, 112, 112, 64], 'MaxPool_2a_3x3': [5, 56, 56, 64], 'Conv2d_2b_1x1': [5, 56, 56, 64], 'Conv2d_2c_3x3': [5, 56, 56, 192], 'MaxPool_3a_3x3': [5, 28, 28, 192], 'Mixed_3b': [5, 28, 28, 256], 'Mixed_3c': [5, 28, 28, 480], 'MaxPool_4a_3x3': [5, 14, 14, 480], 'Mixed_4b': [5, 14, 14, 512], 'Mixed_4c': [5, 14, 14, 512], 'Mixed_4d': [5, 14, 14, 512], 'Mixed_4e': [5, 14, 14, 528], 'Mixed_4f': [5, 14, 14, 832], 'MaxPool_5a_2x2': [5, 7, 7, 832], 'Mixed_5b': [5, 7, 7, 832], 'Mixed_5c': [5, 7, 7, 1024]} self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys()) for endpoint_name in endpoints_shapes: expected_shape = endpoints_shapes[endpoint_name] self.assertTrue(endpoint_name in end_points) self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), expected_shape) def testModelHasExpectedNumberOfParameters(self): batch_size = 5 height, width = 224, 224 inputs = tf.random_uniform((batch_size, height, width, 3)) with slim.arg_scope(inception.inception_v1_arg_scope()): inception.inception_v1_base(inputs) total_params, _ = slim.model_analyzer.analyze_vars( slim.get_model_variables()) self.assertAlmostEqual(5607184, total_params) def testHalfSizeImages(self): batch_size = 5 height, width = 112, 112 inputs = tf.random_uniform((batch_size, height, width, 3)) mixed_5c, _ = inception.inception_v1_base(inputs) self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c')) self.assertListEqual(mixed_5c.get_shape().as_list(), [batch_size, 4, 4, 1024]) def testUnknownImageShape(self): 
tf.reset_default_graph() batch_size = 2 height, width = 224, 224 num_classes = 1000 input_np = np.random.uniform(0, 1, (batch_size, height, width, 3)) with self.test_session() as sess: inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3)) logits, end_points = inception.inception_v1(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) pre_pool = end_points['Mixed_5c'] feed_dict = {inputs: input_np} tf.global_variables_initializer().run() pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict) self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024]) def testUnknowBatchSize(self): batch_size = 1 height, width = 224, 224 num_classes = 1000 inputs = tf.placeholder(tf.float32, (None, height, width, 3)) logits, _ = inception.inception_v1(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) self.assertListEqual(logits.get_shape().as_list(), [None, num_classes]) images = tf.random_uniform((batch_size, height, width, 3)) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(logits, {inputs: images.eval()}) self.assertEquals(output.shape, (batch_size, num_classes)) def testEvaluation(self): batch_size = 2 height, width = 224, 224 num_classes = 1000 eval_inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = inception.inception_v1(eval_inputs, num_classes, is_training=False) predictions = tf.argmax(logits, 1) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(predictions) self.assertEquals(output.shape, (batch_size,)) def testTrainEvalWithReuse(self): train_batch_size = 5 eval_batch_size = 2 height, width = 224, 224 num_classes = 1000 train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) inception.inception_v1(train_inputs, num_classes) eval_inputs = tf.random_uniform((eval_batch_size, 
height, width, 3)) logits, _ = inception.inception_v1(eval_inputs, num_classes, reuse=True) predictions = tf.argmax(logits, 1) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(predictions) self.assertEquals(output.shape, (eval_batch_size,)) def testLogitsNotSqueezed(self): num_classes = 25 images = tf.random_uniform([1, 224, 224, 3]) logits, _ = inception.inception_v1(images, num_classes=num_classes, spatial_squeeze=False) with self.test_session() as sess: tf.global_variables_initializer().run() logits_out = sess.run(logits) self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) if __name__ == '__main__': tf.test.main() ================================================ FILE: models/slim/nets/inception_v2.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Contains the definition for inception v2 classification network.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from nets import inception_utils slim = tf.contrib.slim trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) def inception_v2_base(inputs, final_endpoint='Mixed_5c', min_depth=16, depth_multiplier=1.0, scope=None): """Inception v2 (6a2). 
Constructs an Inception v2 network from inputs to the given final endpoint. This method can construct the network up to the layer inception(5b) as described in http://arxiv.org/abs/1502.03167. Args: inputs: a tensor of shape [batch_size, height, width, channels]. final_endpoint: specifies the endpoint to construct the network up to. It can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c']. min_depth: Minimum depth value (number of channels) for all convolution ops. Enforced when depth_multiplier < 1, and not an active constraint when depth_multiplier >= 1. depth_multiplier: Float multiplier for the depth (number of channels) for all convolution ops. The value must be greater than zero. Typical usage will be to set this value in (0, 1) to reduce the number of parameters or computation cost of the model. scope: Optional variable_scope. Returns: tensor_out: output tensor corresponding to the final_endpoint. end_points: a set of activations for external use, for example summaries or losses. Raises: ValueError: if final_endpoint is not set to one of the predefined values, or depth_multiplier <= 0 """ # end_points will collect relevant activations for external use, for example # summaries or losses. end_points = {} # Used to find thinned depths for each layer. if depth_multiplier <= 0: raise ValueError('depth_multiplier is not greater than zero.') depth = lambda d: max(int(d * depth_multiplier), min_depth) with tf.variable_scope(scope, 'InceptionV2', [inputs]): with slim.arg_scope( [slim.conv2d, slim.max_pool2d, slim.avg_pool2d, slim.separable_conv2d], stride=1, padding='SAME'): # Note that sizes in the comments below assume an input spatial size of # 224x224, however, the inputs can be of any size greater 32x32. 
# 224 x 224 x 3 end_point = 'Conv2d_1a_7x7' # depthwise_multiplier here is different from depth_multiplier. # depthwise_multiplier determines the output channels of the initial # depthwise conv (see docs for tf.nn.separable_conv2d), while # depth_multiplier controls the # channels of the subsequent 1x1 # convolution. Must have # in_channels * depthwise_multipler <= out_channels # so that the separable convolution is not overparameterized. depthwise_multiplier = min(int(depth(64) / 3), 8) net = slim.separable_conv2d( inputs, depth(64), [7, 7], depth_multiplier=depthwise_multiplier, stride=2, weights_initializer=trunc_normal(1.0), scope=end_point) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 112 x 112 x 64 end_point = 'MaxPool_2a_3x3' net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 56 x 56 x 64 end_point = 'Conv2d_2b_1x1' net = slim.conv2d(net, depth(64), [1, 1], scope=end_point, weights_initializer=trunc_normal(0.1)) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 56 x 56 x 64 end_point = 'Conv2d_2c_3x3' net = slim.conv2d(net, depth(192), [3, 3], scope=end_point) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 56 x 56 x 192 end_point = 'MaxPool_3a_3x3' net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 28 x 28 x 192 # Inception module. 
end_point = 'Mixed_3b' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(64), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(32), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 28 x 28 x 256 end_point = 'Mixed_3c' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(64), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, 
branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 28 x 28 x 320 end_point = 'Mixed_4a' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d( net, depth(128), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_0 = slim.conv2d(branch_0, depth(160), [3, 3], stride=2, scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d( branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3') branch_1 = slim.conv2d( branch_1, depth(96), [3, 3], stride=2, scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d( net, [3, 3], stride=2, scope='MaxPool_1a_3x3') net = tf.concat(3, [branch_0, branch_1, branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 end_point = 'Mixed_4b' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(224), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(64), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d( branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(96), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == 
final_endpoint: return net, end_points # 14 x 14 x 576 end_point = 'Mixed_4c' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(96), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(128), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(96), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 end_point = 'Mixed_4d' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(128), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(160), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(128), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(160), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(160), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(96), [1, 1], weights_initializer=trunc_normal(0.1), 
scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 end_point = 'Mixed_4e' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(96), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(128), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(160), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(192), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(192), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(96), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 14 x 14 x 576 end_point = 'Mixed_5a' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d( net, depth(128), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_0 = slim.conv2d(branch_0, depth(192), [3, 3], stride=2, scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(192), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], scope='Conv2d_0b_3x3') branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], stride=2, scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(net, [3, 3], stride=2, scope='MaxPool_1a_3x3') net = tf.concat(3, [branch_0, branch_1, 
branch_2]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 7 x 7 x 1024 end_point = 'Mixed_5b' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(192), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(160), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d( branch_3, depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points # 7 x 7 x 1024 end_point = 'Mixed_5c' with tf.variable_scope(end_point): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d( net, depth(192), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d( net, depth(192), [1, 1], weights_initializer=trunc_normal(0.09), scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') branch_3 = slim.conv2d( branch_3, 
depth(128), [1, 1], weights_initializer=trunc_normal(0.1), scope='Conv2d_0b_1x1') net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) end_points[end_point] = net if end_point == final_endpoint: return net, end_points raise ValueError('Unknown final endpoint %s' % final_endpoint) def inception_v2(inputs, num_classes=1000, is_training=True, dropout_keep_prob=0.8, min_depth=16, depth_multiplier=1.0, prediction_fn=slim.softmax, spatial_squeeze=True, reuse=None, scope='InceptionV2'): """Inception v2 model for classification. Constructs an Inception v2 network for classification as described in http://arxiv.org/abs/1502.03167. The default image size used to train this network is 224x224. Args: inputs: a tensor of shape [batch_size, height, width, channels]. num_classes: number of predicted classes. is_training: whether is training or not. dropout_keep_prob: the percentage of activation values that are retained. min_depth: Minimum depth value (number of channels) for all convolution ops. Enforced when depth_multiplier < 1, and not an active constraint when depth_multiplier >= 1. depth_multiplier: Float multiplier for the depth (number of channels) for all convolution ops. The value must be greater than zero. Typical usage will be to set this value in (0, 1) to reduce the number of parameters or computation cost of the model. prediction_fn: a function to get predictions out of logits. spatial_squeeze: if True, logits is of shape is [B, C], if false logits is of shape [B, 1, 1, C], where B is batch_size and C is number of classes. reuse: whether or not the network and its variables should be reused. To be able to reuse 'scope' must be given. scope: Optional variable_scope. Returns: logits: the pre-softmax activations, a tensor of size [batch_size, num_classes] end_points: a dictionary from components of the network to the corresponding activation. 
Raises: ValueError: if final_endpoint is not set to one of the predefined values, or depth_multiplier <= 0 """ if depth_multiplier <= 0: raise ValueError('depth_multiplier is not greater than zero.') # Final pooling and prediction with tf.variable_scope(scope, 'InceptionV2', [inputs, num_classes], reuse=reuse) as scope: with slim.arg_scope([slim.batch_norm, slim.dropout], is_training=is_training): net, end_points = inception_v2_base( inputs, scope=scope, min_depth=min_depth, depth_multiplier=depth_multiplier) with tf.variable_scope('Logits'): kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7]) net = slim.avg_pool2d(net, kernel_size, padding='VALID', scope='AvgPool_1a_{}x{}'.format(*kernel_size)) # 1 x 1 x 1024 net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b') logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='Conv2d_1c_1x1') if spatial_squeeze: logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') end_points['Logits'] = logits end_points['Predictions'] = prediction_fn(logits, scope='Predictions') return logits, end_points inception_v2.default_image_size = 224 def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): """Define kernel size which is automatically reduced for small input. If the shape of the input images is unknown at graph construction time this function assumes that the input images are is large enough. Args: input_tensor: input tensor of size [batch_size, height, width, channels]. kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] Returns: a tensor with the kernel size. TODO(jrru): Make this function work with unknown shapes. Theoretically, this can be done with the code below. Problems are two-fold: (1) If the shape was known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot handle tensors that define the kernel size. 
shape = tf.shape(input_tensor) return = tf.pack([tf.minimum(shape[1], kernel_size[0]), tf.minimum(shape[2], kernel_size[1])]) """ shape = input_tensor.get_shape().as_list() if shape[1] is None or shape[2] is None: kernel_size_out = kernel_size else: kernel_size_out = [min(shape[1], kernel_size[0]), min(shape[2], kernel_size[1])] return kernel_size_out inception_v2_arg_scope = inception_utils.inception_arg_scope ================================================ FILE: models/slim/nets/inception_v2_test.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class InceptionV2Test(tf.test.TestCase):
  """Graph-construction and smoke tests for `inception.inception_v2`."""

  def testBuildClassificationNetwork(self):
    """Logits and 'Predictions' both have shape [batch_size, num_classes]."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000

    inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, end_points = inception.inception_v2(inputs, num_classes)
    self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [batch_size, num_classes])
    # assertIn reports the missing key on failure, unlike assertTrue(x in y).
    self.assertIn('Predictions', end_points)
    self.assertListEqual(end_points['Predictions'].get_shape().as_list(),
                         [batch_size, num_classes])

  def testBuildBaseNetwork(self):
    """The base network ends at Mixed_5c and exposes every endpoint."""
    batch_size = 5
    height, width = 224, 224

    inputs = tf.random_uniform((batch_size, height, width, 3))
    mixed_5c, end_points = inception.inception_v2_base(inputs)
    self.assertTrue(mixed_5c.op.name.startswith('InceptionV2/Mixed_5c'))
    self.assertListEqual(mixed_5c.get_shape().as_list(),
                         [batch_size, 7, 7, 1024])
    expected_endpoints = ['Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b',
                          'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a',
                          'Mixed_5b', 'Mixed_5c', 'Conv2d_1a_7x7',
                          'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3',
                          'MaxPool_3a_3x3']
    self.assertItemsEqual(end_points.keys(), expected_endpoints)

  def testBuildOnlyUptoFinalEndpoint(self):
    """Building up to endpoint k registers exactly the first k+1 endpoints."""
    batch_size = 5
    height, width = 224, 224
    endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
                 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
                 'Mixed_4a', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
                 'Mixed_5a', 'Mixed_5b', 'Mixed_5c']
    for index, endpoint in enumerate(endpoints):
      # A fresh graph per iteration keeps variable scopes from colliding.
      with tf.Graph().as_default():
        inputs = tf.random_uniform((batch_size, height, width, 3))
        out_tensor, end_points = inception.inception_v2_base(
            inputs, final_endpoint=endpoint)
        self.assertTrue(out_tensor.op.name.startswith(
            'InceptionV2/' + endpoint))
        self.assertItemsEqual(endpoints[:index+1], end_points)

  def testBuildAndCheckAllEndPointsUptoMixed5c(self):
    """Every endpoint up to Mixed_5c has the expected static shape."""
    batch_size = 5
    height, width = 224, 224

    inputs = tf.random_uniform((batch_size, height, width, 3))
    _, end_points = inception.inception_v2_base(inputs,
                                                final_endpoint='Mixed_5c')
    endpoints_shapes = {'Mixed_3b': [batch_size, 28, 28, 256],
                        'Mixed_3c': [batch_size, 28, 28, 320],
                        'Mixed_4a': [batch_size, 14, 14, 576],
                        'Mixed_4b': [batch_size, 14, 14, 576],
                        'Mixed_4c': [batch_size, 14, 14, 576],
                        'Mixed_4d': [batch_size, 14, 14, 576],
                        'Mixed_4e': [batch_size, 14, 14, 576],
                        'Mixed_5a': [batch_size, 7, 7, 1024],
                        'Mixed_5b': [batch_size, 7, 7, 1024],
                        'Mixed_5c': [batch_size, 7, 7, 1024],
                        'Conv2d_1a_7x7': [batch_size, 112, 112, 64],
                        'MaxPool_2a_3x3': [batch_size, 56, 56, 64],
                        'Conv2d_2b_1x1': [batch_size, 56, 56, 64],
                        'Conv2d_2c_3x3': [batch_size, 56, 56, 192],
                        'MaxPool_3a_3x3': [batch_size, 28, 28, 192]}
    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
    for endpoint_name in endpoints_shapes:
      expected_shape = endpoints_shapes[endpoint_name]
      self.assertIn(endpoint_name, end_points)
      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
                           expected_shape)

  def testModelHasExpectedNumberOfParameters(self):
    """The base network built under the default arg scope has 10173112 params."""
    batch_size = 5
    height, width = 224, 224
    inputs = tf.random_uniform((batch_size, height, width, 3))
    with slim.arg_scope(inception.inception_v2_arg_scope()):
      inception.inception_v2_base(inputs)
    total_params, _ = slim.model_analyzer.analyze_vars(
        slim.get_model_variables())
    # The expected count is an exact integer, so compare for equality.
    self.assertEqual(10173112, total_params)

  def testBuildEndPointsWithDepthMultiplierLessThanOne(self):
    """depth_multiplier=0.5 halves the depth of Mixed*/Conv* endpoints."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000

    inputs = tf.random_uniform((batch_size, height, width, 3))
    _, end_points = inception.inception_v2(inputs, num_classes)

    endpoint_keys = [key for key in end_points.keys()
                     if key.startswith('Mixed') or key.startswith('Conv')]

    _, end_points_with_multiplier = inception.inception_v2(
        inputs, num_classes, scope='depth_multiplied_net',
        depth_multiplier=0.5)

    for key in endpoint_keys:
      original_depth = end_points[key].get_shape().as_list()[3]
      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
      self.assertEqual(0.5 * original_depth, new_depth)

  def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self):
    """depth_multiplier=2.0 doubles the depth of Mixed*/Conv* endpoints."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000

    inputs = tf.random_uniform((batch_size, height, width, 3))
    _, end_points = inception.inception_v2(inputs, num_classes)

    endpoint_keys = [key for key in end_points.keys()
                     if key.startswith('Mixed') or key.startswith('Conv')]

    _, end_points_with_multiplier = inception.inception_v2(
        inputs, num_classes, scope='depth_multiplied_net',
        depth_multiplier=2.0)

    for key in endpoint_keys:
      original_depth = end_points[key].get_shape().as_list()[3]
      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
      self.assertEqual(2.0 * original_depth, new_depth)

  def testRaiseValueErrorWithInvalidDepthMultiplier(self):
    """Negative and zero depth multipliers are rejected with ValueError."""
    batch_size = 5
    height, width = 224, 224
    num_classes = 1000

    inputs = tf.random_uniform((batch_size, height, width, 3))
    with self.assertRaises(ValueError):
      _ = inception.inception_v2(inputs, num_classes, depth_multiplier=-0.1)
    with self.assertRaises(ValueError):
      _ = inception.inception_v2(inputs, num_classes, depth_multiplier=0.0)

  def testHalfSizeImages(self):
    """112x112 inputs still give [batch, classes] logits; Mixed_5c is 4x4."""
    batch_size = 5
    height, width = 112, 112
    num_classes = 1000

    inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, end_points = inception.inception_v2(inputs, num_classes)
    self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [batch_size, num_classes])
    pre_pool = end_points['Mixed_5c']
    self.assertListEqual(pre_pool.get_shape().as_list(),
                         [batch_size, 4, 4, 1024])

  def testUnknownImageShape(self):
    """A placeholder with unknown height/width runs on a 224x224 feed."""
    tf.reset_default_graph()
    batch_size = 2
    height, width = 224, 224
    num_classes = 1000
    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
    with self.test_session() as sess:
      inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3))
      logits, end_points = inception.inception_v2(inputs, num_classes)
      self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])
      pre_pool = end_points['Mixed_5c']
      feed_dict = {inputs: input_np}
      tf.global_variables_initializer().run()
      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024])

  def testUnknowBatchSize(self):
    """A placeholder with unknown batch size runs on a batch of one."""
    batch_size = 1
    height, width = 224, 224
    num_classes = 1000

    inputs = tf.placeholder(tf.float32, (None, height, width, 3))
    logits, _ = inception.inception_v2(inputs, num_classes)
    self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [None, num_classes])
    images = tf.random_uniform((batch_size, height, width, 3))

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits, {inputs: images.eval()})
      # assertEquals is a deprecated alias of assertEqual.
      self.assertEqual(output.shape, (batch_size, num_classes))

  def testEvaluation(self):
    """With is_training=False, argmax over logits yields one id per image."""
    batch_size = 2
    height, width = 224, 224
    num_classes = 1000

    eval_inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, _ = inception.inception_v2(eval_inputs, num_classes,
                                       is_training=False)
    predictions = tf.argmax(logits, 1)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (batch_size,))

  def testTrainEvalWithReuse(self):
    """A second network built with reuse=True runs on a different batch size."""
    train_batch_size = 5
    eval_batch_size = 2
    height, width = 150, 150
    num_classes = 1000

    train_inputs = tf.random_uniform((train_batch_size, height, width, 3))
    inception.inception_v2(train_inputs, num_classes)
    eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3))
    logits, _ = inception.inception_v2(eval_inputs, num_classes, reuse=True)
    predictions = tf.argmax(logits, 1)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (eval_batch_size,))

  def testLogitsNotSqueezed(self):
    """spatial_squeeze=False keeps the logits as [1, 1, 1, num_classes]."""
    num_classes = 25
    images = tf.random_uniform([1, 224, 224, 3])
    logits, _ = inception.inception_v2(images,
                                       num_classes=num_classes,
                                       spatial_squeeze=False)

    with self.test_session() as sess:
      tf.global_variables_initializer().run()
      logits_out = sess.run(logits)
      self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
def inception_v2_tsn_base(inputs,
                          final_endpoint='Mixed_5c',
                          min_depth=16,
                          depth_multiplier=1.0,
                          scope=None,
                          is_training=False,
                          train_top_bn=False):
  """Builds the convolutional trunk of Inception v2 (TSN code).

  Layers are created in a fixed order. After each one the activation is
  stored in `end_points`, and construction stops early when the layer name
  equals `final_endpoint`.

  Args:
    inputs: input tensor handed to the first `conv_set` call.
    final_endpoint: name of the last layer to build. The names compared
      against are 'conv1/7x7_s2', 'pool1/3x3_s2', 'conv2/3x3_reduce',
      'conv2/3x3', 'pool2/3x3_s2' and 'inception_3a' ... 'inception_5b'.
      Any other value (including the default 'Mixed_5c', or None) matches
      nothing, so the whole trunk is built.
    min_depth: accepted for signature compatibility; not used here.
    depth_multiplier: accepted for signature compatibility; not used here.
    scope: optional variable scope; defaults to 'InceptionV2_TSN'.
    is_training: forwarded to batch norm of the first convolution, but only
      when `train_top_bn` is truthy.
    train_top_bn: if truthy, batch norm of the first convolution is created
      trainable and follows `is_training`; otherwise it is created with
      is_training=False and trainable=False.

  Returns:
    A tuple `(net, end_points)`: the output of the last layer built and a
    dict keyed by '<variable scope name>/<layer name>'.
  """
  # (layer name, keyword arguments for inception_module), in build order.
  module_specs = [
      ('inception_3a', {}),
      ('inception_3b', dict(num_outputs=[64,64,96,64,64,96,96])),
      ('inception_3c', dict(small_module=True,
                            num_outputs=[-1,128,160,-1,64,96,96])),
      ('inception_4a', dict(num_outputs=[224,64,96,128,96,128,128])),
      ('inception_4b', dict(num_outputs=[192,96,128,128,96,128,128])),
      ('inception_4c', dict(num_outputs=[160,128,160,128,128,160,160])),
      ('inception_4d', dict(num_outputs=[96,128,192,128,160,192,192])),
      ('inception_4e', dict(small_module=True,
                            num_outputs=[-1,128,192,-1,192,256,256])),
      ('inception_5a', dict(num_outputs=[352,192,320,128,160,224,224])),
      ('inception_5b', dict(num_outputs=[352,192,320,128,192,224,224],
                            force_max_pool=True)),
  ]

  end_points = {}
  with tf.variable_scope(scope, 'InceptionV2_TSN', [inputs]):

    def record(layer_name, tensor):
      # Keys are prefixed with the enclosing variable scope name. The return
      # value tells the caller whether to stop building here.
      end_points[tf.get_variable_scope().name + '/' + layer_name] = tensor
      return layer_name == final_endpoint

    # 224 x 224 x 3
    layer = 'conv1/7x7_s2'
    with tf.variable_scope(layer), slim.arg_scope(
        [slim.batch_norm],
        is_training=is_training if train_top_bn else False,
        trainable=True if train_top_bn else False):
      net = conv_set(inputs, 64, [7, 7], stride=2, padding=3)
    if record(layer, net):
      return net, end_points

    # 112 x 112 x 64
    layer = 'pool1/3x3_s2'
    net = slim.max_pool2d(net, [3, 3], scope=layer, stride=2,
                          padding='SAME')
    if record(layer, net):
      return net, end_points

    # 56 x 56 x 64
    layer = 'conv2/3x3_reduce'
    with tf.variable_scope(layer):
      net = conv_set(net, 64, [1, 1], weight_std=0.1, padding=0)
    if record(layer, net):
      return net, end_points

    layer = 'conv2/3x3'
    with tf.variable_scope(layer):
      net = conv_set(net, 192, [3, 3], weight_std=0.1, padding=1)
    if record(layer, net):
      return net, end_points

    layer = 'pool2/3x3_s2'
    net = slim.max_pool2d(net, [3, 3], scope=layer, stride=2,
                          padding='SAME')
    if record(layer, net):
      return net, end_points

    # Inception modules.
    for layer, module_kwargs in module_specs:
      with tf.variable_scope(layer):
        net = inception_module(net, **module_kwargs)
      if record(layer, net):
        return net, end_points

  return net, end_points
# net = tf.reduce_mean(net, axis=[1,2], keep_dims=True) # 1 x 1 x 1024 logging.info('Using dropout %f' % (1-dropout_keep_prob)) net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_Logits') logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, weights_initializer=random_normal(0.001), biases_initializer=init_ops.zeros_initializer()) if spatial_squeeze: logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') end_points['Logits'] = logits end_points['Predictions'] = prediction_fn(logits, scope='Predictions') return logits, end_points inception_v2_tsn.default_image_size = 224 def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): """Define kernel size which is automatically reduced for small input. If the shape of the input images is unknown at graph construction time this function assumes that the input images are is large enough. Args: input_tensor: input tensor of size [batch_size, height, width, channels]. kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] Returns: a tensor with the kernel size. TODO(jrru): Make this function work with unknown shapes. Theoretically, this can be done with the code below. Problems are two-fold: (1) If the shape was known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot handle tensors that define the kernel size. shape = tf.shape(input_tensor) return = tf.pack([tf.minimum(shape[1], kernel_size[0]), tf.minimum(shape[2], kernel_size[1])]) """ shape = input_tensor.get_shape().as_list() if shape[1] is None or shape[2] is None: kernel_size_out = kernel_size else: kernel_size_out = [min(shape[1], kernel_size[0]), min(shape[2], kernel_size[1])] return kernel_size_out def inception_v2_tsn_arg_scope(weight_decay=0.00004): """Defines the default InceptionV2 arg scope. Args: weight_decay: The weight decay to use for regularizing the model. Returns: An `arg_scope` to use for the inception v3 model. 
""" batch_norm_params = { # Decay for the moving averages. 'decay': 0.9997, # epsilon to prevent 0s in variance. 'epsilon': 0.001, # collection containing update_ops. 'updates_collections': tf.GraphKeys.UPDATE_OPS, # Allow a gamma variable 'scale': True, } # Set weight_decay for weights in Conv and FC layers. with slim.arg_scope([slim.conv2d, slim.fully_connected], weights_regularizer=slim.l2_regularizer(weight_decay)): with slim.arg_scope( [slim.conv2d], weights_initializer=tf.contrib.layers.xavier_initializer(), activation_fn=None, # manually added later, as I need to add BN after # the convolution biases_initializer=init_ops.constant_initializer(value=0.2), normalizer_fn=None) as sc: return sc ================================================ FILE: models/slim/nets/inception_v3.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
def inception_v3_base(inputs,
                      final_endpoint='Mixed_7c',
                      min_depth=16,
                      depth_multiplier=1.0,
                      scope=None):
  """Inception model from http://arxiv.org/abs/1512.00567.

  Constructs an Inception v3 network from inputs to the given final endpoint.
  This method can construct the network up to the final inception block
  Mixed_7c.

  Note that the names of the layers in the paper do not correspond to the names
  of the endpoints registered by this function although they build the same
  network.

  Here is a mapping from the old_names to the new names:
  Old name          | New name
  =======================================
  conv0             | Conv2d_1a_3x3
  conv1             | Conv2d_2a_3x3
  conv2             | Conv2d_2b_3x3
  pool1             | MaxPool_3a_3x3
  conv3             | Conv2d_3b_1x1
  conv4             | Conv2d_4a_3x3
  pool2             | MaxPool_5a_3x3
  mixed_35x35x256a  | Mixed_5b
  mixed_35x35x288a  | Mixed_5c
  mixed_35x35x288b  | Mixed_5d
  mixed_17x17x768a  | Mixed_6a
  mixed_17x17x768b  | Mixed_6b
  mixed_17x17x768c  | Mixed_6c
  mixed_17x17x768d  | Mixed_6d
  mixed_17x17x768e  | Mixed_6e
  mixed_8x8x1280a   | Mixed_7a
  mixed_8x8x2048a   | Mixed_7b
  mixed_8x8x2048b   | Mixed_7c

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
      'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c',
      'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'].
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a dict mapping each endpoint name built so far to its
      activation, for external use, for example summaries or losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
                or depth_multiplier <= 0
  """
  # end_points will collect relevant activations for external use, for example
  # summaries or losses.
  end_points = {}

  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')
  # Scales a nominal channel count by depth_multiplier, floored at min_depth.
  depth = lambda d: max(int(d * depth_multiplier), min_depth)

  with tf.variable_scope(scope, 'InceptionV3', [inputs]):
    # Stem: stride 1 and 'VALID' padding unless a layer overrides them.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='VALID'):
      # 299 x 299 x 3
      end_point = 'Conv2d_1a_3x3'
      net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 149 x 149 x 32
      end_point = 'Conv2d_2a_3x3'
      net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 147 x 147 x 32
      end_point = 'Conv2d_2b_3x3'
      net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 147 x 147 x 64
      end_point = 'MaxPool_3a_3x3'
      net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 73 x 73 x 64
      end_point = 'Conv2d_3b_1x1'
      net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 73 x 73 x 80.
      end_point = 'Conv2d_4a_3x3'
      net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 71 x 71 x 192.
      end_point = 'MaxPool_5a_3x3'
      net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
      # 35 x 35 x 192.

    # Inception blocks: stride 1 and 'SAME' padding by default; the
    # grid-reduction blocks (Mixed_6a, Mixed_7a) override both per layer.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='SAME'):
      # mixed: 35 x 35 x 256.
      end_point = 'Mixed_5b'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
                                 scope='Conv2d_0b_5x5')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0b_3x3')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0c_3x3')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(32), [1, 1],
                                 scope='Conv2d_0b_1x1')
        # Branches are concatenated along axis 3.
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_1: 35 x 35 x 288.
      end_point = 'Mixed_5c'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          # NOTE(review): these two scope names ('Conv2d_0b_1x1',
          # 'Conv_1_0c_5x5') differ from the 'Conv2d_0a_1x1'/'Conv2d_0b_5x5'
          # pattern used in Mixed_5b and Mixed_5d. Variable names derive from
          # scopes, so presumably they must stay as-is to match existing
          # checkpoints -- confirm before renaming.
          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1')
          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
                                 scope='Conv_1_0c_5x5')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(64), [1, 1],
                                 scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0b_3x3')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0c_3x3')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(64), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_2: 35 x 35 x 288.
      end_point = 'Mixed_5d'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
                                 scope='Conv2d_0b_5x5')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0b_3x3')
          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
                                 scope='Conv2d_0c_3x3')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(64), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_3: 17 x 17 x 768.
      end_point = 'Mixed_6a'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          # NOTE(review): scope says '1x1' but the kernel is [3, 3]; same for
          # the last conv of Branch_1 below. Presumably kept for checkpoint
          # compatibility -- confirm before renaming.
          branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2,
                                 padding='VALID', scope='Conv2d_1a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(96), [3, 3],
                                 scope='Conv2d_0b_3x3')
          branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2,
                                 padding='VALID', scope='Conv2d_1a_1x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                     scope='MaxPool_1a_3x3')
        net = tf.concat([branch_0, branch_1, branch_2], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed4: 17 x 17 x 768.
      end_point = 'Mixed_6b'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(128), [1, 7],
                                 scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
                                 scope='Conv2d_0c_7x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(128), [7, 1],
                                 scope='Conv2d_0b_7x1')
          branch_2 = slim.conv2d(branch_2, depth(128), [1, 7],
                                 scope='Conv2d_0c_1x7')
          branch_2 = slim.conv2d(branch_2, depth(128), [7, 1],
                                 scope='Conv2d_0d_7x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
                                 scope='Conv2d_0e_1x7')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_5: 17 x 17 x 768.
      end_point = 'Mixed_6c'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(160), [1, 7],
                                 scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
                                 scope='Conv2d_0c_7x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
                                 scope='Conv2d_0b_7x1')
          branch_2 = slim.conv2d(branch_2, depth(160), [1, 7],
                                 scope='Conv2d_0c_1x7')
          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
                                 scope='Conv2d_0d_7x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
                                 scope='Conv2d_0e_1x7')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_6: 17 x 17 x 768.
      end_point = 'Mixed_6d'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(160), [1, 7],
                                 scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
                                 scope='Conv2d_0c_7x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
                                 scope='Conv2d_0b_7x1')
          branch_2 = slim.conv2d(branch_2, depth(160), [1, 7],
                                 scope='Conv2d_0c_1x7')
          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
                                 scope='Conv2d_0d_7x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
                                 scope='Conv2d_0e_1x7')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_7: 17 x 17 x 768.
      end_point = 'Mixed_6e'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(192), [1, 7],
                                 scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
                                 scope='Conv2d_0c_7x1')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [7, 1],
                                 scope='Conv2d_0b_7x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
                                 scope='Conv2d_0c_1x7')
          branch_2 = slim.conv2d(branch_2, depth(192), [7, 1],
                                 scope='Conv2d_0d_7x1')
          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
                                 scope='Conv2d_0e_1x7')
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
                                 scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_8: 8 x 8 x 1280.
      end_point = 'Mixed_7a'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
          branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2,
                                 padding='VALID', scope='Conv2d_1a_3x3')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, depth(192), [1, 7],
                                 scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
                                 scope='Conv2d_0c_7x1')
          branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2,
                                 padding='VALID', scope='Conv2d_1a_3x3')
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                     scope='MaxPool_1a_3x3')
        net = tf.concat([branch_0, branch_1, branch_2], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_9: 8 x 8 x 2048.
      end_point = 'Mixed_7b'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
          # The 1x3 and 3x1 convolutions both read the same 1x1 output and are
          # concatenated along axis 3.
          # NOTE(review): the 3x1 scope here is 'Conv2d_0b_3x1' whereas
          # Mixed_7c uses 'Conv2d_0c_3x1'; presumably intentional for
          # checkpoint compatibility -- confirm before renaming.
          branch_1 = tf.concat([
              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')], 3)
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(
              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
          branch_2 = tf.concat([
              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')], 3)
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(
              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points

      # mixed_10: 8 x 8 x 2048.
      end_point = 'Mixed_7c'
      with tf.variable_scope(end_point):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = tf.concat([
              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')], 3)
        with tf.variable_scope('Branch_2'):
          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
          branch_2 = slim.conv2d(
              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
          branch_2 = tf.concat([
              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')], 3)
        with tf.variable_scope('Branch_3'):
          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
          branch_3 = slim.conv2d(
              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 3)
      end_points[end_point] = net
      if end_point == final_endpoint: return net, end_points
    # Reached only when final_endpoint matched none of the names above; by
    # this point the whole network has already been added to the graph.
    raise ValueError('Unknown final endpoint %s' % final_endpoint)
num_classes: number of predicted classes. is_training: whether is training or not. dropout_keep_prob: the percentage of activation values that are retained. min_depth: Minimum depth value (number of channels) for all convolution ops. Enforced when depth_multiplier < 1, and not an active constraint when depth_multiplier >= 1. depth_multiplier: Float multiplier for the depth (number of channels) for all convolution ops. The value must be greater than zero. Typical usage will be to set this value in (0, 1) to reduce the number of parameters or computation cost of the model. prediction_fn: a function to get predictions out of logits. spatial_squeeze: if True, logits is of shape is [B, C], if false logits is of shape [B, 1, 1, C], where B is batch_size and C is number of classes. reuse: whether or not the network and its variables should be reused. To be able to reuse 'scope' must be given. scope: Optional variable_scope. Returns: logits: the pre-softmax activations, a tensor of size [batch_size, num_classes] end_points: a dictionary from components of the network to the corresponding activation. Raises: ValueError: if 'depth_multiplier' is less than or equal to zero. 
""" if depth_multiplier <= 0: raise ValueError('depth_multiplier is not greater than zero.') depth = lambda d: max(int(d * depth_multiplier), min_depth) with tf.variable_scope(scope, 'InceptionV3', [inputs, num_classes], reuse=reuse) as scope: with slim.arg_scope([slim.batch_norm, slim.dropout], is_training=is_training): net, end_points = inception_v3_base( inputs, scope=scope, min_depth=min_depth, depth_multiplier=depth_multiplier) # Auxiliary Head logits with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], stride=1, padding='SAME'): aux_logits = end_points['Mixed_6e'] with tf.variable_scope('AuxLogits'): # rgirdhar: for large images, in pose kernel_size = _reduced_kernel_size_for_small_input(net, [30, 30]) aux_logits = slim.avg_pool2d( aux_logits, kernel_size, stride=3, padding='VALID', scope='AvgPool_1a_5x5') aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1], scope='Conv2d_1b_1x1') # Shape of feature map before the final layer. kernel_size = _reduced_kernel_size_for_small_input( aux_logits, [5, 5]) aux_logits = slim.conv2d( aux_logits, depth(768), kernel_size, weights_initializer=trunc_normal(0.01), padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size)) aux_logits = slim.conv2d( aux_logits, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, weights_initializer=trunc_normal(0.001), scope='Conv2d_2b_1x1') if spatial_squeeze: aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze') end_points['AuxLogits'] = aux_logits # Final pooling and prediction with tf.variable_scope('Logits'): # kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8]) # rgirdhar: for large images, in pose kernel_size = _reduced_kernel_size_for_small_input(net, [30, 30]) net = slim.avg_pool2d(net, kernel_size, padding='VALID', scope='AvgPool_1a_{}x{}'.format(*kernel_size)) # 1 x 1 x 2048 net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b') end_points['PreLogits'] = net # 2048 logits = slim.conv2d(net, num_classes, [1, 
1], activation_fn=None, normalizer_fn=None, scope='Conv2d_1c_1x1') if spatial_squeeze: logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') # 1000 end_points['Logits'] = logits end_points['Predictions'] = prediction_fn(logits, scope='Predictions') return logits, end_points inception_v3.default_image_size = 299 def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): """Define kernel size which is automatically reduced for small input. If the shape of the input images is unknown at graph construction time this function assumes that the input images are is large enough. Args: input_tensor: input tensor of size [batch_size, height, width, channels]. kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] Returns: a tensor with the kernel size. TODO(jrru): Make this function work with unknown shapes. Theoretically, this can be done with the code below. Problems are two-fold: (1) If the shape was known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot handle tensors that define the kernel size. shape = tf.shape(input_tensor) return = tf.pack([tf.minimum(shape[1], kernel_size[0]), tf.minimum(shape[2], kernel_size[1])]) """ shape = input_tensor.get_shape().as_list() if shape[1] is None or shape[2] is None: kernel_size_out = kernel_size else: kernel_size_out = [min(shape[1], kernel_size[0]), min(shape[2], kernel_size[1])] return kernel_size_out inception_v3_arg_scope = inception_utils.inception_arg_scope ================================================ FILE: models/slim/nets/inception_v3_test.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Tests for nets.inception_v1.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import tensorflow as tf from nets import inception slim = tf.contrib.slim class InceptionV3Test(tf.test.TestCase): def testBuildClassificationNetwork(self): batch_size = 5 height, width = 299, 299 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) logits, end_points = inception.inception_v3(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) self.assertTrue('Predictions' in end_points) self.assertListEqual(end_points['Predictions'].get_shape().as_list(), [batch_size, num_classes]) def testBuildBaseNetwork(self): batch_size = 5 height, width = 299, 299 inputs = tf.random_uniform((batch_size, height, width, 3)) final_endpoint, end_points = inception.inception_v3_base(inputs) self.assertTrue(final_endpoint.op.name.startswith( 'InceptionV3/Mixed_7c')) self.assertListEqual(final_endpoint.get_shape().as_list(), [batch_size, 8, 8, 2048]) expected_endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'] self.assertItemsEqual(end_points.keys(), expected_endpoints) def 
testBuildOnlyUptoFinalEndpoint(self): batch_size = 5 height, width = 299, 299 endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'] for index, endpoint in enumerate(endpoints): with tf.Graph().as_default(): inputs = tf.random_uniform((batch_size, height, width, 3)) out_tensor, end_points = inception.inception_v3_base( inputs, final_endpoint=endpoint) self.assertTrue(out_tensor.op.name.startswith( 'InceptionV3/' + endpoint)) self.assertItemsEqual(endpoints[:index+1], end_points) def testBuildAndCheckAllEndPointsUptoMixed7c(self): batch_size = 5 height, width = 299, 299 inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = inception.inception_v3_base( inputs, final_endpoint='Mixed_7c') endpoints_shapes = {'Conv2d_1a_3x3': [batch_size, 149, 149, 32], 'Conv2d_2a_3x3': [batch_size, 147, 147, 32], 'Conv2d_2b_3x3': [batch_size, 147, 147, 64], 'MaxPool_3a_3x3': [batch_size, 73, 73, 64], 'Conv2d_3b_1x1': [batch_size, 73, 73, 80], 'Conv2d_4a_3x3': [batch_size, 71, 71, 192], 'MaxPool_5a_3x3': [batch_size, 35, 35, 192], 'Mixed_5b': [batch_size, 35, 35, 256], 'Mixed_5c': [batch_size, 35, 35, 288], 'Mixed_5d': [batch_size, 35, 35, 288], 'Mixed_6a': [batch_size, 17, 17, 768], 'Mixed_6b': [batch_size, 17, 17, 768], 'Mixed_6c': [batch_size, 17, 17, 768], 'Mixed_6d': [batch_size, 17, 17, 768], 'Mixed_6e': [batch_size, 17, 17, 768], 'Mixed_7a': [batch_size, 8, 8, 1280], 'Mixed_7b': [batch_size, 8, 8, 2048], 'Mixed_7c': [batch_size, 8, 8, 2048]} self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys()) for endpoint_name in endpoints_shapes: expected_shape = endpoints_shapes[endpoint_name] self.assertTrue(endpoint_name in end_points) self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), expected_shape) def 
testModelHasExpectedNumberOfParameters(self): batch_size = 5 height, width = 299, 299 inputs = tf.random_uniform((batch_size, height, width, 3)) with slim.arg_scope(inception.inception_v3_arg_scope()): inception.inception_v3_base(inputs) total_params, _ = slim.model_analyzer.analyze_vars( slim.get_model_variables()) self.assertAlmostEqual(21802784, total_params) def testBuildEndPoints(self): batch_size = 5 height, width = 299, 299 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = inception.inception_v3(inputs, num_classes) self.assertTrue('Logits' in end_points) logits = end_points['Logits'] self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) self.assertTrue('AuxLogits' in end_points) aux_logits = end_points['AuxLogits'] self.assertListEqual(aux_logits.get_shape().as_list(), [batch_size, num_classes]) self.assertTrue('Mixed_7c' in end_points) pre_pool = end_points['Mixed_7c'] self.assertListEqual(pre_pool.get_shape().as_list(), [batch_size, 8, 8, 2048]) self.assertTrue('PreLogits' in end_points) pre_logits = end_points['PreLogits'] self.assertListEqual(pre_logits.get_shape().as_list(), [batch_size, 1, 1, 2048]) def testBuildEndPointsWithDepthMultiplierLessThanOne(self): batch_size = 5 height, width = 299, 299 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = inception.inception_v3(inputs, num_classes) endpoint_keys = [key for key in end_points.keys() if key.startswith('Mixed') or key.startswith('Conv')] _, end_points_with_multiplier = inception.inception_v3( inputs, num_classes, scope='depth_multiplied_net', depth_multiplier=0.5) for key in endpoint_keys: original_depth = end_points[key].get_shape().as_list()[3] new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] self.assertEqual(0.5 * original_depth, new_depth) def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self): batch_size = 5 height, width = 299, 299 num_classes = 1000 
inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = inception.inception_v3(inputs, num_classes) endpoint_keys = [key for key in end_points.keys() if key.startswith('Mixed') or key.startswith('Conv')] _, end_points_with_multiplier = inception.inception_v3( inputs, num_classes, scope='depth_multiplied_net', depth_multiplier=2.0) for key in endpoint_keys: original_depth = end_points[key].get_shape().as_list()[3] new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] self.assertEqual(2.0 * original_depth, new_depth) def testRaiseValueErrorWithInvalidDepthMultiplier(self): batch_size = 5 height, width = 299, 299 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) with self.assertRaises(ValueError): _ = inception.inception_v3(inputs, num_classes, depth_multiplier=-0.1) with self.assertRaises(ValueError): _ = inception.inception_v3(inputs, num_classes, depth_multiplier=0.0) def testHalfSizeImages(self): batch_size = 5 height, width = 150, 150 num_classes = 1000 inputs = tf.random_uniform((batch_size, height, width, 3)) logits, end_points = inception.inception_v3(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) pre_pool = end_points['Mixed_7c'] self.assertListEqual(pre_pool.get_shape().as_list(), [batch_size, 3, 3, 2048]) def testUnknownImageShape(self): tf.reset_default_graph() batch_size = 2 height, width = 299, 299 num_classes = 1000 input_np = np.random.uniform(0, 1, (batch_size, height, width, 3)) with self.test_session() as sess: inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3)) logits, end_points = inception.inception_v3(inputs, num_classes) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) pre_pool = end_points['Mixed_7c'] feed_dict = {inputs: input_np} tf.global_variables_initializer().run() pre_pool_out = sess.run(pre_pool, 
feed_dict=feed_dict) self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 8, 2048]) def testUnknowBatchSize(self): batch_size = 1 height, width = 299, 299 num_classes = 1000 inputs = tf.placeholder(tf.float32, (None, height, width, 3)) logits, _ = inception.inception_v3(inputs, num_classes) self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) self.assertListEqual(logits.get_shape().as_list(), [None, num_classes]) images = tf.random_uniform((batch_size, height, width, 3)) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(logits, {inputs: images.eval()}) self.assertEquals(output.shape, (batch_size, num_classes)) def testEvaluation(self): batch_size = 2 height, width = 299, 299 num_classes = 1000 eval_inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = inception.inception_v3(eval_inputs, num_classes, is_training=False) predictions = tf.argmax(logits, 1) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(predictions) self.assertEquals(output.shape, (batch_size,)) def testTrainEvalWithReuse(self): train_batch_size = 5 eval_batch_size = 2 height, width = 150, 150 num_classes = 1000 train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) inception.inception_v3(train_inputs, num_classes) eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) logits, _ = inception.inception_v3(eval_inputs, num_classes, is_training=False, reuse=True) predictions = tf.argmax(logits, 1) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) output = sess.run(predictions) self.assertEquals(output.shape, (eval_batch_size,)) def testLogitsNotSqueezed(self): num_classes = 25 images = tf.random_uniform([1, 299, 299, 3]) logits, _ = inception.inception_v3(images, num_classes=num_classes, spatial_squeeze=False) with self.test_session() as sess: tf.global_variables_initializer().run() logits_out = sess.run(logits) 
self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) if __name__ == '__main__': tf.test.main() ================================================ FILE: models/slim/nets/inception_v4.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Contains the definition of the Inception V4 architecture. As described in http://arxiv.org/abs/1602.07261. 
Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from nets import inception_utils slim = tf.contrib.slim def block_inception_a(inputs, scope=None, reuse=None): """Builds Inception-A block for Inception v4 network.""" # By default use stride=1 and SAME padding with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d], stride=1, padding='SAME'): with tf.variable_scope(scope, 'BlockInceptionA', [inputs], reuse=reuse): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(inputs, 96, [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 64, [1, 1], scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, 96, [3, 3], scope='Conv2d_0b_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(inputs, 64, [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0b_3x3') branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0c_3x3') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 96, [1, 1], scope='Conv2d_0b_1x1') return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) def block_reduction_a(inputs, scope=None, reuse=None): """Builds Reduction-A block for Inception v4 network.""" # By default use stride=1 and SAME padding with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d], stride=1, padding='SAME'): with tf.variable_scope(scope, 'BlockReductionA', [inputs], reuse=reuse): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(inputs, 384, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, 224, 
[3, 3], scope='Conv2d_0b_3x3') branch_1 = slim.conv2d(branch_1, 256, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') return tf.concat(3, [branch_0, branch_1, branch_2]) def block_inception_b(inputs, scope=None, reuse=None): """Builds Inception-B block for Inception v4 network.""" # By default use stride=1 and SAME padding with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d], stride=1, padding='SAME'): with tf.variable_scope(scope, 'BlockInceptionB', [inputs], reuse=reuse): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, 224, [1, 7], scope='Conv2d_0b_1x7') branch_1 = slim.conv2d(branch_1, 256, [7, 1], scope='Conv2d_0c_7x1') with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, 192, [7, 1], scope='Conv2d_0b_7x1') branch_2 = slim.conv2d(branch_2, 224, [1, 7], scope='Conv2d_0c_1x7') branch_2 = slim.conv2d(branch_2, 224, [7, 1], scope='Conv2d_0d_7x1') branch_2 = slim.conv2d(branch_2, 256, [1, 7], scope='Conv2d_0e_1x7') with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) def block_reduction_b(inputs, scope=None, reuse=None): """Builds Reduction-B block for Inception v4 network.""" # By default use stride=1 and SAME padding with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d], stride=1, padding='SAME'): with tf.variable_scope(scope, 'BlockReductionB', [inputs], reuse=reuse): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(inputs, 192, [1, 1], 
scope='Conv2d_0a_1x1') branch_0 = slim.conv2d(branch_0, 192, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 256, [1, 1], scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, 256, [1, 7], scope='Conv2d_0b_1x7') branch_1 = slim.conv2d(branch_1, 320, [7, 1], scope='Conv2d_0c_7x1') branch_1 = slim.conv2d(branch_1, 320, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_2'): branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') return tf.concat(3, [branch_0, branch_1, branch_2]) def block_inception_c(inputs, scope=None, reuse=None): """Builds Inception-C block for Inception v4 network.""" # By default use stride=1 and SAME padding with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d], stride=1, padding='SAME'): with tf.variable_scope(scope, 'BlockInceptionC', [inputs], reuse=reuse): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(inputs, 256, [1, 1], scope='Conv2d_0a_1x1') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1') branch_1 = tf.concat(3, [ slim.conv2d(branch_1, 256, [1, 3], scope='Conv2d_0b_1x3'), slim.conv2d(branch_1, 256, [3, 1], scope='Conv2d_0c_3x1')]) with tf.variable_scope('Branch_2'): branch_2 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1') branch_2 = slim.conv2d(branch_2, 448, [3, 1], scope='Conv2d_0b_3x1') branch_2 = slim.conv2d(branch_2, 512, [1, 3], scope='Conv2d_0c_1x3') branch_2 = tf.concat(3, [ slim.conv2d(branch_2, 256, [1, 3], scope='Conv2d_0d_1x3'), slim.conv2d(branch_2, 256, [3, 1], scope='Conv2d_0e_3x1')]) with tf.variable_scope('Branch_3'): branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3') branch_3 = slim.conv2d(branch_3, 256, [1, 1], scope='Conv2d_0b_1x1') return tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) def inception_v4_base(inputs, final_endpoint='Mixed_7d', 
scope=None): """Creates the Inception V4 network up to the given final endpoint. Args: inputs: a 4-D tensor of size [batch_size, height, width, 3]. final_endpoint: specifies the endpoint to construct the network up to. It can be one of [ 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a', 'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c', 'Mixed_7d'] scope: Optional variable_scope. Returns: logits: the logits outputs of the model. end_points: the set of end_points from the inception model. Raises: ValueError: if final_endpoint is not set to one of the predefined values, """ end_points = {} def add_and_check_final(name, net): end_points[name] = net return name == final_endpoint with tf.variable_scope(scope, 'InceptionV4', [inputs]): with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], stride=1, padding='SAME'): # 299 x 299 x 3 net = slim.conv2d(inputs, 32, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points # 149 x 149 x 32 net = slim.conv2d(net, 32, [3, 3], padding='VALID', scope='Conv2d_2a_3x3') if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points # 147 x 147 x 32 net = slim.conv2d(net, 64, [3, 3], scope='Conv2d_2b_3x3') if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points # 147 x 147 x 64 with tf.variable_scope('Mixed_3a'): with tf.variable_scope('Branch_0'): branch_0 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='MaxPool_0a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(net, 96, [3, 3], stride=2, padding='VALID', scope='Conv2d_0a_3x3') net = tf.concat(3, [branch_0, branch_1]) if add_and_check_final('Mixed_3a', net): return net, end_points # 73 x 73 x 160 with tf.variable_scope('Mixed_4a'): with tf.variable_scope('Branch_0'): branch_0 = 
slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1') branch_0 = slim.conv2d(branch_0, 96, [3, 3], padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1') branch_1 = slim.conv2d(branch_1, 64, [1, 7], scope='Conv2d_0b_1x7') branch_1 = slim.conv2d(branch_1, 64, [7, 1], scope='Conv2d_0c_7x1') branch_1 = slim.conv2d(branch_1, 96, [3, 3], padding='VALID', scope='Conv2d_1a_3x3') net = tf.concat(3, [branch_0, branch_1]) if add_and_check_final('Mixed_4a', net): return net, end_points # 71 x 71 x 192 with tf.variable_scope('Mixed_5a'): with tf.variable_scope('Branch_0'): branch_0 = slim.conv2d(net, 192, [3, 3], stride=2, padding='VALID', scope='Conv2d_1a_3x3') with tf.variable_scope('Branch_1'): branch_1 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='MaxPool_1a_3x3') net = tf.concat(3, [branch_0, branch_1]) if add_and_check_final('Mixed_5a', net): return net, end_points # 35 x 35 x 384 # 4 x Inception-A blocks for idx in xrange(4): block_scope = 'Mixed_5' + chr(ord('b') + idx) net = block_inception_a(net, block_scope) if add_and_check_final(block_scope, net): return net, end_points # 35 x 35 x 384 # Reduction-A block net = block_reduction_a(net, 'Mixed_6a') if add_and_check_final('Mixed_6a', net): return net, end_points # 17 x 17 x 1024 # 7 x Inception-B blocks for idx in xrange(7): block_scope = 'Mixed_6' + chr(ord('b') + idx) net = block_inception_b(net, block_scope) if add_and_check_final(block_scope, net): return net, end_points # 17 x 17 x 1024 # Reduction-B block net = block_reduction_b(net, 'Mixed_7a') if add_and_check_final('Mixed_7a', net): return net, end_points # 8 x 8 x 1536 # 3 x Inception-C blocks for idx in xrange(3): block_scope = 'Mixed_7' + chr(ord('b') + idx) net = block_inception_c(net, block_scope) if add_and_check_final(block_scope, net): return net, end_points raise ValueError('Unknown final endpoint %s' % final_endpoint) def inception_v4(inputs, 
num_classes=1001, is_training=True, dropout_keep_prob=0.8, reuse=None, scope='InceptionV4', create_aux_logits=True): """Creates the Inception V4 model. Args: inputs: a 4-D tensor of size [batch_size, height, width, 3]. num_classes: number of predicted classes. is_training: whether is training or not. dropout_keep_prob: float, the fraction to keep before final layer. reuse: whether or not the network and its variables should be reused. To be able to reuse 'scope' must be given. scope: Optional variable_scope. create_aux_logits: Whether to include the auxilliary logits. Returns: logits: the logits outputs of the model. end_points: the set of end_points from the inception model. """ end_points = {} with tf.variable_scope(scope, 'InceptionV4', [inputs], reuse=reuse) as scope: with slim.arg_scope([slim.batch_norm, slim.dropout], is_training=is_training): net, end_points = inception_v4_base(inputs, scope=scope) with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], stride=1, padding='SAME'): # Auxiliary Head logits if create_aux_logits: with tf.variable_scope('AuxLogits'): # 17 x 17 x 1024 aux_logits = end_points['Mixed_6h'] aux_logits = slim.avg_pool2d(aux_logits, [5, 5], stride=3, padding='VALID', scope='AvgPool_1a_5x5') aux_logits = slim.conv2d(aux_logits, 128, [1, 1], scope='Conv2d_1b_1x1') aux_logits = slim.conv2d(aux_logits, 768, aux_logits.get_shape()[1:3], padding='VALID', scope='Conv2d_2a') aux_logits = slim.flatten(aux_logits) aux_logits = slim.fully_connected(aux_logits, num_classes, activation_fn=None, scope='Aux_logits') end_points['AuxLogits'] = aux_logits # Final pooling and prediction with tf.variable_scope('Logits'): # 8 x 8 x 1536 net = slim.avg_pool2d(net, net.get_shape()[1:3], padding='VALID', scope='AvgPool_1a') # 1 x 1 x 1536 net = slim.dropout(net, dropout_keep_prob, scope='Dropout_1b') net = slim.flatten(net, scope='PreLogitsFlatten') end_points['PreLogitsFlatten'] = net # 1536 logits = slim.fully_connected(net, num_classes, 
activation_fn=None, scope='Logits') end_points['Logits'] = logits end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions') return logits, end_points inception_v4.default_image_size = 299 inception_v4_arg_scope = inception_utils.inception_arg_scope ================================================ FILE: models/slim/nets/inception_v4_test.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
class InceptionTest(tf.test.TestCase):
  """Graph-construction and smoke tests for the slim Inception V4 network."""

  def testBuildLogits(self):
    """Logits, auxiliary logits and predictions have the expected names/shapes."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, end_points = inception.inception_v4(inputs, num_classes)
    auxlogits = end_points['AuxLogits']
    predictions = end_points['Predictions']
    self.assertTrue(auxlogits.op.name.startswith('InceptionV4/AuxLogits'))
    self.assertListEqual(auxlogits.get_shape().as_list(),
                         [batch_size, num_classes])
    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [batch_size, num_classes])
    self.assertTrue(predictions.op.name.startswith(
        'InceptionV4/Logits/Predictions'))
    self.assertListEqual(predictions.get_shape().as_list(),
                         [batch_size, num_classes])

  def testBuildWithoutAuxLogits(self):
    """With create_aux_logits=False no 'AuxLogits' end point is produced."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, endpoints = inception.inception_v4(inputs, num_classes,
                                               create_aux_logits=False)
    self.assertNotIn('AuxLogits', endpoints)
    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [batch_size, num_classes])

  def testAllEndPointsShapes(self):
    """Every end point of the full network has the expected static shape."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    inputs = tf.random_uniform((batch_size, height, width, 3))
    _, end_points = inception.inception_v4(inputs, num_classes)
    endpoints_shapes = {'Conv2d_1a_3x3': [batch_size, 149, 149, 32],
                        'Conv2d_2a_3x3': [batch_size, 147, 147, 32],
                        'Conv2d_2b_3x3': [batch_size, 147, 147, 64],
                        'Mixed_3a': [batch_size, 73, 73, 160],
                        'Mixed_4a': [batch_size, 71, 71, 192],
                        'Mixed_5a': [batch_size, 35, 35, 384],
                        # 4 x Inception-A blocks
                        'Mixed_5b': [batch_size, 35, 35, 384],
                        'Mixed_5c': [batch_size, 35, 35, 384],
                        'Mixed_5d': [batch_size, 35, 35, 384],
                        'Mixed_5e': [batch_size, 35, 35, 384],
                        # Reduction-A block
                        'Mixed_6a': [batch_size, 17, 17, 1024],
                        # 7 x Inception-B blocks
                        'Mixed_6b': [batch_size, 17, 17, 1024],
                        'Mixed_6c': [batch_size, 17, 17, 1024],
                        'Mixed_6d': [batch_size, 17, 17, 1024],
                        'Mixed_6e': [batch_size, 17, 17, 1024],
                        'Mixed_6f': [batch_size, 17, 17, 1024],
                        'Mixed_6g': [batch_size, 17, 17, 1024],
                        'Mixed_6h': [batch_size, 17, 17, 1024],
                        # Reduction-B block (17x17 -> 8x8)
                        'Mixed_7a': [batch_size, 8, 8, 1536],
                        # 3 x Inception-C blocks
                        'Mixed_7b': [batch_size, 8, 8, 1536],
                        'Mixed_7c': [batch_size, 8, 8, 1536],
                        'Mixed_7d': [batch_size, 8, 8, 1536],
                        # Logits and predictions
                        'AuxLogits': [batch_size, num_classes],
                        'PreLogitsFlatten': [batch_size, 1536],
                        'Logits': [batch_size, num_classes],
                        'Predictions': [batch_size, num_classes]}
    # Names are unique on both sides, so a set comparison is equivalent to the
    # Python 2-only assertItemsEqual and also runs on Python 3.
    self.assertSetEqual(set(endpoints_shapes), set(end_points))
    for endpoint_name, expected_shape in endpoints_shapes.items():
      self.assertIn(endpoint_name, end_points)
      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
                           expected_shape)

  def testBuildBaseNetwork(self):
    """The base network ends at Mixed_7d and names its ops after end points."""
    batch_size = 5
    height, width = 299, 299
    inputs = tf.random_uniform((batch_size, height, width, 3))
    net, end_points = inception.inception_v4_base(inputs)
    self.assertTrue(net.op.name.startswith(
        'InceptionV4/Mixed_7d'))
    self.assertListEqual(net.get_shape().as_list(), [batch_size, 8, 8, 1536])
    expected_endpoints = [
        'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a',
        'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
        'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
        'Mixed_6e', 'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a',
        'Mixed_7b', 'Mixed_7c', 'Mixed_7d']
    self.assertSetEqual(set(end_points), set(expected_endpoints))
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2-only.
    for name, op in end_points.items():
      self.assertTrue(op.name.startswith('InceptionV4/' + name))

  def testBuildOnlyUpToFinalEndpoint(self):
    """Stopping at `final_endpoint` yields exactly the end points up to it."""
    batch_size = 5
    height, width = 299, 299
    all_endpoints = [
        'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a',
        'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
        'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
        'Mixed_6e', 'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a',
        'Mixed_7b', 'Mixed_7c', 'Mixed_7d']
    for index, endpoint in enumerate(all_endpoints):
      # A fresh graph per iteration keeps the op names free of suffixes.
      with tf.Graph().as_default():
        inputs = tf.random_uniform((batch_size, height, width, 3))
        out_tensor, end_points = inception.inception_v4_base(
            inputs, final_endpoint=endpoint)
        self.assertTrue(out_tensor.op.name.startswith(
            'InceptionV4/' + endpoint))
        self.assertSetEqual(set(all_endpoints[:index + 1]), set(end_points))

  def testVariablesSetDevice(self):
    """Variables are placed on the device active when the network is built."""
    batch_size = 5
    height, width = 299, 299
    num_classes = 1000
    inputs = tf.random_uniform((batch_size, height, width, 3))
    # Force all Variables to reside on the device.
    with tf.variable_scope('on_cpu'), tf.device('/cpu:0'):
      inception.inception_v4(inputs, num_classes)
    with tf.variable_scope('on_gpu'), tf.device('/gpu:0'):
      inception.inception_v4(inputs, num_classes)
    # GLOBAL_VARIABLES replaces the deprecated GraphKeys.VARIABLES key and
    # matches the tf.global_variables_initializer() calls used below.
    for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'):
      self.assertDeviceEqual(v.device, '/cpu:0')
    for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'):
      self.assertDeviceEqual(v.device, '/gpu:0')

  def testHalfSizeImages(self):
    """150x150 inputs still build; the last mixed block is then 3x3."""
    batch_size = 5
    height, width = 150, 150
    num_classes = 1000
    inputs = tf.random_uniform((batch_size, height, width, 3))
    logits, end_points = inception.inception_v4(inputs, num_classes)
    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [batch_size, num_classes])
    pre_pool = end_points['Mixed_7d']
    self.assertListEqual(pre_pool.get_shape().as_list(),
                         [batch_size, 3, 3, 1536])

  def testUnknownBatchSize(self):
    """A placeholder with an unknown batch dimension can be built and run."""
    batch_size = 1
    height, width = 299, 299
    num_classes = 1000
    with self.test_session() as sess:
      inputs = tf.placeholder(tf.float32, (None, height, width, 3))
      logits, _ = inception.inception_v4(inputs, num_classes)
      self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
      self.assertListEqual(logits.get_shape().as_list(),
                           [None, num_classes])
      images = tf.random_uniform((batch_size, height, width, 3))
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits, {inputs: images.eval()})
      self.assertEqual(output.shape, (batch_size, num_classes))

  def testEvaluation(self):
    """The network runs in inference mode and yields one class per image."""
    batch_size = 2
    height, width = 299, 299
    num_classes = 1000
    with self.test_session() as sess:
      eval_inputs = tf.random_uniform((batch_size, height, width, 3))
      logits, _ = inception.inception_v4(eval_inputs,
                                         num_classes,
                                         is_training=False)
      predictions = tf.argmax(logits, 1)
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (batch_size,))

  def testTrainEvalWithReuse(self):
    """An eval tower can reuse the variables created by a training tower."""
    train_batch_size = 5
    eval_batch_size = 2
    height, width = 150, 150
    num_classes = 1000
    with self.test_session() as sess:
      train_inputs = tf.random_uniform((train_batch_size, height, width, 3))
      inception.inception_v4(train_inputs, num_classes)
      eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3))
      logits, _ = inception.inception_v4(eval_inputs,
                                         num_classes,
                                         is_training=False,
                                         reuse=True)
      predictions = tf.argmax(logits, 1)
      sess.run(tf.global_variables_initializer())
      output = sess.run(predictions)
      self.assertEqual(output.shape, (eval_batch_size,))
def lenet(images, num_classes=10, is_training=False,
          dropout_keep_prob=0.5,
          prediction_fn=slim.softmax,
          scope='LeNet'):
  """Builds a LeNet-style network: two conv/pool stages and two FC layers.

  The returned logits are unnormalized scores. `end_points['Predictions']`
  already holds `prediction_fn(logits)`; to post-process manually:

        logits, end_points = lenet.lenet(images, is_training=False)
        probabilities = tf.nn.softmax(logits)
        predictions = tf.argmax(logits, 1)

  Args:
    images: A batch of `Tensors` of size [batch_size, height, width, channels].
    num_classes: Number of output units of the final layer.
    is_training: Forwarded to the dropout layer (note: defaults to False).
    dropout_keep_prob: Keep probability passed to the dropout layer.
    prediction_fn: Callable applied to the logits to produce
      `end_points['Predictions']`.
    scope: Optional variable_scope; falls back to 'LeNet'.

  Returns:
    logits: The pre-softmax activations, a tensor of size
      [batch_size, `num_classes`].
    end_points: A dict with the keys 'Flatten', 'Logits' and 'Predictions'.
  """
  with tf.variable_scope(scope, 'LeNet', [images, num_classes]):
    conv1 = slim.conv2d(images, 32, [5, 5], scope='conv1')
    pool1 = slim.max_pool2d(conv1, [2, 2], 2, scope='pool1')
    conv2 = slim.conv2d(pool1, 64, [5, 5], scope='conv2')
    pool2 = slim.max_pool2d(conv2, [2, 2], 2, scope='pool2')
    flattened = slim.flatten(pool2)

    fc3 = slim.fully_connected(flattened, 1024, scope='fc3')
    fc3_dropped = slim.dropout(fc3, dropout_keep_prob,
                               is_training=is_training, scope='dropout3')
    logits = slim.fully_connected(fc3_dropped, num_classes,
                                  activation_fn=None, scope='fc4')

  end_points = {
      'Flatten': flattened,
      'Logits': logits,
      'Predictions': prediction_fn(logits, scope='Predictions'),
  }
  return logits, end_points
lenet.default_image_size = 28


def lenet_arg_scope(weight_decay=0.0):
  """Returns the default argument scope for the LeNet model.

  Args:
    weight_decay: L2 regularization coefficient applied to the weights of the
      conv and fully connected layers.

  Returns:
    An `arg_scope` that sets the regularizer, a truncated-normal weight
    initializer (stddev 0.1) and ReLU activation for `slim.conv2d` and
    `slim.fully_connected`.
  """
  layer_defaults = dict(
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
      activation_fn=tf.nn.relu)
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      **layer_defaults) as scope:
    return scope
# ============================================================================== """Contains a factory for building various models.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import functools import tensorflow as tf import numpy as np import sys from nets import alexnet from nets import cifarnet from nets import inception from nets import lenet from nets import overfeat from nets import resnet_v1 from nets import resnet_v2 from nets import vgg sys.path.append('libs/tensorflow_compact_bilinear_pooling/') from compact_bilinear_pooling import compact_bilinear_pooling_layer slim = tf.contrib.slim networks_map = {'alexnet_v2': alexnet.alexnet_v2, 'cifarnet': cifarnet.cifarnet, 'overfeat': overfeat.overfeat, 'vgg_a': vgg.vgg_a, 'vgg_16': vgg.vgg_16, 'vgg_19': vgg.vgg_19, 'inception_v1': inception.inception_v1, 'inception_v2': inception.inception_v2, 'inception_v2_tsn': inception.inception_v2_tsn, 'inception_v3': inception.inception_v3, 'inception_v4': inception.inception_v4, 'inception_resnet_v2': inception.inception_resnet_v2, 'lenet': lenet.lenet, 'resnet_v1_50': resnet_v1.resnet_v1_50, 'resnet_v1_101': resnet_v1.resnet_v1_101, 'resnet_v1_152': resnet_v1.resnet_v1_152, 'resnet_v1_200': resnet_v1.resnet_v1_200, 'resnet_v2_50': resnet_v2.resnet_v2_50, 'resnet_v2_101': resnet_v2.resnet_v2_101, 'resnet_v2_152': resnet_v2.resnet_v2_152, 'resnet_v2_200': resnet_v2.resnet_v2_200, } last_conv_map = {'inception_v3': 'Mixed_7c', 'inception_v2_tsn': 'InceptionV2_TSN/inception_5b', 'resnet_v1_101': 'resnet_v1_101/block4', 'vgg_16': 'vgg_16/conv5', } arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope, 'cifarnet': cifarnet.cifarnet_arg_scope, 'overfeat': overfeat.overfeat_arg_scope, 'vgg_a': vgg.vgg_arg_scope, 'vgg_16': vgg.vgg_arg_scope, 'vgg_19': vgg.vgg_arg_scope, 'inception_v1': inception.inception_v3_arg_scope, 'inception_v2': inception.inception_v3_arg_scope, 'inception_v2_tsn': 
def get_network_fn(name, num_classes, num_pose_keypoints, cfg,
                   weight_decay=0.0, is_training=False):
  """Returns a network_fn such as `logits, end_points = network_fn(images)`.

  On top of the backbone selected by `name`, the returned function adds a pose
  heatmap head (`end_points['PoseLogits']`) and, depending on the `cfg.NET`
  switches, replaces the classification logits with one of several
  pose/attention based heads. 5-D inputs are treated as videos: they are
  reshaped to a batch of frames (dimension 1 is the frame count) and the
  per-frame logits are averaged at the end.

  Args:
    name: The name of the network.
    num_classes: The number of classes to use for classification.
    num_pose_keypoints: The number of channels to output for pose.
    cfg: Configuration object. This function reads `cfg.NET.DROPOUT`,
      `cfg.NET.TRAIN_TOP_BN`, `cfg.NET.LAST_CONV_MAP_FOR_POSE`,
      `cfg.NET.USE_TEMPORAL_ATT` and the `cfg.NET.USE_POSE_*` /
      `cfg.NET.USE_COMPACT_BILINEAR_POOLING` switches.
    weight_decay: The l2 coefficient for the model weights.
    is_training: `True` if the model is being used for training and `False`
      otherwise.

  Returns:
    network_fn: A function that applies the model to a batch of images. It has
      the following signature:
        logits, end_points = network_fn(images)

  Raises:
    ValueError: If network `name` is not recognized. `network_fn` raises
      ValueError when no last-conv end point is registered for `name` or the
      registered end point is not produced by the network.
  """
  if name not in networks_map:
    raise ValueError('Name of network unknown %s' % name)
  arg_scope = arg_scopes_map[name](weight_decay=weight_decay)
  func = networks_map[name]
  @functools.wraps(func)
  def network_fn(images):
    with slim.arg_scope(arg_scope):
      frames_per_video = 1  # same for single image datasets
      if images.get_shape().ndims == 5:
        im_shape = images.get_shape().as_list()
        frames_per_video = im_shape[1]
        images = tf.reshape(
            images, [-1, im_shape[-3], im_shape[-2], im_shape[-1]])
      # Main Network Function
      kwargs = {}
      if cfg.NET.DROPOUT >= 0:  # if -1, then just ignore it and use nw def.
        kwargs['dropout_keep_prob'] = (1-cfg.NET.DROPOUT)
      logits, end_points = func(images, num_classes,
                                is_training=is_training,
                                train_top_bn=cfg.NET.TRAIN_TOP_BN,
                                **kwargs)

      # rgirdhar: add another end point for heatmap prediction
      # Resolve the end point explicitly instead of using a bare `except:`,
      # whose handler re-evaluated last_conv_map[name] and therefore leaked a
      # KeyError for networks that have no entry in last_conv_map.
      last_conv_name = last_conv_map.get(name)
      if last_conv_name is None:
        raise ValueError(
            'No last conv end point registered for network {}. '
            'Supported networks: {}'.format(
                name, ' '.join(sorted(last_conv_map))))
      if last_conv_name not in end_points:
        raise ValueError('End point {} not found. Choose from: {}'.format(
            last_conv_name, ' '.join(end_points)))
      last_conv = end_points[last_conv_name]

      random_normal = lambda stddev: tf.random_normal_initializer(0.0, stddev)
      # The slim.dropout calls below take is_training / keep_prob from here.
      with slim.arg_scope(
          [slim.dropout],
          is_training=is_training,
          keep_prob=0.2 if cfg.NET.DROPOUT < 0 else (1.0-cfg.NET.DROPOUT)):
        with tf.variable_scope('PoseLogits'):
          last_conv_pose_name = getattr(
              cfg.NET.LAST_CONV_MAP_FOR_POSE, name)
          last_conv_pose = end_points[last_conv_pose_name]
          pose_pre_logits = slim.conv2d(
              last_conv_pose, 768, [1, 1],
              weights_initializer=random_normal(0.001),
              activation_fn=tf.nn.relu,
              normalizer_fn=None,
              biases_initializer=tf.zeros_initializer(),
              padding='SAME', scope='ExtraConv2d_1x1')
          pose_logits = slim.conv2d(
              pose_pre_logits, num_pose_keypoints, [1, 1],
              activation_fn=None, normalizer_fn=None,
              scope='Conv2d_1c_1x1')
          end_points['PoseLogits'] = pose_logits

        if cfg.NET.USE_POSE_ATTENTION_LOGITS:
          with tf.variable_scope('PoseAttention'):
            # use the pose prediction as an attention map to get the features
            # step1: split pose logits over channels
            pose_logits_parts = tf.split(
                pose_logits, pose_logits.get_shape().as_list()[-1],
                axis=pose_logits.get_shape().ndims-1)
            part_logits = []
            # allows to choose which dimension of pose to use for heatmaps
            parts_to_use = pose_logits_parts
            if cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS != [-1]:
              parts_to_use = (np.array(pose_logits_parts)[
                  cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS]).tolist()
            tf.logging.info('Using {} parts for pose attention logits'.format(
                len(parts_to_use)))
            for part in parts_to_use:
              part_logits.append(tf.reduce_mean(
                  part * last_conv, axis=[1, 2], keep_dims=True))
            if cfg.NET.USE_POSE_ATTENTION_LOGITS_AVGED_HMAP:
              part_logits.append(tf.reduce_mean(
                  last_conv * tf.reduce_mean(pose_logits, axis=-1,
                                             keep_dims=True),
                  axis=[1, 2], keep_dims=True))
            # Plain average-pooled image features.
            part_logits.append(tf.reduce_mean(
                last_conv, axis=[1, 2], keep_dims=True))
            net = tf.concat(part_logits, axis=-1)
            net = slim.dropout(net)
            logits = slim.conv2d(
                net, num_classes, [1, 1],
                weights_initializer=random_normal(0.001),
                biases_initializer=tf.zeros_initializer(),
                activation_fn=None,
                normalizer_fn=None)
        elif cfg.NET.USE_POSE_LOGITS_DIRECTLY:
          with tf.variable_scope('ActionFromPose'):
            net = tf.reduce_mean(
                pose_pre_logits, axis=[1, 2], keep_dims=True)
            net = slim.conv2d(
                net, 768, [1, 1],
                normalizer_fn=None,
                weights_initializer=random_normal(0.001),
                biases_initializer=tf.zeros_initializer())
            if cfg.NET.USE_POSE_LOGITS_DIRECTLY_PLUS_LOGITS:
              net = tf.concat([
                  net,
                  tf.reduce_mean(last_conv, axis=[1, 2], keep_dims=True)],
                  axis=-1)
            net = slim.dropout(net)
            logits = slim.conv2d(
                net, num_classes, [1, 1],
                weights_initializer=random_normal(0.001),
                biases_initializer=tf.zeros_initializer(),
                activation_fn=None,
                normalizer_fn=None)
        elif cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2:
          with tf.variable_scope('ActionFromPose_v2'):
            net = tf.concat([pose_pre_logits, last_conv], axis=-1)
            if cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2_EXTRA_LAYER:
              net = tf.nn.relu(net)
              net = slim.conv2d(
                  net, net.get_shape().as_list()[-1], [1, 1],
                  weights_initializer=random_normal(0.001),
                  biases_initializer=tf.zeros_initializer())
            net = tf.reduce_mean(net, axis=[1, 2], keep_dims=True)
            net = slim.dropout(net)
            logits = slim.conv2d(
                net, num_classes, [1, 1],
                weights_initializer=random_normal(0.001),
                biases_initializer=tf.zeros_initializer(),
                activation_fn=None,
                normalizer_fn=None)
        elif cfg.NET.USE_COMPACT_BILINEAR_POOLING:
          last_conv_shape = last_conv.get_shape().as_list()
          net = compact_bilinear_pooling_layer(
              last_conv, last_conv, last_conv_shape[-1])
          net.set_shape([last_conv_shape[0], last_conv_shape[-1]])
          # Restore the two singleton spatial dims expected by the 1x1 conv.
          net = tf.expand_dims(tf.expand_dims(net, 1), 1)
          net = slim.dropout(net)
          logits = slim.conv2d(
              net, num_classes, [1, 1],
              weights_initializer=random_normal(0.001),
              biases_initializer=tf.zeros_initializer(),
              activation_fn=None,
              normalizer_fn=None)
        elif cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION:
          with tf.variable_scope('PosePrelogitsBasedAttention'):
            # If the following is set, just train on top of image features,
            # don't add the prelogits at all. This was useful as pose seemed to
            # not help with it at all.
            if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT:
              net = last_conv
            else:
              net = pose_pre_logits
            # A single attention map is the default; per-class attention maps
            # were added for the NIPS2017 rebuttal.
            nMaps = (num_classes
                     if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_PER_CLASS
                     else 1)
            all_att_logits = []
            # NOTE(review): `net` is overwritten on every iteration, so for
            # RANK > 1 each attention conv is fed the previous attention map
            # rather than the original features -- confirm this is intended.
            # Left unchanged to keep variable shapes of existing checkpoints.
            for rank_id in range(
                cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK):
              scope_name = 'Conv2d_PrePose_Attn'
              if rank_id >= 1:
                scope_name += str(rank_id)
              net = slim.conv2d(
                  net, nMaps, [1, 1],
                  weights_initializer=random_normal(0.001),
                  biases_initializer=tf.zeros_initializer(),
                  activation_fn=None,
                  normalizer_fn=None,
                  scope=scope_name)
              all_att_logits.append(net)
            if len(all_att_logits) > 1:
              attention_logits = tf.stack(all_att_logits, axis=-1)
            else:
              attention_logits = all_att_logits[0]

            if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SOFTMAX_ATT:
              # bring the number of channels earlier to make softmax easier
              attention_logits = tf.transpose(attention_logits, [0, 3, 1, 2])
              att_shape = attention_logits.get_shape().as_list()
              attention_logits = tf.reshape(
                  attention_logits, [att_shape[0], att_shape[1], -1])
              attention_logits = tf.nn.softmax(attention_logits)
              attention_logits = tf.reshape(attention_logits, att_shape)
              attention_logits = tf.transpose(attention_logits, [0, 2, 3, 1])
            if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RELU_ATT:
              attention_logits = tf.nn.relu(attention_logits)
            end_points['PosePrelogitsBasedAttention'] = attention_logits

            if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT:
              if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT_2LAYER:
                pose_logits = slim.conv2d(
                    pose_logits, pose_logits.get_shape()[-1], [1, 1],
                    weights_initializer=random_normal(0.001),
                    biases_initializer=tf.zeros_initializer())
              last_conv = tf.concat([last_conv, pose_logits], axis=-1)
            last_conv = slim.dropout(last_conv)

            # Top-down attention
            all_logits = []
            for _ in range(cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK):
              logits = slim.conv2d(
                  last_conv, num_classes, [1, 1],
                  weights_initializer=random_normal(0.001),
                  biases_initializer=tf.zeros_initializer(),
                  activation_fn=None,
                  normalizer_fn=None)
              all_logits.append(logits)
            if len(all_logits) > 1:
              logits = tf.stack(all_logits, axis=-1)
            else:
              logits = all_logits[0]
            end_points['TopDownAttention'] = logits

            # Weight the top-down class maps by the attention map(s) and
            # average over the spatial dimensions.
            logits = tf.reduce_mean(
                attention_logits * logits,
                axis=[1, 2],
                keep_dims=True)
            if logits.get_shape().ndims == 5:  # i.e. rank was > 1
              logits = tf.reduce_sum(logits, axis=-1)

      # This is just to protect against the case where I don't do any of the
      # above and get the original logits from the network, which has already
      # been squeezed, or in case of vgg 16, passed through fc layers
      if logits.get_shape().ndims > 2:
        logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
      end_points['Logits'] = logits

      if frames_per_video > 1:
        with tf.name_scope('FramePooling'):
          # for now stick with avg pool
          end_points['logits_beforePool'] = logits
          old_logits = logits
          # Regroup the flat frame batch into [videos, frames, classes].
          logits = tf.stack([el for el in tf.split(
              old_logits, int(old_logits.get_shape().as_list()[0] /
                              frames_per_video))])
          if cfg.NET.USE_TEMPORAL_ATT:
            with tf.variable_scope('TemporalAttention'):
              logits = tf.expand_dims(logits, axis=-2)  # [bs, 3, 1, nc]
              logits_att = slim.conv2d(
                  logits, 1, [1, 1],
                  weights_initializer=random_normal(0.001),
                  biases_initializer=tf.constant_initializer(
                      1.0 / logits.get_shape().as_list()[1]),
                  activation_fn=None,
                  normalizer_fn=None)
              logits = logits * logits_att
              logits = tf.squeeze(logits, axis=-2)
              end_points['TemporalAttention'] = logits_att
          logits = tf.reduce_mean(logits, axis=1)
      return logits, end_points
  if hasattr(func, 'default_image_size'):
    network_fn.default_image_size = func.default_image_size

  return network_fn
class NetworksTest(tf.test.TestCase):
  """Builds every network registered in nets_factory and checks its logits.

  NOTE(review): `nets_factory.get_network_fn` in this repository also requires
  `num_pose_keypoints` and `cfg`, so the two-argument call below looks stale
  and is expected to fail with a TypeError -- confirm and update the call.
  """

  def testGetNetworkFn(self):
    batch_size = 5
    num_classes = 1000
    for net_name in nets_factory.networks_map:
      with self.test_session():
        net_fn = nets_factory.get_network_fn(net_name, num_classes)
        # Most networks use 224 as their default_image_size
        side = getattr(net_fn, 'default_image_size', 224)
        images = tf.random_uniform((batch_size, side, side, 3))
        logits, end_points = net_fn(images)
        self.assertIsInstance(logits, tf.Tensor)
        self.assertIsInstance(end_points, dict)
        logits_shape = logits.get_shape().as_list()
        self.assertEqual(logits_shape[0], batch_size)
        self.assertEqual(logits_shape[-1], num_classes)
trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)


def overfeat_arg_scope(weight_decay=0.0005):
  """Returns the default argument scope for the OverFeat model.

  Args:
    weight_decay: L2 regularization coefficient for conv / fully connected
      weights.

  Returns:
    An `arg_scope` with ReLU activations, L2 weight regularization and zero
    bias initialization for `slim.conv2d` / `slim.fully_connected`, 'SAME'
    padding for `slim.conv2d` and 'VALID' padding for `slim.max_pool2d`.
  """
  # tf.zeros_initializer must be instantiated: in TF 1.x it is a class, and
  # passing the class itself is not a valid initializer on all releases.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      activation_fn=tf.nn.relu,
                      weights_regularizer=slim.l2_regularizer(weight_decay),
                      biases_initializer=tf.zeros_initializer()):
    with slim.arg_scope([slim.conv2d], padding='SAME'):
      with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
        return arg_sc


def overfeat(inputs,
             num_classes=1000,
             is_training=True,
             dropout_keep_prob=0.5,
             spatial_squeeze=True,
             scope='overfeat'):
  """Contains the model definition for the OverFeat network.

  The definition for the network was obtained from:
    OverFeat: Integrated Recognition, Localization and Detection using
    Convolutional Networks
    Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and
    Yann LeCun, 2014
    http://arxiv.org/abs/1312.6229

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 231x231. To use in fully
        convolutional mode, set spatial_squeeze to false.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    net: the output of the final 'fc8' layer (no activation applied), with the
      two spatial dimensions squeezed out when `spatial_squeeze` is set.
    end_points: a dict mapping layer scope names to their outputs.
  """
  with tf.variable_scope(scope, 'overfeat', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d
    with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
                        scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.conv2d(net, 512, [3, 3], scope='conv3')
      net = slim.conv2d(net, 1024, [3, 3], scope='conv4')
      net = slim.conv2d(net, 1024, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      with slim.arg_scope([slim.conv2d],
                          weights_initializer=trunc_normal(0.005),
                          biases_initializer=tf.constant_initializer(0.1)):
        # Use conv2d instead of fully_connected layers.
        net = slim.conv2d(net, 3072, [6, 6], padding='VALID', scope='fc6')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout6')
        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                           scope='dropout7')
        net = slim.conv2d(net, num_classes, [1, 1],
                          activation_fn=None,
                          normalizer_fn=None,
                          biases_initializer=tf.zeros_initializer(),
                          scope='fc8')
      # Convert end_points_collection into a end_point dict.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        # Expose the squeezed tensor under the 'fc8' end point as well.
        end_points[sc.name + '/fc8'] = net
      return net, end_points
overfeat.default_image_size = 231
class OverFeatTest(tf.test.TestCase):
  """Shape, naming and smoke tests for the slim OverFeat model."""

  def testBuild(self):
    """Classification mode yields squeezed [batch, classes] logits."""
    batch_size, num_classes = 5, 1000
    side = 231
    with self.test_session():
      images = tf.random_uniform((batch_size, side, side, 3))
      logits, _ = overfeat.overfeat(images, num_classes)
      self.assertEqual(logits.op.name, 'overfeat/fc8/squeezed')
      logits_shape = logits.get_shape().as_list()
      self.assertListEqual(logits_shape, [batch_size, num_classes])

  def testFullyConvolutional(self):
    """Without squeezing, a 281x281 input gives a 2x2 map of logits."""
    batch_size, num_classes = 1, 1000
    side = 281
    with self.test_session():
      images = tf.random_uniform((batch_size, side, side, 3))
      logits, _ = overfeat.overfeat(images, num_classes,
                                    spatial_squeeze=False)
      self.assertEqual(logits.op.name, 'overfeat/fc8/BiasAdd')
      logits_shape = logits.get_shape().as_list()
      self.assertListEqual(logits_shape, [batch_size, 2, 2, num_classes])

  def testEndPoints(self):
    """The end_points dict has one entry per conv, pool and fc layer."""
    batch_size, num_classes = 5, 1000
    side = 231
    layer_names = ['conv1', 'pool1', 'conv2', 'pool2', 'conv3', 'conv4',
                   'conv5', 'pool5', 'fc6', 'fc7', 'fc8']
    with self.test_session():
      images = tf.random_uniform((batch_size, side, side, 3))
      _, end_points = overfeat.overfeat(images, num_classes)
      expected = set('overfeat/' + layer for layer in layer_names)
      self.assertSetEqual(set(end_points.keys()), expected)

  def testModelVariables(self):
    """Each conv / fc layer owns exactly a weights and a biases variable."""
    batch_size, num_classes = 5, 1000
    side = 231
    layer_names = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5',
                   'fc6', 'fc7', 'fc8']
    with self.test_session():
      images = tf.random_uniform((batch_size, side, side, 3))
      overfeat.overfeat(images, num_classes)
      expected = set()
      for layer in layer_names:
        expected.add('overfeat/' + layer + '/weights')
        expected.add('overfeat/' + layer + '/biases')
      actual = set(v.op.name for v in slim.get_model_variables())
      self.assertSetEqual(actual, expected)

  def testEvaluation(self):
    """Inference mode keeps the logits shape; argmax gives one id per image."""
    batch_size, num_classes = 2, 1000
    side = 231
    with self.test_session():
      images = tf.random_uniform((batch_size, side, side, 3))
      logits, _ = overfeat.overfeat(images, is_training=False)
      self.assertListEqual(logits.get_shape().as_list(),
                           [batch_size, num_classes])
      predicted = tf.argmax(logits, 1)
      self.assertListEqual(predicted.get_shape().as_list(), [batch_size])

  def testTrainEvalWithReuse(self):
    """A fully convolutional eval graph can reuse the training variables."""
    train_batch_size, eval_batch_size = 2, 1
    train_side, eval_side = 231, 281
    num_classes = 1000
    with self.test_session():
      train_images = tf.random_uniform(
          (train_batch_size, train_side, train_side, 3))
      train_logits, _ = overfeat.overfeat(train_images)
      self.assertListEqual(train_logits.get_shape().as_list(),
                           [train_batch_size, num_classes])
      # Everything built from here on shares the variables created above.
      tf.get_variable_scope().reuse_variables()
      eval_images = tf.random_uniform(
          (eval_batch_size, eval_side, eval_side, 3))
      eval_logits, _ = overfeat.overfeat(eval_images, is_training=False,
                                         spatial_squeeze=False)
      self.assertListEqual(eval_logits.get_shape().as_list(),
                           [eval_batch_size, 2, 2, num_classes])
      pooled_logits = tf.reduce_mean(eval_logits, [1, 2])
      predicted = tf.argmax(pooled_logits, 1)
      self.assertEqual(predicted.get_shape().as_list(), [eval_batch_size])

  def testForward(self):
    """A forward pass on random input produces some non-zero logits."""
    batch_size = 1
    side = 231
    with self.test_session() as sess:
      images = tf.random_uniform((batch_size, side, side, 3))
      logits, _ = overfeat.overfeat(images)
      sess.run(tf.global_variables_initializer())
      values = sess.run(logits)
      self.assertTrue(values.any())
class Block(collections.namedtuple('Block', ('scope', 'unit_fn', 'args'))):
  """Named tuple that describes a single ResNet block.

  Fields:
    scope: The scope of the `Block`.
    unit_fn: The ResNet unit function; it takes a `Tensor` as input and
      returns another `Tensor` holding the output of the ResNet unit.
    args: A list with one entry per unit in the `Block`. Every entry is a
      (depth, depth_bottleneck, stride) tuple that is passed to `unit_fn`.
  """


def subsample(inputs, factor, scope=None):
  """Spatially subsamples `inputs` by an integer factor.

  Args:
    inputs: A `Tensor` of size [batch, height_in, width_in, channels].
    factor: The subsampling factor.
    scope: Optional variable_scope.

  Returns:
    `inputs` itself when `factor == 1`; otherwise a `Tensor` of size
    [batch, height_out, width_out, channels] produced by a 1x1 max-pool with
    stride `factor`.
  """
  if factor == 1:
    return inputs
  # A 1x1 pooling window with stride `factor` simply picks every
  # `factor`-th activation along both spatial dimensions.
  return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
  """Strided 2-D convolution with 'SAME' padding.

  For stride > 1 the input is zero-padded explicitly and then convolved with
  'VALID' padding, so that

     net = conv2d_same(inputs, num_outputs, 3, stride=stride)

  matches

     net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
     net = subsample(net, factor=stride)

  In contrast,

     net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')

  gives a different result when the input height or width is even, which is
  the reason this helper exists. See ResnetUtilsTest.testConv2DSameEven().

  Args:
    inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
    num_outputs: An integer, the number of output filters.
    kernel_size: An int with the kernel_size of the filters.
    stride: An integer, the output stride.
    rate: An integer, rate for atrous convolution.
    scope: Scope.

  Returns:
    output: A 4-D tensor of size [batch, height_out, width_out, channels] with
      the convolution output.
  """
  if stride == 1:
    return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate,
                       padding='SAME', scope=scope)

  # Footprint of the (possibly dilated) kernel decides how much to pad.
  effective_kernel = kernel_size + (kernel_size - 1) * (rate - 1)
  total_pad = effective_kernel - 1
  pad_before = total_pad // 2
  pad_after = total_pad - pad_before
  paddings = [[0, 0],
              [pad_before, pad_after],
              [pad_before, pad_after],
              [0, 0]]
  padded = tf.pad(inputs, paddings)
  return slim.conv2d(padded, num_outputs, kernel_size, stride=stride,
                     rate=rate, padding='VALID', scope=scope)


@slim.add_arg_scope
def stack_blocks_dense(net, blocks, output_stride=None,
                       outputs_collections=None):
  """Stacks ResNet `Blocks` and controls output feature density.

  Two things happen here. First, variable scopes of the form
  'block_name/unit_1', 'block_name/unit_2', etc. are created for the units.
  Second, the caller may fix the ResNet output_stride, i.e. the ratio of input
  to output spatial resolution, which is useful for dense prediction tasks
  such as semantic segmentation or object detection.

  Most ResNets consist of 4 ResNet blocks and subsample the activations by a
  factor of 2 when transitioning between consecutive ResNet blocks. This
  results in a nominal ResNet output_stride equal to 8. If we set the
  output_stride to half the nominal network stride (e.g., output_stride=4),
  then we compute responses twice.

  Control of the output feature density is implemented by atrous convolution.

  Args:
    net: A `Tensor` of size [batch, height, width, channels].
    blocks: A list of length equal to the number of ResNet `Blocks`. Each
      element is a ResNet `Block` object describing the units in the `Block`.
    output_stride: If `None`, then the output will be computed at the nominal
      network stride. If output_stride is not `None`, it specifies the
      requested ratio of input to output spatial resolution, which needs to be
      equal to the product of unit strides from the start up to some level of
      the ResNet. For example, if the ResNet employs units with strides 1, 2,
      1, 3, 4, 1, then valid values for the output_stride are 1, 2, 6, 24 or
      None (which is equivalent to output_stride=24).
    outputs_collections: Collection to add the ResNet block outputs.

  Returns:
    net: Output tensor with stride equal to the specified output_stride.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  # `stride_so_far` is the effective stride of the activations computed so
  # far. Once it equals the requested output_stride, later units run with
  # stride 1 and their strides are folded into `atrous_rate` instead.
  stride_so_far = 1
  atrous_rate = 1
  stride_is_capped = output_stride is not None

  for block in blocks:
    with tf.variable_scope(block.scope, 'block', [net]) as block_scope:
      for unit_number, unit in enumerate(block.args, 1):
        if stride_is_capped and stride_so_far > output_stride:
          raise ValueError('The target output_stride cannot be reached.')

        with tf.variable_scope('unit_%d' % unit_number, values=[net]):
          depth, bottleneck_depth, unit_stride = unit
          at_target = stride_is_capped and stride_so_far == output_stride
          # At the target stride: keep resolution (stride=1) and dilate.
          # Before it: apply the unit's own stride without dilation.
          net = block.unit_fn(net,
                              depth=depth,
                              depth_bottleneck=bottleneck_depth,
                              stride=1 if at_target else unit_stride,
                              rate=atrous_rate if at_target else 1)
          if at_target:
            atrous_rate *= unit_stride
          else:
            stride_so_far *= unit_stride
      net = slim.utils.collect_named_outputs(outputs_collections,
                                             block_scope.name, net)

  if stride_is_capped and stride_so_far != output_stride:
    raise ValueError('The target output_stride cannot be reached.')

  return net


def resnet_arg_scope(weight_decay=0.0001,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
  """Defines the default ResNet arg scope.

  TODO(gpapan): The batch-normalization related default values above are
    appropriate for use in conjunction with the reference ResNet models
    released at https://github.com/KaimingHe/deep-residual-networks. When
    training ResNets from scratch, they might need to be tuned.

  Args:
    weight_decay: The weight decay to use for regularizing the model.
    batch_norm_decay: The moving average decay when estimating layer activation
      statistics in batch normalization.
    batch_norm_epsilon: Small constant to prevent division by zero when
      normalizing activations by their variance in batch normalization.
    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
      activations in the batch normalization layer.

  Returns:
    An `arg_scope` to use for the resnet models.
  """
  bn_settings = {
      'decay': batch_norm_decay,
      'epsilon': batch_norm_epsilon,
      'scale': batch_norm_scale,
      'updates_collections': tf.GraphKeys.UPDATE_OPS,
  }
  l2_penalty = slim.l2_regularizer(weight_decay)
  kernel_init = slim.variance_scaling_initializer()

  with slim.arg_scope([slim.conv2d],
                      weights_regularizer=l2_penalty,
                      weights_initializer=kernel_init,
                      activation_fn=tf.nn.relu,
                      normalizer_fn=slim.batch_norm,
                      normalizer_params=bn_settings):
    with slim.arg_scope([slim.batch_norm], **bn_settings):
      # The following implies padding='SAME' for pool1, which makes feature
      # alignment easier for dense prediction tasks. This is also used in
      # https://github.com/facebook/fb.resnet.torch. However the accompanying
      # code of 'Deep Residual Learning for Image Recognition' uses
      # padding='VALID' for pool1. You can switch to that choice by setting
      # slim.arg_scope([slim.max_pool2d], padding='VALID').
      with slim.arg_scope([slim.max_pool2d], padding='SAME') as scope_defaults:
        return scope_defaults
@slim.add_arg_scope
def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
               outputs_collections=None, scope=None):
  """Bottleneck residual unit variant with BN after convolutions.

  This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
  its definition. Note that we use here the bottleneck variant which has an
  extra bottleneck layer.

  When putting together two consecutive ResNet blocks that use this unit, one
  should use stride = 2 in the last unit of the first block.

  Args:
    inputs: A tensor of size [batch, height, width, channels].
    depth: The depth of the ResNet unit output.
    depth_bottleneck: The depth of the bottleneck layers.
    stride: The ResNet unit's stride. Determines the amount of downsampling of
      the units output compared to its input.
    rate: An integer, rate for atrous convolution.
    outputs_collections: Collection to add the ResNet unit output.
    scope: Optional variable_scope.

  Returns:
    The ResNet unit's output.
  """
  with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
    depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
    if depth == depth_in:
      # Channel count already matches: the shortcut only needs subsampling.
      shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
    else:
      # Project the input to `depth` channels with a strided 1x1 convolution.
      shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride,
                             activation_fn=None, scope='shortcut')

    residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
                           scope='conv1')
    residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
                                        rate=rate, scope='conv2')
    residual = slim.conv2d(residual, depth, [1, 1], stride=1,
                           activation_fn=None, scope='conv3')

    output = tf.nn.relu(shortcut + residual)

    return slim.utils.collect_named_outputs(outputs_collections,
                                            sc.original_name_scope,
                                            output)


def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              train_top_bn=False,
              dropout_keep_prob=1.0,
              reuse=None,
              scope=None):
  """Generator for v1 ResNet models.

  This function generates a family of ResNet v1 models. See the resnet_v1_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether is training or not.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it.
    train_top_bn: If True, then train batch norm for the root block, but make
      it testing mode for the rest of the network. If False (default), keep all
      the batch norms training. Any falsy value (e.g. None) behaves like
      False.
    dropout_keep_prob: (0, 1]. If <1, will apply dropout on the final layer
      after avg pooling.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.name + '_end_points'
    with slim.arg_scope([slim.conv2d, bottleneck,
                         resnet_utils.stack_blocks_dense],
                        outputs_collections=end_points_collection):
      # With train_top_bn set, batch norm outside the root block is put in
      # inference mode and made non-trainable.
      with slim.arg_scope(
          [slim.batch_norm],
          is_training=is_training if not train_top_bn else False,
          trainable=not train_top_bn):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            # The root block (stride-2 conv + stride-2 pool) already
            # contributes a stride of 4. Floor division keeps the stride an
            # integer; this module enables true division, so `/=` would
            # hand a float to stack_blocks_dense.
            output_stride //= 4
          # The root block's batch norm always follows `is_training` and stays
          # trainable, regardless of train_top_bn.
          with slim.arg_scope([slim.batch_norm],
                              is_training=is_training, trainable=True):
            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        if global_pool:
          # Global average pooling.
          net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
        if dropout_keep_prob < 1.0:
          tf.logging.info('ResNet v1: Using dropout {}.'.format(
              1-dropout_keep_prob))
          net = slim.dropout(net, keep_prob=dropout_keep_prob,
                             is_training=is_training)
        if num_classes is not None:
          net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                            normalizer_fn=None, scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = slim.softmax(net, scope='predictions')
        return net, end_points
resnet_v1.default_image_size = 224


def _bottleneck_blocks(num_block2_units, num_block3_units):
  """Returns the four-block bottleneck layout used by the builders below.

  block1 and block4 are identical for every depth defined in this module;
  only the number of stride-1 units in block2 and block3 varies. Each of the
  first three blocks ends with one extra stride-2 unit.
  """
  return [
      resnet_utils.Block(
          'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]),
      resnet_utils.Block(
          'block2', bottleneck,
          [(512, 128, 1)] * num_block2_units + [(512, 128, 2)]),
      resnet_utils.Block(
          'block3', bottleneck,
          [(1024, 256, 1)] * num_block3_units + [(1024, 256, 2)]),
      resnet_utils.Block(
          'block4', bottleneck, [(2048, 512, 1)] * 3),
  ]


def resnet_v1_50(inputs,
                 num_classes=None,
                 is_training=True,
                 global_pool=True,
                 output_stride=None,
                 reuse=None,
                 scope='resnet_v1_50'):
  """ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
  blocks = _bottleneck_blocks(3, 5)
  return resnet_v1(inputs, blocks, num_classes, is_training,
                   global_pool=global_pool, output_stride=output_stride,
                   include_root_block=True, reuse=reuse, scope=scope)


def resnet_v1_101(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  train_top_bn=None,
                  dropout_keep_prob=1.0,
                  scope='resnet_v1_101'):
  """ResNet-101 model of [1]. See resnet_v1() for arg and return description.

  Unlike the other depth-specific builders in this module, this one also
  forwards `train_top_bn` and `dropout_keep_prob` to resnet_v1().
  """
  blocks = _bottleneck_blocks(3, 22)
  return resnet_v1(inputs, blocks, num_classes, is_training,
                   global_pool=global_pool, output_stride=output_stride,
                   include_root_block=True,
                   train_top_bn=train_top_bn,
                   dropout_keep_prob=dropout_keep_prob,
                   reuse=reuse, scope=scope)


def resnet_v1_152(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  scope='resnet_v1_152'):
  """ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
  blocks = _bottleneck_blocks(7, 35)
  return resnet_v1(inputs, blocks, num_classes, is_training,
                   global_pool=global_pool, output_stride=output_stride,
                   include_root_block=True, reuse=reuse, scope=scope)


def resnet_v1_200(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  scope='resnet_v1_200'):
  """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
  blocks = _bottleneck_blocks(23, 35)
  return resnet_v1(inputs, blocks, num_classes, is_training,
                   global_pool=global_pool, output_stride=output_stride,
                   include_root_block=True, reuse=reuse, scope=scope)
def create_test_input(batch_size, height, width, channels):
  """Create test input tensor.

  Args:
    batch_size: The number of images per batch or `None` if unknown.
    height: The height of each image or `None` if unknown.
    width: The width of each image or `None` if unknown.
    channels: The number of channels per image or `None` if unknown.

  Returns:
    Either a placeholder `Tensor` of dimension
      [batch_size, height, width, channels] if any of the inputs are `None` or a
    constant `Tensor` with the mesh grid values along the spatial dimensions.
  """
  if None in [batch_size, height, width, channels]:
    return tf.placeholder(tf.float32, (batch_size, height, width, channels))
  else:
    # Pixel (row, col) holds the value row + col, replicated over every
    # batch entry and channel.
    return tf.to_float(
        np.tile(
            np.reshape(
                np.reshape(np.arange(height), [height, 1]) +
                np.reshape(np.arange(width), [1, width]),
                [1, height, width, 1]),
            [batch_size, 1, 1, channels]))


def _assert_endpoint_shapes(test_case, end_points, endpoint_to_shape):
  """Asserts the static shape of every listed end point."""
  for endpoint, shape in endpoint_to_shape.items():
    test_case.assertListEqual(end_points[endpoint].get_shape().as_list(),
                              shape)


class ResnetUtilsTest(tf.test.TestCase):
  """Tests for the building blocks in resnet_utils."""

  def testSubsampleThreeByThree(self):
    """Subsampling an odd-sized (3x3) map by 2 keeps the four corners."""
    x = tf.reshape(tf.to_float(tf.range(9)), [1, 3, 3, 1])
    x = resnet_utils.subsample(x, 2)
    expected = tf.reshape(tf.constant([0, 2, 6, 8]), [1, 2, 2, 1])
    with self.test_session():
      self.assertAllClose(x.eval(), expected.eval())

  def testSubsampleFourByFour(self):
    """Subsampling an even-sized (4x4) map by 2 keeps even rows/columns."""
    x = tf.reshape(tf.to_float(tf.range(16)), [1, 4, 4, 1])
    x = resnet_utils.subsample(x, 2)
    expected = tf.reshape(tf.constant([0, 2, 8, 10]), [1, 2, 2, 1])
    with self.test_session():
      self.assertAllClose(x.eval(), expected.eval())

  def testConv2DSameEven(self):
    """conv2d_same matches conv+subsample on even-sized input.

    A plain strided 'SAME' slim.conv2d (y4) gives a different result here.
    """
    n, n2 = 4, 2

    # Input image.
    x = create_test_input(1, n, n, 1)

    # Convolution kernel.
    w = create_test_input(1, 3, 3, 1)
    w = tf.reshape(w, [3, 3, 1, 1])

    # Pre-create the 'Conv' variables so that every conv below shares them.
    tf.get_variable('Conv/weights', initializer=w)
    tf.get_variable('Conv/biases', initializer=tf.zeros([1]))
    tf.get_variable_scope().reuse_variables()

    y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
    y1_expected = tf.to_float([[14, 28, 43, 26],
                               [28, 48, 66, 37],
                               [43, 66, 84, 46],
                               [26, 37, 46, 22]])
    y1_expected = tf.reshape(y1_expected, [1, n, n, 1])

    y2 = resnet_utils.subsample(y1, 2)
    y2_expected = tf.to_float([[14, 43],
                               [43, 84]])
    y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1])

    y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
    y3_expected = y2_expected

    y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
    y4_expected = tf.to_float([[48, 37],
                               [37, 22]])
    y4_expected = tf.reshape(y4_expected, [1, n2, n2, 1])

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      self.assertAllClose(y1.eval(), y1_expected.eval())
      self.assertAllClose(y2.eval(), y2_expected.eval())
      self.assertAllClose(y3.eval(), y3_expected.eval())
      self.assertAllClose(y4.eval(), y4_expected.eval())

  def testConv2DSameOdd(self):
    """On odd-sized input all three strided variants agree."""
    n, n2 = 5, 3

    # Input image.
    x = create_test_input(1, n, n, 1)

    # Convolution kernel.
    w = create_test_input(1, 3, 3, 1)
    w = tf.reshape(w, [3, 3, 1, 1])

    # Pre-create the 'Conv' variables so that every conv below shares them.
    tf.get_variable('Conv/weights', initializer=w)
    tf.get_variable('Conv/biases', initializer=tf.zeros([1]))
    tf.get_variable_scope().reuse_variables()

    y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv')
    y1_expected = tf.to_float([[14, 28, 43, 58, 34],
                               [28, 48, 66, 84, 46],
                               [43, 66, 84, 102, 55],
                               [58, 84, 102, 120, 64],
                               [34, 46, 55, 64, 30]])
    y1_expected = tf.reshape(y1_expected, [1, n, n, 1])

    y2 = resnet_utils.subsample(y1, 2)
    y2_expected = tf.to_float([[14, 43, 34],
                               [43, 84, 55],
                               [34, 55, 30]])
    y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1])

    y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv')
    y3_expected = y2_expected

    y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
    y4_expected = y2_expected

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      self.assertAllClose(y1.eval(), y1_expected.eval())
      self.assertAllClose(y2.eval(), y2_expected.eval())
      self.assertAllClose(y3.eval(), y3_expected.eval())
      self.assertAllClose(y4.eval(), y4_expected.eval())

  def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None):
    """A plain ResNet without extra layers before or after the ResNet blocks."""
    with tf.variable_scope(scope, values=[inputs]):
      with slim.arg_scope([slim.conv2d], outputs_collections='end_points'):
        net = resnet_utils.stack_blocks_dense(inputs, blocks, output_stride)
        end_points = dict(tf.get_collection('end_points'))
        return net, end_points

  def testEndPointsV1(self):
    """Test the end points of a tiny v1 bottleneck network."""
    bottleneck = resnet_v1.bottleneck
    blocks = [resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]),
              resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 1)])]
    inputs = create_test_input(2, 32, 16, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_plain(inputs, blocks, scope='tiny')
    expected = [
        'tiny/block1/unit_1/bottleneck_v1/shortcut',
        'tiny/block1/unit_1/bottleneck_v1/conv1',
        'tiny/block1/unit_1/bottleneck_v1/conv2',
        'tiny/block1/unit_1/bottleneck_v1/conv3',
        'tiny/block1/unit_2/bottleneck_v1/conv1',
        'tiny/block1/unit_2/bottleneck_v1/conv2',
        'tiny/block1/unit_2/bottleneck_v1/conv3',
        'tiny/block2/unit_1/bottleneck_v1/shortcut',
        'tiny/block2/unit_1/bottleneck_v1/conv1',
        'tiny/block2/unit_1/bottleneck_v1/conv2',
        'tiny/block2/unit_1/bottleneck_v1/conv3',
        'tiny/block2/unit_2/bottleneck_v1/conv1',
        'tiny/block2/unit_2/bottleneck_v1/conv2',
        'tiny/block2/unit_2/bottleneck_v1/conv3']
    # Order-insensitive comparison of the end point names. assertItemsEqual
    # only exists on Python 2's unittest.TestCase, so compare sorted lists,
    # which works on both Python 2 and 3.
    self.assertEqual(sorted(expected), sorted(end_points))

  def _stack_blocks_nondense(self, net, blocks):
    """A simplified ResNet Block stacker without output stride control."""
    for block in blocks:
      with tf.variable_scope(block.scope, 'block', [net]):
        for i, unit in enumerate(block.args):
          depth, depth_bottleneck, stride = unit
          with tf.variable_scope('unit_%d' % (i + 1), values=[net]):
            net = block.unit_fn(net,
                                depth=depth,
                                depth_bottleneck=depth_bottleneck,
                                stride=stride,
                                rate=1)
    return net

  def _atrousValues(self, bottleneck):
    """Verify the values of dense feature extraction by atrous convolution.

    Make sure that dense feature extraction by stack_blocks_dense() followed by
    subsampling gives identical results to feature extraction at the nominal
    network output stride using the simple self._stack_blocks_nondense() above.

    Args:
      bottleneck: The bottleneck function.
    """
    blocks = [
        resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]),
        resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 2)]),
        resnet_utils.Block('block3', bottleneck, [(16, 4, 1), (16, 4, 2)]),
        resnet_utils.Block('block4', bottleneck, [(32, 8, 1), (32, 8, 1)])
    ]
    nominal_stride = 8

    # Test both odd and even input dimensions.
    height = 30
    width = 31
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      with slim.arg_scope([slim.batch_norm], is_training=False):
        for output_stride in [1, 2, 4, 8, None]:
          with tf.Graph().as_default():
            with self.test_session() as sess:
              tf.set_random_seed(0)
              inputs = create_test_input(1, height, width, 3)
              # Dense feature extraction followed by subsampling.
              output = resnet_utils.stack_blocks_dense(inputs,
                                                       blocks,
                                                       output_stride)
              if output_stride is None:
                factor = 1
              else:
                factor = nominal_stride // output_stride

              output = resnet_utils.subsample(output, factor)
              # Make the two networks use the same weights.
              tf.get_variable_scope().reuse_variables()
              # Feature extraction at the nominal network rate.
              expected = self._stack_blocks_nondense(inputs, blocks)
              sess.run(tf.global_variables_initializer())
              output, expected = sess.run([output, expected])
              self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4)

  def testAtrousValuesBottleneck(self):
    """Runs the atrous-value check with the v1 bottleneck unit."""
    self._atrousValues(resnet_v1.bottleneck)


class ResnetCompleteNetworkTest(tf.test.TestCase):
  """Tests with complete small ResNet v1 networks."""

  def _resnet_small(self,
                    inputs,
                    num_classes=None,
                    is_training=True,
                    global_pool=True,
                    output_stride=None,
                    include_root_block=True,
                    reuse=None,
                    scope='resnet_v1_small'):
    """A shallow and thin ResNet v1 for faster tests."""
    bottleneck = resnet_v1.bottleneck
    blocks = [
        resnet_utils.Block(
            'block1', bottleneck, [(4, 1, 1)] * 2 + [(4, 1, 2)]),
        resnet_utils.Block(
            'block2', bottleneck, [(8, 2, 1)] * 2 + [(8, 2, 2)]),
        resnet_utils.Block(
            'block3', bottleneck, [(16, 4, 1)] * 2 + [(16, 4, 2)]),
        resnet_utils.Block(
            'block4', bottleneck, [(32, 8, 1)] * 2)]
    return resnet_v1.resnet_v1(inputs, blocks, num_classes,
                               is_training=is_training,
                               global_pool=global_pool,
                               output_stride=output_stride,
                               include_root_block=include_root_block,
                               reuse=reuse,
                               scope=scope)

  def testClassificationEndPoints(self):
    """Checks the logits op name/shape and the 'predictions' end point."""
    global_pool = True
    num_classes = 10
    inputs = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, end_points = self._resnet_small(inputs, num_classes,
                                              global_pool=global_pool,
                                              scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
    self.assertTrue('predictions' in end_points)
    self.assertListEqual(end_points['predictions'].get_shape().as_list(),
                         [2, 1, 1, num_classes])

  def testClassificationShapes(self):
    """Checks per-block output shapes for a 224x224 classification input."""
    global_pool = True
    num_classes = 10
    inputs = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs, num_classes,
                                         global_pool=global_pool,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/block1': [2, 28, 28, 4],
          'resnet/block2': [2, 14, 14, 8],
          'resnet/block3': [2, 7, 7, 16],
          'resnet/block4': [2, 7, 7, 32]}
      _assert_endpoint_shapes(self, end_points, endpoint_to_shape)

  def testFullyConvolutionalEndpointShapes(self):
    """Checks per-block output shapes for a 321x321 input without pooling."""
    global_pool = False
    num_classes = 10
    inputs = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs, num_classes,
                                         global_pool=global_pool,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 21, 21, 8],
          'resnet/block3': [2, 11, 11, 16],
          'resnet/block4': [2, 11, 11, 32]}
      _assert_endpoint_shapes(self, end_points, endpoint_to_shape)

  def testRootlessFullyConvolutionalEndpointShapes(self):
    """Checks per-block output shapes when the root block is excluded."""
    global_pool = False
    num_classes = 10
    inputs = create_test_input(2, 128, 128, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs, num_classes,
                                         global_pool=global_pool,
                                         include_root_block=False,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/block1': [2, 64, 64, 4],
          'resnet/block2': [2, 32, 32, 8],
          'resnet/block3': [2, 16, 16, 16],
          'resnet/block4': [2, 16, 16, 32]}
      _assert_endpoint_shapes(self, end_points, endpoint_to_shape)

  def testAtrousFullyConvolutionalEndpointShapes(self):
    """With output_stride=8 every block keeps the 41x41 resolution."""
    global_pool = False
    num_classes = 10
    output_stride = 8
    inputs = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(inputs,
                                         num_classes,
                                         global_pool=global_pool,
                                         output_stride=output_stride,
                                         scope='resnet')
      endpoint_to_shape = {
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 41, 41, 8],
          'resnet/block3': [2, 41, 41, 16],
          'resnet/block4': [2, 41, 41, 32]}
      _assert_endpoint_shapes(self, end_points, endpoint_to_shape)

  def testAtrousFullyConvolutionalValues(self):
    """Verify dense feature extraction with atrous convolution."""
    nominal_stride = 32
    for output_stride in [4, 8, 16, 32, None]:
      with slim.arg_scope(resnet_utils.resnet_arg_scope()):
        with tf.Graph().as_default():
          with self.test_session() as sess:
            tf.set_random_seed(0)
            inputs = create_test_input(2, 81, 81, 3)
            # Dense feature extraction followed by subsampling.
            output, _ = self._resnet_small(inputs, None, is_training=False,
                                           global_pool=False,
                                           output_stride=output_stride)
            if output_stride is None:
              factor = 1
            else:
              factor = nominal_stride // output_stride
            output = resnet_utils.subsample(output, factor)
            # Make the two networks use the same weights.
            tf.get_variable_scope().reuse_variables()
            # Feature extraction at the nominal network rate.
            expected, _ = self._resnet_small(inputs, None, is_training=False,
                                             global_pool=False)
            sess.run(tf.global_variables_initializer())
            self.assertAllClose(output.eval(), expected.eval(),
                                atol=1e-4, rtol=1e-4)

  def testUnknownBatchSize(self):
    """Builds with an unknown batch dimension and feeds a concrete batch."""
    batch = 2
    height, width = 65, 65
    global_pool = True
    num_classes = 10
    inputs = create_test_input(None, height, width, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, _ = self._resnet_small(inputs, num_classes,
                                     global_pool=global_pool,
                                     scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [None, 1, 1, num_classes])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(logits, {inputs: images.eval()})
      self.assertEqual(output.shape, (batch, 1, 1, num_classes))

  def testFullyConvolutionalUnknownHeightWidth(self):
    """Builds with unknown spatial dimensions and feeds 65x65 images."""
    batch = 2
    height, width = 65, 65
    global_pool = False
    inputs = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      output, _ = self._resnet_small(inputs, None, global_pool=global_pool)
    self.assertListEqual(output.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(output, {inputs: images.eval()})
      self.assertEqual(output.shape, (batch, 3, 3, 32))

  def testAtrousFullyConvolutionalUnknownHeightWidth(self):
    """Same as above with output_stride=8, giving a denser 9x9 output."""
    batch = 2
    height, width = 65, 65
    global_pool = False
    output_stride = 8
    inputs = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      output, _ = self._resnet_small(inputs,
                                     None,
                                     global_pool=global_pool,
                                     output_stride=output_stride)
    self.assertListEqual(output.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      output = sess.run(output, {inputs: images.eval()})
      self.assertEqual(output.shape, (batch, 9, 9, 32))
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from nets import resnet_utils

slim = tf.contrib.slim
resnet_arg_scope = resnet_utils.resnet_arg_scope


@slim.add_arg_scope
def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
               outputs_collections=None, scope=None):
  """Bottleneck residual unit variant with BN before convolutions.

  This is the full preactivation residual unit variant proposed in [2]. See
  Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck
  variant which has an extra bottleneck layer.

  When putting together two consecutive ResNet blocks that use this unit, one
  should use stride = 2 in the last unit of the first block.

  Args:
    inputs: A tensor of size [batch, height, width, channels].
    depth: The depth of the ResNet unit output.
    depth_bottleneck: The depth of the bottleneck layers.
    stride: The ResNet unit's stride. Determines the amount of downsampling of
      the units output compared to its input.
    rate: An integer, rate for atrous convolution.
    outputs_collections: Collection to add the ResNet unit output.
    scope: Optional variable_scope.

  Returns:
    The ResNet unit's output.
  """
  with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
    depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
    # Pre-activation: BN + ReLU are applied before any weight layer.
    preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact')
    if depth == depth_in:
      # Same depth: the shortcut is the (possibly subsampled) raw input.
      shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
    else:
      # Depth changes: project the pre-activated input with a 1x1 conv.
      shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride,
                             normalizer_fn=None, activation_fn=None,
                             scope='shortcut')

    residual = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1,
                           scope='conv1')
    residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
                                        rate=rate, scope='conv2')
    # No normalizer/activation on the last conv: the next unit's 'preact'
    # (or the network's 'postnorm') takes care of it.
    residual = slim.conv2d(residual, depth, [1, 1], stride=1,
                           normalizer_fn=None, activation_fn=None,
                           scope='conv3')

    output = shortcut + residual

    return slim.utils.collect_named_outputs(outputs_collections,
                                            sc.original_name_scope,
                                            output)


def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
  """Generator for v2 (preactivation) ResNet models.

  This function generates a family of ResNet v2 models. See the resnet_v2_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether is training or not.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution. When the root block is
      included it must be a multiple of 4.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it. If excluded, `inputs` should be the
      results of an activation-less convolution.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation. When num_classes is not None it also holds the softmax output
      under the key 'predictions'.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.name + '_end_points'
    with slim.arg_scope([slim.conv2d, bottleneck,
                         resnet_utils.stack_blocks_dense],
                        outputs_collections=end_points_collection):
      with slim.arg_scope([slim.batch_norm], is_training=is_training):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            # The root block (conv1 + pool1) already downsamples by 4. Floor
            # division keeps the remaining stride an integer even though this
            # module enables true division; it is exact because of the
            # multiple-of-4 check above.
            output_stride //= 4
          # We do not include batch normalization or activation functions in
          # conv1 because the first ResNet unit will perform these. Cf.
          # Appendix of [2].
          with slim.arg_scope([slim.conv2d],
                              activation_fn=None, normalizer_fn=None):
            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        # This is needed because the pre-activation variant does not have batch
        # normalization or activation functions in the residual unit output. See
        # Appendix of [2].
        net = slim.batch_norm(net, activation_fn=tf.nn.relu, scope='postnorm')
        if global_pool:
          # Global average pooling.
          net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
        if num_classes is not None:
          net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                            normalizer_fn=None, scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = slim.utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = slim.softmax(net, scope='predictions')
        return net, end_points
resnet_v2.default_image_size = 224


def _imagenet_blocks(block2_units, block3_units):
  """Builds the four-block bottleneck layout shared by resnet_v2_*().

  All depth variants use the same block1 and block4; they differ only in how
  many stride-1 units precede the final stride-2 unit of block2 and block3.

  Args:
    block2_units: Number of stride-1 units in 'block2'.
    block3_units: Number of stride-1 units in 'block3'.

  Returns:
    A list of four resnet_utils.Block objects.
  """
  return [
      resnet_utils.Block(
          'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]),
      resnet_utils.Block(
          'block2', bottleneck,
          [(512, 128, 1)] * block2_units + [(512, 128, 2)]),
      resnet_utils.Block(
          'block3', bottleneck,
          [(1024, 256, 1)] * block3_units + [(1024, 256, 2)]),
      resnet_utils.Block(
          'block4', bottleneck, [(2048, 512, 1)] * 3)]


def resnet_v2_50(inputs,
                 num_classes=None,
                 is_training=True,
                 global_pool=True,
                 output_stride=None,
                 reuse=None,
                 scope='resnet_v2_50'):
  """ResNet-50 model of [1]. See resnet_v2() for arg and return description."""
  return resnet_v2(inputs, _imagenet_blocks(3, 5), num_classes,
                   is_training=is_training, global_pool=global_pool,
                   output_stride=output_stride, include_root_block=True,
                   reuse=reuse, scope=scope)


def resnet_v2_101(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  scope='resnet_v2_101'):
  """ResNet-101 model of [1]. See resnet_v2() for arg and return description."""
  return resnet_v2(inputs, _imagenet_blocks(3, 22), num_classes,
                   is_training=is_training, global_pool=global_pool,
                   output_stride=output_stride, include_root_block=True,
                   reuse=reuse, scope=scope)


def resnet_v2_152(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  scope='resnet_v2_152'):
  """ResNet-152 model of [1]. See resnet_v2() for arg and return description."""
  return resnet_v2(inputs, _imagenet_blocks(7, 35), num_classes,
                   is_training=is_training, global_pool=global_pool,
                   output_stride=output_stride, include_root_block=True,
                   reuse=reuse, scope=scope)


def resnet_v2_200(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
                  scope='resnet_v2_200'):
  """ResNet-200 model of [2]. See resnet_v2() for arg and return description."""
  return resnet_v2(inputs, _imagenet_blocks(23, 35), num_classes,
                   is_training=is_training, global_pool=global_pool,
                   output_stride=output_stride, include_root_block=True,
                   reuse=reuse, scope=scope)
"""Tests for slim.nets.resnet_v2."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from nets import resnet_utils
from nets import resnet_v2

slim = tf.contrib.slim


def create_test_input(batch_size, height, width, channels):
  """Create test input tensor.

  Args:
    batch_size: The number of images per batch or `None` if unknown.
    height: The height of each image or `None` if unknown.
    width: The width of each image or `None` if unknown.
    channels: The number of channels per image or `None` if unknown.

  Returns:
    A float32 placeholder of shape [batch_size, height, width, channels] if
    any of the arguments is `None`. Otherwise a constant float tensor of that
    shape whose value at spatial position (row, col) is row + col, repeated
    over the batch and channel dimensions.
  """
  dims = (batch_size, height, width, channels)
  if None in dims:
    return tf.placeholder(tf.float32, dims)
  rows = np.reshape(np.arange(height), [height, 1])
  cols = np.reshape(np.arange(width), [1, width])
  grid = np.reshape(rows + cols, [1, height, width, 1])
  return tf.to_float(np.tile(grid, [batch_size, 1, 1, channels]))


class ResnetUtilsTest(tf.test.TestCase):
  """Tests for the resnet_utils helpers using the v2 bottleneck unit."""

  def testSubsampleThreeByThree(self):
    """Subsampling a 3x3 map by 2 keeps the four corner values."""
    grid = tf.reshape(tf.to_float(tf.range(9)), [1, 3, 3, 1])
    subsampled = resnet_utils.subsample(grid, 2)
    wanted = tf.reshape(tf.constant([0, 2, 6, 8]), [1, 2, 2, 1])
    with self.test_session():
      self.assertAllClose(subsampled.eval(), wanted.eval())

  def testSubsampleFourByFour(self):
    """Subsampling a 4x4 map by 2 keeps every other row and column."""
    grid = tf.reshape(tf.to_float(tf.range(16)), [1, 4, 4, 1])
    subsampled = resnet_utils.subsample(grid, 2)
    wanted = tf.reshape(tf.constant([0, 2, 8, 10]), [1, 2, 2, 1])
    with self.test_session():
      self.assertAllClose(subsampled.eval(), wanted.eval())

  def testConv2DSameEven(self):
    """conv2d_same vs. slim.conv2d on an even-sized (4x4) input.

    With stride 2, conv2d_same must match a stride-1 convolution followed by
    subsampling, whereas plain slim.conv2d gives different values.
    """
    size, half = 4, 2

    # Input image.
    image = create_test_input(1, size, size, 1)

    # Convolution kernel.
    kernel = tf.reshape(create_test_input(1, 3, 3, 1), [3, 3, 1, 1])

    # Pre-create the 'Conv' variables so every layer below reuses them.
    tf.get_variable('Conv/weights', initializer=kernel)
    tf.get_variable('Conv/biases', initializer=tf.zeros([1]))
    tf.get_variable_scope().reuse_variables()

    dense = slim.conv2d(image, 1, [3, 3], stride=1, scope='Conv')
    dense_wanted = tf.reshape(
        tf.to_float([[14, 28, 43, 26],
                     [28, 48, 66, 37],
                     [43, 66, 84, 46],
                     [26, 37, 46, 22]]),
        [1, size, size, 1])

    strided = resnet_utils.subsample(dense, 2)
    strided_wanted = tf.reshape(
        tf.to_float([[14, 43],
                     [43, 84]]),
        [1, half, half, 1])

    same = resnet_utils.conv2d_same(image, 1, 3, stride=2, scope='Conv')
    same_wanted = strided_wanted

    plain = slim.conv2d(image, 1, [3, 3], stride=2, scope='Conv')
    plain_wanted = tf.reshape(
        tf.to_float([[48, 37],
                     [37, 22]]),
        [1, half, half, 1])

    checks = [(dense, dense_wanted), (strided, strided_wanted),
              (same, same_wanted), (plain, plain_wanted)]
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      for actual, wanted in checks:
        self.assertAllClose(actual.eval(), wanted.eval())

  def testConv2DSameOdd(self):
    """conv2d_same vs. slim.conv2d on an odd-sized (5x5) input.

    Here all three stride-2 variants are expected to agree.
    """
    size, half = 5, 3

    # Input image.
    image = create_test_input(1, size, size, 1)

    # Convolution kernel.
    kernel = tf.reshape(create_test_input(1, 3, 3, 1), [3, 3, 1, 1])

    # Pre-create the 'Conv' variables so every layer below reuses them.
    tf.get_variable('Conv/weights', initializer=kernel)
    tf.get_variable('Conv/biases', initializer=tf.zeros([1]))
    tf.get_variable_scope().reuse_variables()

    dense = slim.conv2d(image, 1, [3, 3], stride=1, scope='Conv')
    dense_wanted = tf.reshape(
        tf.to_float([[14, 28, 43, 58, 34],
                     [28, 48, 66, 84, 46],
                     [43, 66, 84, 102, 55],
                     [58, 84, 102, 120, 64],
                     [34, 46, 55, 64, 30]]),
        [1, size, size, 1])

    strided = resnet_utils.subsample(dense, 2)
    strided_wanted = tf.reshape(
        tf.to_float([[14, 43, 34],
                     [43, 84, 55],
                     [34, 55, 30]]),
        [1, half, half, 1])

    same = resnet_utils.conv2d_same(image, 1, 3, stride=2, scope='Conv')
    same_wanted = strided_wanted

    plain = slim.conv2d(image, 1, [3, 3], stride=2, scope='Conv')
    plain_wanted = strided_wanted

    checks = [(dense, dense_wanted), (strided, strided_wanted),
              (same, same_wanted), (plain, plain_wanted)]
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      for actual, wanted in checks:
        self.assertAllClose(actual.eval(), wanted.eval())

  def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None):
    """A plain ResNet without extra layers before or after the ResNet blocks."""
    with tf.variable_scope(scope, values=[inputs]), \
         slim.arg_scope([slim.conv2d], outputs_collections='end_points'):
      net = resnet_utils.stack_blocks_dense(inputs, blocks, output_stride)
      collected = dict(tf.get_collection('end_points'))
      return net, collected

  def testEndPointsV2(self):
    """Test the end points of a tiny v2 bottleneck network."""
    unit_fn = resnet_v2.bottleneck
    blocks = [
        resnet_utils.Block('block1', unit_fn, [(4, 1, 1), (4, 1, 2)]),
        resnet_utils.Block('block2', unit_fn, [(8, 2, 1), (8, 2, 1)])]
    images = create_test_input(2, 32, 16, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_plain(images, blocks, scope='tiny')
    # Only the first unit of each block changes depth, so only those units
    # are expected to report a 'shortcut' convolution.
    wanted = [
        'tiny/block1/unit_1/bottleneck_v2/shortcut',
        'tiny/block1/unit_1/bottleneck_v2/conv1',
        'tiny/block1/unit_1/bottleneck_v2/conv2',
        'tiny/block1/unit_1/bottleneck_v2/conv3',
        'tiny/block1/unit_2/bottleneck_v2/conv1',
        'tiny/block1/unit_2/bottleneck_v2/conv2',
        'tiny/block1/unit_2/bottleneck_v2/conv3',
        'tiny/block2/unit_1/bottleneck_v2/shortcut',
        'tiny/block2/unit_1/bottleneck_v2/conv1',
        'tiny/block2/unit_1/bottleneck_v2/conv2',
        'tiny/block2/unit_1/bottleneck_v2/conv3',
        'tiny/block2/unit_2/bottleneck_v2/conv1',
        'tiny/block2/unit_2/bottleneck_v2/conv2',
        'tiny/block2/unit_2/bottleneck_v2/conv3']
    self.assertItemsEqual(wanted, end_points)

  def _stack_blocks_nondense(self, net, blocks):
    """A simplified ResNet Block stacker without output stride control."""
    for block in blocks:
      with tf.variable_scope(block.scope, 'block', [net]):
        for index, (depth, depth_bottleneck, stride) in enumerate(block.args):
          with tf.variable_scope('unit_%d' % (index + 1), values=[net]):
            net = block.unit_fn(net,
                                depth=depth,
                                depth_bottleneck=depth_bottleneck,
                                stride=stride,
                                rate=1)
    return net

  def _atrousValues(self, bottleneck):
    """Verify the values of dense feature extraction by atrous convolution.

    Make sure that dense feature extraction by stack_blocks_dense() followed by
    subsampling gives identical results to feature extraction at the nominal
    network output stride using the simple self._stack_blocks_nondense() above.

    Args:
      bottleneck: The bottleneck function.
    """
    blocks = [
        resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]),
        resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 2)]),
        resnet_utils.Block('block3', bottleneck, [(16, 4, 1), (16, 4, 2)]),
        resnet_utils.Block('block4', bottleneck, [(32, 8, 1), (32, 8, 1)])
    ]
    nominal_stride = 8

    # Test both odd and even input dimensions.
    height, width = 30, 31
    with slim.arg_scope(resnet_utils.resnet_arg_scope()), \
         slim.arg_scope([slim.batch_norm], is_training=False):
      for output_stride in [1, 2, 4, 8, None]:
        with tf.Graph().as_default(), self.test_session() as sess:
          tf.set_random_seed(0)
          images = create_test_input(1, height, width, 3)
          # Dense feature extraction followed by subsampling.
          dense = resnet_utils.stack_blocks_dense(images,
                                                  blocks,
                                                  output_stride)
          factor = (1 if output_stride is None
                    else nominal_stride // output_stride)
          dense = resnet_utils.subsample(dense, factor)
          # Make the two networks use the same weights.
          tf.get_variable_scope().reuse_variables()
          # Feature extraction at the nominal network rate.
          nominal = self._stack_blocks_nondense(images, blocks)
          sess.run(tf.global_variables_initializer())
          dense_val, nominal_val = sess.run([dense, nominal])
          self.assertAllClose(dense_val, nominal_val, atol=1e-4, rtol=1e-4)

  def testAtrousValuesBottleneck(self):
    """Runs the atrous value check with the v2 bottleneck unit."""
    self._atrousValues(resnet_v2.bottleneck)


class ResnetCompleteNetworkTest(tf.test.TestCase):
  """Tests with complete small ResNet v2 networks."""

  def _resnet_small(self,
                    inputs,
                    num_classes=None,
                    is_training=True,
                    global_pool=True,
                    output_stride=None,
                    include_root_block=True,
                    reuse=None,
                    scope='resnet_v2_small'):
    """A shallow and thin ResNet v2 for faster tests."""
    unit_fn = resnet_v2.bottleneck
    block_specs = [
        ('block1', [(4, 1, 1)] * 2 + [(4, 1, 2)]),
        ('block2', [(8, 2, 1)] * 2 + [(8, 2, 2)]),
        ('block3', [(16, 4, 1)] * 2 + [(16, 4, 2)]),
        ('block4', [(32, 8, 1)] * 2)]
    blocks = [resnet_utils.Block(name, unit_fn, units)
              for name, units in block_specs]
    return resnet_v2.resnet_v2(inputs, blocks, num_classes,
                               is_training=is_training,
                               global_pool=global_pool,
                               output_stride=output_stride,
                               include_root_block=include_root_block,
                               reuse=reuse,
                               scope=scope)

  def testClassificationEndPoints(self):
    """Checks the logits op name and the logits/predictions shapes."""
    num_classes = 10
    images = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, end_points = self._resnet_small(images, num_classes,
                                              global_pool=True,
                                              scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [2, 1, 1, num_classes])
    self.assertTrue('predictions' in end_points)
    self.assertListEqual(end_points['predictions'].get_shape().as_list(),
                         [2, 1, 1, num_classes])

  def testClassificationShapes(self):
    """Checks block end point shapes for a 224x224 classification network."""
    images = create_test_input(2, 224, 224, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(images, 10,
                                         global_pool=True,
                                         scope='resnet')
      expected_shapes = {
          'resnet/block1': [2, 28, 28, 4],
          'resnet/block2': [2, 14, 14, 8],
          'resnet/block3': [2, 7, 7, 16],
          'resnet/block4': [2, 7, 7, 32]}
      for name, shape in expected_shapes.items():
        self.assertListEqual(end_points[name].get_shape().as_list(), shape)

  def testFullyConvolutionalEndpointShapes(self):
    """Checks block end point shapes for a 321x321 input without pooling."""
    images = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(images, 10,
                                         global_pool=False,
                                         scope='resnet')
      expected_shapes = {
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 21, 21, 8],
          'resnet/block3': [2, 11, 11, 16],
          'resnet/block4': [2, 11, 11, 32]}
      for name, shape in expected_shapes.items():
        self.assertListEqual(end_points[name].get_shape().as_list(), shape)

  def testRootlessFullyConvolutionalEndpointShapes(self):
    """Checks block end point shapes when the root block is left out."""
    images = create_test_input(2, 128, 128, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(images, 10,
                                         global_pool=False,
                                         include_root_block=False,
                                         scope='resnet')
      expected_shapes = {
          'resnet/block1': [2, 64, 64, 4],
          'resnet/block2': [2, 32, 32, 8],
          'resnet/block3': [2, 16, 16, 16],
          'resnet/block4': [2, 16, 16, 32]}
      for name, shape in expected_shapes.items():
        self.assertListEqual(end_points[name].get_shape().as_list(), shape)

  def testAtrousFullyConvolutionalEndpointShapes(self):
    """Checks block end point shapes when output_stride=8 is requested."""
    images = create_test_input(2, 321, 321, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      _, end_points = self._resnet_small(images, 10,
                                         global_pool=False,
                                         output_stride=8,
                                         scope='resnet')
      expected_shapes = {
          'resnet/block1': [2, 41, 41, 4],
          'resnet/block2': [2, 41, 41, 8],
          'resnet/block3': [2, 41, 41, 16],
          'resnet/block4': [2, 41, 41, 32]}
      for name, shape in expected_shapes.items():
        self.assertListEqual(end_points[name].get_shape().as_list(), shape)

  def testAtrousFullyConvolutionalValues(self):
    """Verify dense feature extraction with atrous convolution."""
    nominal_stride = 32
    for output_stride in [4, 8, 16, 32, None]:
      with slim.arg_scope(resnet_utils.resnet_arg_scope()), \
           tf.Graph().as_default(), \
           self.test_session() as sess:
        tf.set_random_seed(0)
        images = create_test_input(2, 81, 81, 3)
        # Dense feature extraction followed by subsampling.
        dense, _ = self._resnet_small(images, None,
                                      is_training=False,
                                      global_pool=False,
                                      output_stride=output_stride)
        factor = (1 if output_stride is None
                  else nominal_stride // output_stride)
        dense = resnet_utils.subsample(dense, factor)
        # Make the two networks use the same weights.
        tf.get_variable_scope().reuse_variables()
        # Feature extraction at the nominal network rate.
        nominal, _ = self._resnet_small(images, None,
                                        is_training=False,
                                        global_pool=False)
        sess.run(tf.global_variables_initializer())
        self.assertAllClose(dense.eval(), nominal.eval(),
                            atol=1e-4, rtol=1e-4)

  def testUnknownBatchSize(self):
    """Builds with an unknown batch dimension and feeds a batch of two."""
    batch = 2
    height, width = 65, 65
    num_classes = 10
    placeholder = create_test_input(None, height, width, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      logits, _ = self._resnet_small(placeholder, num_classes,
                                     global_pool=True,
                                     scope='resnet')
    self.assertTrue(logits.op.name.startswith('resnet/logits'))
    self.assertListEqual(logits.get_shape().as_list(),
                         [None, 1, 1, num_classes])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      result = sess.run(logits, {placeholder: images.eval()})
      self.assertEqual(result.shape, (batch, 1, 1, num_classes))

  def testFullyConvolutionalUnknownHeightWidth(self):
    """Builds with unknown spatial dimensions and feeds 65x65 images."""
    batch = 2
    height, width = 65, 65
    placeholder = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      features, _ = self._resnet_small(placeholder, None, global_pool=False)
    self.assertListEqual(features.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      result = sess.run(features, {placeholder: images.eval()})
      self.assertEqual(result.shape, (batch, 3, 3, 32))

  def testAtrousFullyConvolutionalUnknownHeightWidth(self):
    """Same as above, but with output_stride=8 (9x9 feature maps)."""
    batch = 2
    height, width = 65, 65
    placeholder = create_test_input(batch, None, None, 3)
    with slim.arg_scope(resnet_utils.resnet_arg_scope()):
      features, _ = self._resnet_small(placeholder,
                                       None,
                                       global_pool=False,
                                       output_stride=8)
    self.assertListEqual(features.get_shape().as_list(),
                         [batch, None, None, 32])
    images = create_test_input(batch, height, width, 3)
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      result = sess.run(features, {placeholder: images.eval()})
      self.assertEqual(result.shape, (batch, 9, 9, 32))
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

slim = tf.contrib.slim


def vgg_arg_scope(weight_decay=0.0005):
  """Defines the VGG arg scope.

  conv2d and fully_connected layers get ReLU activations, L2 weight
  regularization and zero-initialized biases; conv2d additionally defaults to
  'SAME' padding.

  Args:
    weight_decay: The l2 regularization coefficient.

  Returns:
    An arg_scope.
  """
  regularizer = slim.l2_regularizer(weight_decay)
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      activation_fn=tf.nn.relu,
                      weights_regularizer=regularizer,
                      biases_initializer=tf.zeros_initializer()):
    with slim.arg_scope([slim.conv2d], padding='SAME') as scope_with_padding:
      return scope_with_padding


def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=False,
          scope='vgg_a'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    net: the output of the final 'fc8' layer (no activation is applied to it),
      with dimensions 1 and 2 squeezed out when spatial_squeeze is True.
    end_points: a dict of the collected layer outputs. When spatial_squeeze is
      True the '<scope>/fc8' entry is replaced by the squeezed tensor.
  """
  # (number of conv layers, depth) for conv1..conv5; each stage is followed
  # by a 2x2 max-pool named pool1..pool5.
  conv_stages = [(1, 64), (1, 128), (2, 256), (2, 512), (2, 512)]
  with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect the outputs of conv2d and max_pool2d layers.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      net = inputs
      for stage, (num_convs, depth) in enumerate(conv_stages, 1):
        net = slim.repeat(net, num_convs, slim.conv2d, depth, [3, 3],
                          scope='conv%d' % stage)
        net = slim.max_pool2d(net, [2, 2], scope='pool%d' % stage)
      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout7')
      net = slim.conv2d(net, num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='fc8')
      # Convert end_points_collection into a end_point dict.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
vgg_a.default_image_size = 224
dropout_keep_prob: the probability that activations are kept in the dropout layers during training. spatial_squeeze: whether or not should squeeze the spatial dimensions of the outputs. Useful to remove unnecessary dimensions for classification. scope: Optional scope for the variables. Returns: the last op containing the log predictions and end_points dict. """ with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc: end_points_collection = sc.name + '_end_points' # Collect outputs for conv2d, fully_connected and max_pool2d. with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], outputs_collections=end_points_collection): net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') net = slim.max_pool2d(net, [2, 2], scope='pool1') net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') net = slim.max_pool2d(net, [2, 2], scope='pool2') net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3') net = slim.max_pool2d(net, [2, 2], scope='pool3') net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4') net = slim.max_pool2d(net, [2, 2], scope='pool4') # net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5') # rgirdhar: remove the relu from last layer net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5') net = slim.conv2d(net, 512, [3, 3], activation_fn=None, scope='conv5/conv5_3') conv5_output = net net = tf.nn.relu(net) net = slim.max_pool2d(net, [2, 2], scope='pool5') # Use conv2d instead of fully_connected layers. net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6') net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6') net = slim.conv2d(net, 4096, [1, 1], scope='fc7') net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7') net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='fc8') # Convert end_points_collection into a end_point dict. 
end_points = slim.utils.convert_collection_to_dict(end_points_collection) end_points['vgg_16/conv5'] = conv5_output if spatial_squeeze: net = tf.squeeze(net, [1, 2], name='fc8/squeezed') end_points[sc.name + '/fc8'] = net return net, end_points vgg_16.default_image_size = 224 def vgg_19(inputs, num_classes=1000, is_training=True, dropout_keep_prob=0.5, spatial_squeeze=False, scope='vgg_19'): """Oxford Net VGG 19-Layers version E Example. Note: All the fully_connected layers have been transformed to conv2d layers. To use in classification mode, resize input to 224x224. Args: inputs: a tensor of size [batch_size, height, width, channels]. num_classes: number of predicted classes. is_training: whether or not the model is being trained. dropout_keep_prob: the probability that activations are kept in the dropout layers during training. spatial_squeeze: whether or not should squeeze the spatial dimensions of the outputs. Useful to remove unnecessary dimensions for classification. scope: Optional scope for the variables. Returns: the last op containing the log predictions and end_points dict. """ with tf.variable_scope(scope, 'vgg_19', [inputs]) as sc: end_points_collection = sc.name + '_end_points' # Collect outputs for conv2d, fully_connected and max_pool2d. 
with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], outputs_collections=end_points_collection): net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') net = slim.max_pool2d(net, [2, 2], scope='pool1') net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') net = slim.max_pool2d(net, [2, 2], scope='pool2') net = slim.repeat(net, 4, slim.conv2d, 256, [3, 3], scope='conv3') net = slim.max_pool2d(net, [2, 2], scope='pool3') net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv4') net = slim.max_pool2d(net, [2, 2], scope='pool4') net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv5') net = slim.max_pool2d(net, [2, 2], scope='pool5') # Use conv2d instead of fully_connected layers. net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6') net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6') net = slim.conv2d(net, 4096, [1, 1], scope='fc7') net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7') net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='fc8') # Convert end_points_collection into a end_point dict. end_points = slim.utils.convert_collection_to_dict(end_points_collection) if spatial_squeeze: net = tf.squeeze(net, [1, 2], name='fc8/squeezed') end_points[sc.name + '/fc8'] = net return net, end_points vgg_19.default_image_size = 224 # Alias vgg_d = vgg_16 vgg_e = vgg_19 ================================================ FILE: models/slim/nets/vgg_test.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Tests for slim.nets.vgg.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from nets import vgg slim = tf.contrib.slim class VGGATest(tf.test.TestCase): def testBuild(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_a(inputs, num_classes) self.assertEquals(logits.op.name, 'vgg_a/fc8/squeezed') self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) def testFullyConvolutional(self): batch_size = 1 height, width = 256, 256 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_a(inputs, num_classes, spatial_squeeze=False) self.assertEquals(logits.op.name, 'vgg_a/fc8/BiasAdd') self.assertListEqual(logits.get_shape().as_list(), [batch_size, 2, 2, num_classes]) def testEndPoints(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = vgg.vgg_a(inputs, num_classes) expected_names = ['vgg_a/conv1/conv1_1', 'vgg_a/pool1', 'vgg_a/conv2/conv2_1', 'vgg_a/pool2', 'vgg_a/conv3/conv3_1', 'vgg_a/conv3/conv3_2', 'vgg_a/pool3', 'vgg_a/conv4/conv4_1', 'vgg_a/conv4/conv4_2', 'vgg_a/pool4', 'vgg_a/conv5/conv5_1', 'vgg_a/conv5/conv5_2', 'vgg_a/pool5', 'vgg_a/fc6', 'vgg_a/fc7', 'vgg_a/fc8' ] 
self.assertSetEqual(set(end_points.keys()), set(expected_names)) def testModelVariables(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) vgg.vgg_a(inputs, num_classes) expected_names = ['vgg_a/conv1/conv1_1/weights', 'vgg_a/conv1/conv1_1/biases', 'vgg_a/conv2/conv2_1/weights', 'vgg_a/conv2/conv2_1/biases', 'vgg_a/conv3/conv3_1/weights', 'vgg_a/conv3/conv3_1/biases', 'vgg_a/conv3/conv3_2/weights', 'vgg_a/conv3/conv3_2/biases', 'vgg_a/conv4/conv4_1/weights', 'vgg_a/conv4/conv4_1/biases', 'vgg_a/conv4/conv4_2/weights', 'vgg_a/conv4/conv4_2/biases', 'vgg_a/conv5/conv5_1/weights', 'vgg_a/conv5/conv5_1/biases', 'vgg_a/conv5/conv5_2/weights', 'vgg_a/conv5/conv5_2/biases', 'vgg_a/fc6/weights', 'vgg_a/fc6/biases', 'vgg_a/fc7/weights', 'vgg_a/fc7/biases', 'vgg_a/fc8/weights', 'vgg_a/fc8/biases', ] model_variables = [v.op.name for v in slim.get_model_variables()] self.assertSetEqual(set(model_variables), set(expected_names)) def testEvaluation(self): batch_size = 2 height, width = 224, 224 num_classes = 1000 with self.test_session(): eval_inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_a(eval_inputs, is_training=False) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) predictions = tf.argmax(logits, 1) self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) def testTrainEvalWithReuse(self): train_batch_size = 2 eval_batch_size = 1 train_height, train_width = 224, 224 eval_height, eval_width = 256, 256 num_classes = 1000 with self.test_session(): train_inputs = tf.random_uniform( (train_batch_size, train_height, train_width, 3)) logits, _ = vgg.vgg_a(train_inputs) self.assertListEqual(logits.get_shape().as_list(), [train_batch_size, num_classes]) tf.get_variable_scope().reuse_variables() eval_inputs = tf.random_uniform( (eval_batch_size, eval_height, eval_width, 3)) logits, _ = vgg.vgg_a(eval_inputs, 
is_training=False, spatial_squeeze=False) self.assertListEqual(logits.get_shape().as_list(), [eval_batch_size, 2, 2, num_classes]) logits = tf.reduce_mean(logits, [1, 2]) predictions = tf.argmax(logits, 1) self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) def testForward(self): batch_size = 1 height, width = 224, 224 with self.test_session() as sess: inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_a(inputs) sess.run(tf.global_variables_initializer()) output = sess.run(logits) self.assertTrue(output.any()) class VGG16Test(tf.test.TestCase): def testBuild(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_16(inputs, num_classes) self.assertEquals(logits.op.name, 'vgg_16/fc8/squeezed') self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) def testFullyConvolutional(self): batch_size = 1 height, width = 256, 256 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_16(inputs, num_classes, spatial_squeeze=False) self.assertEquals(logits.op.name, 'vgg_16/fc8/BiasAdd') self.assertListEqual(logits.get_shape().as_list(), [batch_size, 2, 2, num_classes]) def testEndPoints(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = vgg.vgg_16(inputs, num_classes) expected_names = ['vgg_16/conv1/conv1_1', 'vgg_16/conv1/conv1_2', 'vgg_16/pool1', 'vgg_16/conv2/conv2_1', 'vgg_16/conv2/conv2_2', 'vgg_16/pool2', 'vgg_16/conv3/conv3_1', 'vgg_16/conv3/conv3_2', 'vgg_16/conv3/conv3_3', 'vgg_16/pool3', 'vgg_16/conv4/conv4_1', 'vgg_16/conv4/conv4_2', 'vgg_16/conv4/conv4_3', 'vgg_16/pool4', 'vgg_16/conv5/conv5_1', 'vgg_16/conv5/conv5_2', 'vgg_16/conv5/conv5_3', 'vgg_16/pool5', 'vgg_16/fc6', 'vgg_16/fc7', 'vgg_16/fc8' ] 
self.assertSetEqual(set(end_points.keys()), set(expected_names)) def testModelVariables(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) vgg.vgg_16(inputs, num_classes) expected_names = ['vgg_16/conv1/conv1_1/weights', 'vgg_16/conv1/conv1_1/biases', 'vgg_16/conv1/conv1_2/weights', 'vgg_16/conv1/conv1_2/biases', 'vgg_16/conv2/conv2_1/weights', 'vgg_16/conv2/conv2_1/biases', 'vgg_16/conv2/conv2_2/weights', 'vgg_16/conv2/conv2_2/biases', 'vgg_16/conv3/conv3_1/weights', 'vgg_16/conv3/conv3_1/biases', 'vgg_16/conv3/conv3_2/weights', 'vgg_16/conv3/conv3_2/biases', 'vgg_16/conv3/conv3_3/weights', 'vgg_16/conv3/conv3_3/biases', 'vgg_16/conv4/conv4_1/weights', 'vgg_16/conv4/conv4_1/biases', 'vgg_16/conv4/conv4_2/weights', 'vgg_16/conv4/conv4_2/biases', 'vgg_16/conv4/conv4_3/weights', 'vgg_16/conv4/conv4_3/biases', 'vgg_16/conv5/conv5_1/weights', 'vgg_16/conv5/conv5_1/biases', 'vgg_16/conv5/conv5_2/weights', 'vgg_16/conv5/conv5_2/biases', 'vgg_16/conv5/conv5_3/weights', 'vgg_16/conv5/conv5_3/biases', 'vgg_16/fc6/weights', 'vgg_16/fc6/biases', 'vgg_16/fc7/weights', 'vgg_16/fc7/biases', 'vgg_16/fc8/weights', 'vgg_16/fc8/biases', ] model_variables = [v.op.name for v in slim.get_model_variables()] self.assertSetEqual(set(model_variables), set(expected_names)) def testEvaluation(self): batch_size = 2 height, width = 224, 224 num_classes = 1000 with self.test_session(): eval_inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_16(eval_inputs, is_training=False) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) predictions = tf.argmax(logits, 1) self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) def testTrainEvalWithReuse(self): train_batch_size = 2 eval_batch_size = 1 train_height, train_width = 224, 224 eval_height, eval_width = 256, 256 num_classes = 1000 with self.test_session(): train_inputs = 
tf.random_uniform( (train_batch_size, train_height, train_width, 3)) logits, _ = vgg.vgg_16(train_inputs) self.assertListEqual(logits.get_shape().as_list(), [train_batch_size, num_classes]) tf.get_variable_scope().reuse_variables() eval_inputs = tf.random_uniform( (eval_batch_size, eval_height, eval_width, 3)) logits, _ = vgg.vgg_16(eval_inputs, is_training=False, spatial_squeeze=False) self.assertListEqual(logits.get_shape().as_list(), [eval_batch_size, 2, 2, num_classes]) logits = tf.reduce_mean(logits, [1, 2]) predictions = tf.argmax(logits, 1) self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) def testForward(self): batch_size = 1 height, width = 224, 224 with self.test_session() as sess: inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_16(inputs) sess.run(tf.global_variables_initializer()) output = sess.run(logits) self.assertTrue(output.any()) class VGG19Test(tf.test.TestCase): def testBuild(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_19(inputs, num_classes) self.assertEquals(logits.op.name, 'vgg_19/fc8/squeezed') self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) def testFullyConvolutional(self): batch_size = 1 height, width = 256, 256 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_19(inputs, num_classes, spatial_squeeze=False) self.assertEquals(logits.op.name, 'vgg_19/fc8/BiasAdd') self.assertListEqual(logits.get_shape().as_list(), [batch_size, 2, 2, num_classes]) def testEndPoints(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) _, end_points = vgg.vgg_19(inputs, num_classes) expected_names = [ 'vgg_19/conv1/conv1_1', 'vgg_19/conv1/conv1_2', 'vgg_19/pool1', 
'vgg_19/conv2/conv2_1', 'vgg_19/conv2/conv2_2', 'vgg_19/pool2', 'vgg_19/conv3/conv3_1', 'vgg_19/conv3/conv3_2', 'vgg_19/conv3/conv3_3', 'vgg_19/conv3/conv3_4', 'vgg_19/pool3', 'vgg_19/conv4/conv4_1', 'vgg_19/conv4/conv4_2', 'vgg_19/conv4/conv4_3', 'vgg_19/conv4/conv4_4', 'vgg_19/pool4', 'vgg_19/conv5/conv5_1', 'vgg_19/conv5/conv5_2', 'vgg_19/conv5/conv5_3', 'vgg_19/conv5/conv5_4', 'vgg_19/pool5', 'vgg_19/fc6', 'vgg_19/fc7', 'vgg_19/fc8' ] self.assertSetEqual(set(end_points.keys()), set(expected_names)) def testModelVariables(self): batch_size = 5 height, width = 224, 224 num_classes = 1000 with self.test_session(): inputs = tf.random_uniform((batch_size, height, width, 3)) vgg.vgg_19(inputs, num_classes) expected_names = [ 'vgg_19/conv1/conv1_1/weights', 'vgg_19/conv1/conv1_1/biases', 'vgg_19/conv1/conv1_2/weights', 'vgg_19/conv1/conv1_2/biases', 'vgg_19/conv2/conv2_1/weights', 'vgg_19/conv2/conv2_1/biases', 'vgg_19/conv2/conv2_2/weights', 'vgg_19/conv2/conv2_2/biases', 'vgg_19/conv3/conv3_1/weights', 'vgg_19/conv3/conv3_1/biases', 'vgg_19/conv3/conv3_2/weights', 'vgg_19/conv3/conv3_2/biases', 'vgg_19/conv3/conv3_3/weights', 'vgg_19/conv3/conv3_3/biases', 'vgg_19/conv3/conv3_4/weights', 'vgg_19/conv3/conv3_4/biases', 'vgg_19/conv4/conv4_1/weights', 'vgg_19/conv4/conv4_1/biases', 'vgg_19/conv4/conv4_2/weights', 'vgg_19/conv4/conv4_2/biases', 'vgg_19/conv4/conv4_3/weights', 'vgg_19/conv4/conv4_3/biases', 'vgg_19/conv4/conv4_4/weights', 'vgg_19/conv4/conv4_4/biases', 'vgg_19/conv5/conv5_1/weights', 'vgg_19/conv5/conv5_1/biases', 'vgg_19/conv5/conv5_2/weights', 'vgg_19/conv5/conv5_2/biases', 'vgg_19/conv5/conv5_3/weights', 'vgg_19/conv5/conv5_3/biases', 'vgg_19/conv5/conv5_4/weights', 'vgg_19/conv5/conv5_4/biases', 'vgg_19/fc6/weights', 'vgg_19/fc6/biases', 'vgg_19/fc7/weights', 'vgg_19/fc7/biases', 'vgg_19/fc8/weights', 'vgg_19/fc8/biases', ] model_variables = [v.op.name for v in slim.get_model_variables()] self.assertSetEqual(set(model_variables), 
set(expected_names)) def testEvaluation(self): batch_size = 2 height, width = 224, 224 num_classes = 1000 with self.test_session(): eval_inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_19(eval_inputs, is_training=False) self.assertListEqual(logits.get_shape().as_list(), [batch_size, num_classes]) predictions = tf.argmax(logits, 1) self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) def testTrainEvalWithReuse(self): train_batch_size = 2 eval_batch_size = 1 train_height, train_width = 224, 224 eval_height, eval_width = 256, 256 num_classes = 1000 with self.test_session(): train_inputs = tf.random_uniform( (train_batch_size, train_height, train_width, 3)) logits, _ = vgg.vgg_19(train_inputs) self.assertListEqual(logits.get_shape().as_list(), [train_batch_size, num_classes]) tf.get_variable_scope().reuse_variables() eval_inputs = tf.random_uniform( (eval_batch_size, eval_height, eval_width, 3)) logits, _ = vgg.vgg_19(eval_inputs, is_training=False, spatial_squeeze=False) self.assertListEqual(logits.get_shape().as_list(), [eval_batch_size, 2, 2, num_classes]) logits = tf.reduce_mean(logits, [1, 2]) predictions = tf.argmax(logits, 1) self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) def testForward(self): batch_size = 1 height, width = 224, 224 with self.test_session() as sess: inputs = tf.random_uniform((batch_size, height, width, 3)) logits, _ = vgg.vgg_19(inputs) sess.run(tf.global_variables_initializer()) output = sess.run(logits) self.assertTrue(output.any()) if __name__ == '__main__': tf.test.main() ================================================ FILE: models/slim/preprocessing/__init__.py ================================================ ================================================ FILE: models/slim/preprocessing/cifarnet_preprocessing.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides utilities to preprocess images in CIFAR-10. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf _PADDING = 4 slim = tf.contrib.slim def preprocess_for_train(image, output_height, output_width, padding=_PADDING): """Preprocesses the given image for training. Note that the actual resizing scale is sampled from [`resize_size_min`, `resize_size_max`]. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. padding: The amound of padding before and after each dimension of the image. Returns: A preprocessed image. """ tf.image_summary('image', tf.expand_dims(image, 0)) # Transform the image to floats. image = tf.to_float(image) if padding > 0: image = tf.pad(image, [[padding, padding], [padding, padding], [0, 0]]) # Randomly crop a [height, width] section of the image. distorted_image = tf.random_crop(image, [output_height, output_width, 3]) # Randomly flip the image horizontally. distorted_image = tf.image.random_flip_left_right(distorted_image) tf.image_summary('distorted_image', tf.expand_dims(distorted_image, 0)) # Because these operations are not commutative, consider randomizing # the order their operation. 
distorted_image = tf.image.random_brightness(distorted_image, max_delta=63) distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8) # Subtract off the mean and divide by the variance of the pixels. return tf.image.per_image_whitening(distorted_image) def preprocess_for_eval(image, output_height, output_width): """Preprocesses the given image for evaluation. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. Returns: A preprocessed image. """ tf.image_summary('image', tf.expand_dims(image, 0)) # Transform the image to floats. image = tf.to_float(image) # Resize and crop if needed. resized_image = tf.image.resize_image_with_crop_or_pad(image, output_width, output_height) tf.image_summary('resized_image', tf.expand_dims(resized_image, 0)) # Subtract off the mean and divide by the variance of the pixels. return tf.image.per_image_whitening(resized_image) def preprocess_image(image, output_height, output_width, is_training=False): """Preprocesses the given image. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. is_training: `True` if we're preprocessing the image for training and `False` otherwise. Returns: A preprocessed image. """ if is_training: return preprocess_for_train(image, output_height, output_width) else: return preprocess_for_eval(image, output_height, output_width) ================================================ FILE: models/slim/preprocessing/inception_preprocessing.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides utilities to preprocess images for the Inception networks.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from tensorflow.python.ops import control_flow_ops def apply_with_random_selector(x, func, num_cases): """Computes func(x, sel), with sel sampled from [0...num_cases-1]. Args: x: input Tensor. func: Python function to apply. num_cases: Python int32, number of cases to sample sel from. Returns: The result of func(x, sel), where func receives the value of the selector as a python integer, but sel is sampled dynamically. """ sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32) # Pass the real x only to one of the func calls. return control_flow_ops.merge([ func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case) for case in range(num_cases)])[0] def distort_color(image, color_ordering=0, fast_mode=True, scope=None): """Distort the color of a Tensor image. Each color distortion is non-commutative and thus ordering of the color ops matters. Ideally we would randomly permute the ordering of the color ops. Rather then adding that level of complication, we select a distinct ordering of color ops for each preprocessing thread. Args: image: 3-D Tensor containing single image in [0, 1]. color_ordering: Python int, a type of distortion (valid values: 0-3). fast_mode: Avoids slower ops (random_hue and random_contrast) scope: Optional scope for name_scope. 
Returns: 3-D Tensor color-distorted image on range [0, 1] Raises: ValueError: if color_ordering not in [0, 3] """ with tf.name_scope(scope, 'distort_color', [image]): if fast_mode: if color_ordering == 0: image = tf.image.random_brightness(image, max_delta=32. / 255.) image = tf.image.random_saturation(image, lower=0.5, upper=1.5) else: image = tf.image.random_saturation(image, lower=0.5, upper=1.5) image = tf.image.random_brightness(image, max_delta=32. / 255.) else: if color_ordering == 0: image = tf.image.random_brightness(image, max_delta=32. / 255.) image = tf.image.random_saturation(image, lower=0.5, upper=1.5) image = tf.image.random_hue(image, max_delta=0.2) image = tf.image.random_contrast(image, lower=0.5, upper=1.5) elif color_ordering == 1: image = tf.image.random_saturation(image, lower=0.5, upper=1.5) image = tf.image.random_brightness(image, max_delta=32. / 255.) image = tf.image.random_contrast(image, lower=0.5, upper=1.5) image = tf.image.random_hue(image, max_delta=0.2) elif color_ordering == 2: image = tf.image.random_contrast(image, lower=0.5, upper=1.5) image = tf.image.random_hue(image, max_delta=0.2) image = tf.image.random_brightness(image, max_delta=32. / 255.) image = tf.image.random_saturation(image, lower=0.5, upper=1.5) elif color_ordering == 3: image = tf.image.random_hue(image, max_delta=0.2) image = tf.image.random_saturation(image, lower=0.5, upper=1.5) image = tf.image.random_contrast(image, lower=0.5, upper=1.5) image = tf.image.random_brightness(image, max_delta=32. / 255.) else: raise ValueError('color_ordering must be in [0, 3]') # The random_* ops do not necessarily clamp. return tf.clip_by_value(image, 0.0, 1.0) def distorted_bounding_box_crop(image, bbox, min_object_covered=0.1, aspect_ratio_range=(0.75, 1.33), # area_range=(0.05, 1.0), area_range=(0.85, 1.0), max_attempts=100, scope=None): """Generates cropped_image using a one of the bboxes randomly distorted. 
See `tf.image.sample_distorted_bounding_box` for more documentation. Args: image: 3-D Tensor of image (it will be converted to floats in [0, 1]). bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. If num_boxes is 0 then it would use the whole image. min_object_covered: An optional `float`. Defaults to `0.1`. The cropped area of the image must contain at least this fraction of any bounding box supplied. aspect_ratio_range: An optional list of `floats`. The cropped area of the image must have an aspect ratio = width / height within this range. area_range: An optional list of `floats`. The cropped area of the image must contain a fraction of the supplied image within in this range. max_attempts: An optional `int`. Number of attempts at generating a cropped region of the image of the specified constraints. After `max_attempts` failures, return the entire image. scope: Optional scope for name_scope. Returns: A tuple, a 3-D Tensor cropped_image and the distorted bbox """ with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bbox]): # Each bounding box has shape [1, num_boxes, box coords] and # the coordinates are ordered [ymin, xmin, ymax, xmax]. # A large fraction of image datasets contain a human-annotated bounding # box delineating the region of the image containing the object of interest. # We choose to create a new bounding box for the object which is a randomly # distorted version of the human-annotated bounding box that obeys an # allowed range of aspect ratios, sizes and overlap with the human-annotated # bounding box. If no box is supplied, then we assume the bounding box is # the entire image. 
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( tf.shape(image), bounding_boxes=bbox, min_object_covered=min_object_covered, aspect_ratio_range=aspect_ratio_range, area_range=area_range, max_attempts=max_attempts, use_image_if_no_bounding_boxes=True) bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box # Crop the image to the specified bounding box. cropped_image = tf.slice(image, bbox_begin, bbox_size) return cropped_image, distort_bbox def preprocess_for_train(image, height, width, bbox, fast_mode=True, scope=None): """Distort one image for training a network. Distorting images provides a useful technique for augmenting the data set during training in order to make the network invariant to aspects of the image that do not effect the label. Additionally it would create image_summaries to display the different transformations applied to the image. Args: image: 3-D Tensor of image. If dtype is tf.float32 then the range should be [0, 1], otherwise it would converted to tf.float32 assuming that the range is [0, MAX], where MAX is largest positive representable number for int(8/16/32) data type (see `tf.image.convert_image_dtype` for details). height: integer width: integer bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. fast_mode: Optional boolean, if True avoids slower transformations (i.e. bi-cubic resizing, random_hue or random_contrast). scope: Optional scope for name_scope. Returns: 3-D float Tensor of distorted image used for training with range [-1, 1]. 
""" with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]): if bbox is None: bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) if image.dtype != tf.float32: image = tf.image.convert_image_dtype(image, dtype=tf.float32) # Each bounding box has shape [1, num_boxes, box coords] and # the coordinates are ordered [ymin, xmin, ymax, xmax]. # image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), # bbox) # tf.summary.image('image_with_bounding_boxes', image_with_box) image_channels = image.get_shape().as_list()[-1] distorted_image, distorted_bbox = distorted_bounding_box_crop(image, bbox) # Restore the shape since the dynamic slice based upon the bbox_size loses # the third dimension. distorted_image.set_shape([None, None, image_channels]) # image_with_distorted_box = tf.image.draw_bounding_boxes( # tf.expand_dims(image, 0), distorted_bbox) # tf.summary.image('images_with_distorted_bounding_box', # image_with_distorted_box) # This resizing operation may distort the images because the aspect # ratio is not respected. We select a resize method in a round robin # fashion based on the thread number. # Note that ResizeMethod contains 4 enumerated resizing methods. # We select only 1 case for fast_mode bilinear. num_resize_cases = 1 if fast_mode else 4 distorted_image = apply_with_random_selector( distorted_image, lambda x, method: tf.image.resize_images(x, [height, width], method=method), num_cases=num_resize_cases) # tf.summary.image('cropped_resized_image', # tf.expand_dims(distorted_image, 0)) # Randomly flip the image horizontally. distorted_image = tf.image.random_flip_left_right(distorted_image) # Randomly distort the colors. There are 4 ways to do it. 
# rgirdhar: Stop distorting colors # distorted_image = apply_with_random_selector( # distorted_image, # lambda x, ordering: distort_color(x, ordering, fast_mode), # num_cases=4) # tf.summary.image('final_distorted_image', # tf.expand_dims(distorted_image, 0)) distorted_image -= 0.5 distorted_image *= 2.0 return distorted_image def preprocess_for_eval(image, height, width, central_fraction=0.875, scope=None): """Prepare one image for evaluation. If height and width are specified it would output an image with that size by applying resize_bilinear. If central_fraction is specified it would cropt the central fraction of the input image. Args: image: 3-D Tensor of image. If dtype is tf.float32 then the range should be [0, 1], otherwise it would converted to tf.float32 assuming that the range is [0, MAX], where MAX is largest positive representable number for int(8/16/32) data type (see `tf.image.convert_image_dtype` for details) height: integer width: integer central_fraction: Optional Float, fraction of the image to crop. scope: Optional scope for name_scope. Returns: 3-D float Tensor of prepared image. """ with tf.name_scope(scope, 'eval_image', [image, height, width]): if image.dtype != tf.float32: image = tf.image.convert_image_dtype(image, dtype=tf.float32) # Crop the central region of the image with an area containing 87.5% of # the original image. if central_fraction: image = tf.image.central_crop(image, central_fraction=central_fraction) if height and width: # Resize the image to the specified height and width. image = tf.expand_dims(image, 0) image = tf.image.resize_bilinear(image, [height, width], align_corners=False) image = tf.squeeze(image, [0]) image -= 0.5 image *= 2.0 return image def preprocess_image(image, height, width, is_training=False, resize_side_min=None, # this and next are only cos VGG # uses these. No effect here. resize_side_max=None, bbox=None, fast_mode=True): """Pre-process one image for training or evaluation. 
Args: image: 3-D Tensor [height, width, channels] with the image. height: integer, image expected height. width: integer, image expected width. is_training: Boolean. If true it would transform an image for train, otherwise it would transform it for evaluation. bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged as [ymin, xmin, ymax, xmax]. fast_mode: Optional boolean, if True avoids slower transformations. Returns: 3-D float Tensor containing an appropriately scaled image Raises: ValueError: if user does not provide bounding box """ if is_training: return preprocess_for_train(image, height, width, bbox, fast_mode) else: return preprocess_for_eval(image, height, width) ================================================ FILE: models/slim/preprocessing/lenet_preprocessing.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides utilities for preprocessing.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf slim = tf.contrib.slim def preprocess_image(image, output_height, output_width, is_training): """Preprocesses the given image. Args: image: A `Tensor` representing an image of arbitrary size. 
output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. is_training: `True` if we're preprocessing the image for training and `False` otherwise. Returns: A preprocessed image. """ image = tf.to_float(image) image = tf.image.resize_image_with_crop_or_pad( image, output_width, output_height) image = tf.sub(image, 128.0) image = tf.div(image, 128.0) return image ================================================ FILE: models/slim/preprocessing/preprocessing_factory.py ================================================ # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Contains a factory for building various models.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf from preprocessing import cifarnet_preprocessing from preprocessing import inception_preprocessing from preprocessing import lenet_preprocessing from preprocessing import vgg_preprocessing slim = tf.contrib.slim def get_preprocessing(name, is_training=False): """Returns preprocessing_fn(image, height, width, **kwargs). Args: name: The name of the preprocessing function. is_training: `True` if the model is being used for training and `False` otherwise. Returns: preprocessing_fn: A function that preprocessing a single image (pre-batch). 
It has the following signature:
        image = preprocessing_fn(image, output_height, output_width, ...).
      A 3-D image is preprocessed directly; a 4-D input
      ([frames_per_vid, ht, wd, c]) is preprocessed frame by frame and the
      results are stacked back along the first axis.

  Raises:
    ValueError: If Preprocessing `name` is not recognized.
  """
  # Maps a model name to the module whose `preprocess_image` should be used.
  preprocessing_fn_map = {
      'cifarnet': cifarnet_preprocessing,
      'inception': inception_preprocessing,
      'inception_v1': inception_preprocessing,
      'inception_v2': inception_preprocessing,
      'inception_v2_tsn': vgg_preprocessing,  # Its wts are copied from caffe
      'inception_v3': inception_preprocessing,
      'inception_v4': inception_preprocessing,
      'inception_resnet_v2': inception_preprocessing,
      'lenet': lenet_preprocessing,
      'resnet_v1_50': vgg_preprocessing,
      'resnet_v1_101': vgg_preprocessing,
      'resnet_v1_152': vgg_preprocessing,
      'vgg': vgg_preprocessing,
      'vgg_a': vgg_preprocessing,
      'vgg_16': vgg_preprocessing,
      'vgg_19': vgg_preprocessing,
  }

  if name not in preprocessing_fn_map:
    raise ValueError('Preprocessing name [%s] was not recognized' % name)

  def preprocessing_fn(image, output_height, output_width, **kwargs):
    # preprocess 4D images (with [frames_per_vid, ht, wd, c])
    # A 3-D image is temporarily given a leading axis of size 1 so both cases
    # share the unstack/stack path below; `expanded_dim` remembers to strip it.
    expanded_dim = False
    if image.get_shape().ndims == 3:
      expanded_dim = True
      image = tf.expand_dims(image, 0)
    # `preprocess_image` is called once per frame, so every frame builds its
    # own preprocessing ops.
    # NOTE(review): for training-time random preprocessing this presumably
    # means each frame of a video gets an independent crop/flip, and a shared
    # `preproc_info` dict passed through **kwargs is overwritten by each call
    # -- confirm this is intended.
    # NOTE(review): tf.unstack presumably needs the leading dimension to be
    # statically known -- confirm against callers.
    res = tf.stack([preprocessing_fn_map[name].preprocess_image(
        el, output_height, output_width, is_training=is_training, **kwargs)
        for el in tf.unstack(image)])
    if expanded_dim:
      res = res[0]
    return res

  return preprocessing_fn

================================================
FILE: models/slim/preprocessing/vgg_preprocessing.py
================================================
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides utilities to preprocess images. The preprocessing steps for VGG were introduced in the following technical report: Very Deep Convolutional Networks For Large-Scale Image Recognition Karen Simonyan and Andrew Zisserman arXiv technical report, 2015 PDF: http://arxiv.org/pdf/1409.1556.pdf ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf CC-BY-4.0 More information can be obtained from the VGG website: www.robots.ox.ac.uk/~vgg/research/very_deep/ """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf import numpy as np from tensorflow.python.ops import control_flow_ops slim = tf.contrib.slim # _R_MEAN = 123.68 # _G_MEAN = 116.78 # _B_MEAN = 103.94 _MEAN = 128.0 # rgirdhar: changing to this for easier handling of label channel _RESIZE_SIDE_MIN = 512 # _RESIZE_SIDE_MAX = 512 _RESIZE_SIDE_MAX = 512 # for pose, I don't want to loose too much def _crop(image, offset_height, offset_width, crop_height, crop_width): """Crops the given image using the provided offsets and sizes. Note that the method doesn't assume we know the input image size but it does assume we know the input image rank. Args: image: an image of shape [height, width, channels]. offset_height: a scalar tensor indicating the height offset. offset_width: a scalar tensor indicating the width offset. crop_height: the height of the cropped image. crop_width: the width of the cropped image. 
Returns: the cropped (and resized) image. Raises: InvalidArgumentError: if the rank is not 3 or if the image dimensions are less than the crop size. """ original_shape = tf.shape(image) rank_assertion = tf.Assert( tf.equal(tf.rank(image), 3), ['Rank of image must be equal to 3.']) cropped_shape = control_flow_ops.with_dependencies( [rank_assertion], tf.stack([crop_height, crop_width, original_shape[2]])) size_assertion = tf.Assert( tf.logical_and( tf.greater_equal(original_shape[0], crop_height), tf.greater_equal(original_shape[1], crop_width)), ['Crop size greater than the image size.']) offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0])) # Use tf.slice instead of crop_to_bounding box as it accepts tensors to # define the crop size. image = control_flow_ops.with_dependencies( [size_assertion], tf.slice(image, offsets, cropped_shape)) return tf.reshape(image, cropped_shape) def _random_crop(image_list, crop_height, crop_width, preproc_info): """Crops the given list of images. The function applies the same crop to each image in the list. This can be effectively applied when there are multiple image inputs of the same dimension such as: image, depths, normals = _random_crop([image, depths, normals], 120, 150) Args: image_list: a list of image tensors of the same dimension but possibly varying channel. crop_height: the new height. crop_width: the new width. Returns: the image_list with cropped images. Raises: ValueError: if there are multiple image inputs provided with different size or the images are smaller than the crop dimensions. """ if not image_list: raise ValueError('Empty image_list.') # Compute the rank assertions. 
rank_assertions = [] for i in range(len(image_list)): image_rank = tf.rank(image_list[i]) rank_assert = tf.Assert( tf.equal(image_rank, 3), ['Wrong rank for tensor %s [expected] [actual]', image_list[i].name, 3, image_rank]) rank_assertions.append(rank_assert) image_shape = control_flow_ops.with_dependencies( [rank_assertions[0]], tf.shape(image_list[0])) image_height = image_shape[0] image_width = image_shape[1] crop_size_assert = tf.Assert( tf.logical_and( tf.greater_equal(image_height, crop_height), tf.greater_equal(image_width, crop_width)), ['Crop size greater than the image size.']) asserts = [rank_assertions[0], crop_size_assert] for i in range(1, len(image_list)): image = image_list[i] asserts.append(rank_assertions[i]) shape = control_flow_ops.with_dependencies([rank_assertions[i]], tf.shape(image)) height = shape[0] width = shape[1] height_assert = tf.Assert( tf.equal(height, image_height), ['Wrong height for tensor %s [expected][actual]', image.name, height, image_height]) width_assert = tf.Assert( tf.equal(width, image_width), ['Wrong width for tensor %s [expected][actual]', image.name, width, image_width]) asserts.extend([height_assert, width_assert]) # Create a random bounding box. # # Use tf.random_uniform and not numpy.random.rand as doing the former would # generate random numbers at graph eval time, unlike the latter which # generates random numbers at graph definition time. 
max_offset_height = control_flow_ops.with_dependencies( asserts, tf.reshape(image_height - crop_height + 1, [])) max_offset_width = control_flow_ops.with_dependencies( asserts, tf.reshape(image_width - crop_width + 1, [])) offset_height = tf.random_uniform( [], maxval=max_offset_height, dtype=tf.int32) offset_width = tf.random_uniform( [], maxval=max_offset_width, dtype=tf.int32) preproc_info['crop_info'] = [ offset_height, offset_width, crop_height, crop_width] return [_crop(image, offset_height, offset_width, crop_height, crop_width) for image in image_list] def _central_crop(image_list, crop_height, crop_width): """Performs central crops of the given image list. Args: image_list: a list of image tensors of the same dimension but possibly varying channel. crop_height: the height of the image following the crop. crop_width: the width of the image following the crop. Returns: the list of cropped images. """ outputs = [] for image in image_list: image_height = tf.shape(image)[0] image_width = tf.shape(image)[1] offset_height = (image_height - crop_height) / 2 offset_width = (image_width - crop_width) / 2 outputs.append(_crop(image, offset_height, offset_width, crop_height, crop_width)) return outputs def _mean_image_subtraction(image, means): """Subtracts the given means from each image channel. For example: means = [123.68, 116.779, 103.939] image = _mean_image_subtraction(image, means) Note that the rank of `image` must be known. Args: image: a tensor of size [height, width, C]. means: a C-vector of values to subtract from each channel. Returns: the centered image. Raises: ValueError: If the rank of `image` is unknown, if `image` has a rank other than three or if the number of channels in `image` doesn't match the number of values in `means`. 
""" # if image.get_shape().ndims != 3: # raise ValueError('Input must be of size [height, width, C>0]') num_channels = image.get_shape().as_list()[-1] if len(means) != num_channels: raise ValueError('len(means) must match the number of channels') channels = tf.split(image, num_channels, 2) for i in range(num_channels): channels[i] -= means[i] return tf.concat(channels, 2) def _smallest_size_at_least(height, width, smallest_side): """Computes new shape with the smallest side equal to `smallest_side`. Computes new shape with the smallest side equal to `smallest_side` while preserving the original aspect ratio. Args: height: an int32 scalar tensor indicating the current height. width: an int32 scalar tensor indicating the current width. smallest_side: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. Returns: new_height: an int32 scalar tensor indicating the new height. new_width: and int32 scalar tensor indicating the new width. """ smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) height = tf.to_float(height) width = tf.to_float(width) smallest_side = tf.to_float(smallest_side) scale = tf.cond(tf.greater(height, width), lambda: smallest_side / width, lambda: smallest_side / height) new_height = tf.to_int32(height * scale) new_width = tf.to_int32(width * scale) return new_height, new_width def _aspect_preserving_resize(image, smallest_side): """Resize images preserving the original aspect ratio. Args: image: A 3-D image `Tensor`. smallest_side: A python integer or scalar `Tensor` indicating the size of the smallest side after resize. Returns: resized_image: A 3-D tensor containing the resized image. 
""" num_channels = image.get_shape().as_list()[-1] smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32) shape = tf.shape(image) height = shape[0] width = shape[1] new_height, new_width = _smallest_size_at_least(height, width, smallest_side) image = tf.expand_dims(image, 0) resized_image = tf.image.resize_bilinear(image, [new_height, new_width], align_corners=False) resized_image = tf.squeeze(resized_image) resized_image.set_shape([None, None, num_channels]) return resized_image def preprocess_for_train(image, output_height, output_width, resize_side_min=_RESIZE_SIDE_MIN, resize_side_max=_RESIZE_SIDE_MAX, preproc_info={}, modality='rgb'): """Preprocesses the given image for training. Note that the actual resizing scale is sampled from [`resize_size_min`, `resize_size_max`]. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. resize_side_min: The lower bound for the smallest side of the image for aspect-preserving resizing. resize_side_max: The upper bound for the smallest side of the image for aspect-preserving resizing. Returns: A preprocessed image. """ num_channels = image.get_shape().as_list()[-1] resize_side = tf.random_uniform( [], minval=resize_side_min, maxval=resize_side_max+1, dtype=tf.int32) image = _aspect_preserving_resize(image, resize_side) preproc_info['image_shape'] = tf.shape(image) image = _random_crop([image], output_height, output_width, preproc_info)[0] image.set_shape([output_height, output_width, num_channels]) image = tf.to_float(image) image, whether_flip = tf.cond( tf.greater(tf.random_uniform((), 0, 1, tf.float32), 0.5), lambda: tf.tuple([tf.image.flip_left_right(image), tf.constant(True)]), lambda: tf.tuple([image, tf.constant(False)])) if modality.startswith('flow'): tf.logging.info('Subtracting 255-x from X-flow for flips. 
Flow input.') assert(num_channels % 2 == 0) flow_img_flip = image alt_mat = np.ones([ image.get_shape().as_list()[-3], image.get_shape().as_list()[-2], num_channels]) alt_mat[..., np.arange(0, num_channels, 2)] *= -1 IMG_SCALER = 256.0 flow_img_flip = (flow_img_flip - IMG_SCALER/2) * alt_mat + IMG_SCALER/2 image = tf.cond( whether_flip, lambda: flow_img_flip, lambda: image) preproc_info['whether_flip'] = whether_flip # tf.image.random_flip_left_right(image) # return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) return _mean_image_subtraction(image, [_MEAN] * num_channels) def preprocess_for_eval(image, output_height, output_width, resize_side): """Preprocesses the given image for evaluation. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. resize_side: The smallest side of the image for aspect-preserving resizing. Returns: A preprocessed image. """ num_channels = image.get_shape().as_list()[-1] image = _aspect_preserving_resize(image, resize_side) image = _central_crop([image], output_height, output_width)[0] image.set_shape([output_height, output_width, num_channels]) image = tf.to_float(image) return _mean_image_subtraction(image, [_MEAN] * num_channels) # return _mean_image_subtraction(image, [_R_MEAN, _G_MEAN, _B_MEAN]) def preprocess_image(image, output_height, output_width, is_training=False, resize_side_min=_RESIZE_SIDE_MIN, resize_side_max=_RESIZE_SIDE_MAX, preproc_info={}, modality='rgb'): """Preprocesses the given image. Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. output_width: The width of the image after preprocessing. is_training: `True` if we're preprocessing the image for training and `False` otherwise. resize_side_min: The lower bound for the smallest side of the image for aspect-preserving resizing. 
If `is_training` is `False`, then this value is used for rescaling. resize_side_max: The upper bound for the smallest side of the image for aspect-preserving resizing. If `is_training` is `False`, this value is ignored. Otherwise, the resize side is sampled from [resize_size_min, resize_size_max]. preproc_info: will return all the information for the preprocessing, including sizes, positions, flip or not etc, which can then be replayed onto other images (specifically to be used for target heatmaps). It should contain: - whether_flip: Bool tensor: whether the image was flipped or not - image_shape: [3,1] tensor: size of the original image after resize - crop_info: [offset_ht, offset_wd, crop_ht, crop_wd] Returns: A preprocessed image. """ if is_training: return preprocess_for_train(image, output_height, output_width, resize_side_min, resize_side_max, preproc_info, modality) else: return preprocess_for_eval(image, output_height, output_width, resize_side_min) ================================================ FILE: src/config.py ================================================ """Config System """ import os import os.path as osp import numpy as np from easydict import EasyDict as edict __C = edict() # Consumers can get config by: # from fast_rcnn_config import cfg cfg = __C # # Input options # __C.INPUT = edict() # normal: normal image # rendered-pose: rendered pose on black bg # rendered-pose-on-image: rendered onto the image __C.INPUT.INPUT_IMAGE_FORMAT = 'normal' # pose renders can be 'rgb' or 'split-channel' __C.INPUT.INPUT_IMAGE_FORMAT_POSE_RENDER_TYPE = 'rgb' # input glimpse options __C.INPUT.POSE_GLIMPSE_CONTEXT_RATIO = 0.0 # ratio of glimpse area to pad around # set the following to true to resize the output to [IMAGE_SIZE, IMAGE_SIZE] # square __C.INPUT.POSE_GLIMPSE_RESIZE = False # list part sof the pose to keep in glimpse. 
Empty => all parts to keep __C.INPUT.POSE_GLIMPSE_PARTS_KEEP = [] __C.INPUT.SPLIT_ID = 1 # for dataset with multiple splits (hmdb) # FOR VIDEO __C.INPUT.VIDEO = edict() __C.INPUT.VIDEO.MODALITY = 'rgb' # rgb/flow5/flow10 etc # # Training options # __C.TRAIN = edict() # Minibatch size __C.TRAIN.BATCH_SIZE = 10 __C.TRAIN.WEIGHT_DECAY = 0.0005 # set to a positive value to clip the gradients at that l2 norm __C.TRAIN.CLIP_GRADIENTS = -1.0 # the following should have been in the INPUT, but are here for historical # reasons __C.TRAIN.IMAGE_SIZE = 450 # final cropped image size __C.TRAIN.RESIZE_SIDE = 480 # resize the input image to this size for preproc ## The RESIZE_SIDE is the size for the smallest side, so be careful, ## MPII has images with extreme ratios ## Note that if the difference RESIZE_SIDE to IMAGE_SIZE is too high, ## most of the image being fed into the network will be small parts of the ## image # This is the side of the heatmap before putting into queues # Ideally, resize it to the final target size so that there is no # need for a resize before computing loss. 
For inception-v2 with 450 input, the # output is 15x15 __C.TRAIN.FINAL_POSE_HMAP_SIDE = 15 __C.TRAIN.LABEL_SMOOTHING = False __C.TRAIN.MOVING_AVERAGE_VARIABLES = None __C.TRAIN.LEARNING_RATE = 0.01 __C.TRAIN.LEARNING_RATE_DECAY_RATE = 0.33 __C.TRAIN.END_LEARNING_RATE = 0.00001 __C.TRAIN.NUM_STEPS_PER_DECAY = 0 # if this is not 0, the NUM_EPOCHS_PER_DECAY # is ignored and this is used __C.TRAIN.NUM_EPOCHS_PER_DECAY = 40.0 __C.TRAIN.LEARNING_RATE_DECAY_TYPE = 'exponential' __C.TRAIN.OPTIMIZER = 'momentum' __C.TRAIN.MOMENTUM = 0.9 __C.TRAIN.ADAM_BETA1 = 0.9 __C.TRAIN.ADAM_BETA2 = 0.999 __C.TRAIN.OPT_EPSILON = 1.0 __C.TRAIN.TRAINABLE_SCOPES = '' __C.TRAIN.MAX_NUMBER_OF_STEPS = 100000 __C.TRAIN.LOG_EVERY_N_STEPS = 10 __C.TRAIN.SAVE_SUMMARIES_SECS = 300 __C.TRAIN.SAVE_INTERVAL_SECS = 1800 __C.TRAIN.IGNORE_MISSING_VARS = True __C.TRAIN.CHECKPOINT_PATH = 'data/pretrained_models/inception_v3.ckpt' # __C.TRAIN.CHECKPOINT_EXCLUDE_SCOPES = 'InceptionV3/Logits,InceptionV3/AuxLogits,PoseLogits' __C.TRAIN.CHECKPOINT_EXCLUDE_SCOPES = '' __C.TRAIN.DATASET_SPLIT_NAME = 'trainval_train' # loss fn can be from the list or empty '', i.e. 
no loss on that modality __C.TRAIN.LOSS_FN_POSE = 'l2' # can be 'l2'/'log-loss'/'sigmoid-log-loss'/'cosine-loss' __C.TRAIN.LOSS_FN_POSE_WT = 1.0 __C.TRAIN.LOSS_FN_POSE_SAMPLED = False # Harder loss, sample the negatives __C.TRAIN.LOSS_FN_ACTION = 'softmax-xentropy' # can be 'softmax-xentropy' __C.TRAIN.LOSS_FN_ACTION_WT = 1.0 __C.TRAIN.VAR_NAME_MAPPER = '' # to be used when loading from npy checkpoints # see options in restore/var_name_mapper.py __C.TRAIN.VIDEO_FRAMES_PER_VIDEO = 1 # If true, divide the video into segments and read # a random frame from that segment __C.TRAIN.READ_SEGMENT_STYLE = False __C.TRAIN.ITER_SIZE = 1 # accumulate gradients over this many iterations __C.TRAIN.OTHER_IMG_SUMMARIES_TO_ADD = ['PosePrelogitsBasedAttention'] # # Testing options # __C.TEST = edict() __C.TEST.BATCH_SIZE = 10 __C.TEST.DATASET_SPLIT_NAME = 'trainval_val' __C.TEST.MAX_NUM_BATCHES = None __C.TEST.CHECKPOINT_PATH = b'' __C.TEST.MOVING_AVERAGE_DECAY = None __C.TEST.VIDEO_FRAMES_PER_VIDEO = 1 # single image dataset. Set 25 for hmdb __C.TEST.EVAL_METRIC = '' # normal eval. Set ='mAP' to compute that. # # Network properties # __C.NET = edict() # The following replaces the action logits with one computed by weighting the # output using pose heatmaps __C.NET.USE_POSE_ATTENTION_LOGITS = False __C.NET.USE_POSE_ATTENTION_LOGITS_DIMS = [-1] # by default use all parts # set following true to have a heatmap as the avg of all heatmaps __C.NET.USE_POSE_ATTENTION_LOGITS_AVGED_HMAP = False # The following will replace the action logits with one computed over the last # pose logits __C.NET.USE_POSE_LOGITS_DIRECTLY = False # set true to also have the actual logits concatenated to the output __C.NET.USE_POSE_LOGITS_DIRECTLY_PLUS_LOGITS = False # Another version, after talking to Deva on March 20, 2017. Concat before avg # pool and remove the extra layer. 
# The following by default contain the image logits __C.NET.USE_POSE_LOGITS_DIRECTLY_v2 = False __C.NET.USE_POSE_LOGITS_DIRECTLY_v2_EXTRA_LAYER = False # The following will replace the action logits with a one computed using an # unconstrained attention predictor based on the pose output __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION = False # REMOVED THIS TO DEPRECATE # # setting the following to true basically just reproduces the original system # # (doesnot use any attention). I just used it to debug that this can reproduce # # the original numbers (nothing else got screwed up) # __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_DEBUG = False # set the following to more to have more layers predicting the unconstrained # attention map # DEPRECATING the following, commented out for now, will be removed later. # __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_NLAYERS = 1 # set True to enforce the attention map that is learnt to be passed through a # spatial softmax __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SOFTMAX_ATT = False # Pass the attention through a relu __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RELU_ATT = False # 21 April 2017: This is not DEPRECATED because it didn't help, so it won't # work with code now. This was to simplify code for TopDownAttention endpoint # # Create an attention map for each class # adding it again on July 26, 2017 for NIPS17 rebuttal __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_PER_CLASS = False # Train attention directly over image features __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT = False # Add the predicted pose to the logits features __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT = False # 2-layers over the pose logits __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT_2LAYER = False # Allow for Rank > 1 approximation. 
Other options might not work with this __C.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK = 1 # Do attention on temporal pooling as well __C.NET.USE_TEMPORAL_ATT = False # Bilinear pooling baselines __C.NET.USE_COMPACT_BILINEAR_POOLING = False # Set which endpoint serves as the output for pose __C.NET.LAST_CONV_MAP_FOR_POSE = edict() __C.NET.LAST_CONV_MAP_FOR_POSE.inception_v2_tsn = 'InceptionV2_TSN/inception_5a' __C.NET.LAST_CONV_MAP_FOR_POSE.inception_v3 = 'Mixed_7c' __C.NET.LAST_CONV_MAP_FOR_POSE.resnet_v1_101 = 'resnet_v1_101/block4' __C.NET.LAST_CONV_MAP_FOR_POSE.vgg_16 = 'vgg_16/conv5' # Train the top BN. Useful when training flow/multi-channel inputs other than # RGB. In case of ResNet, this means "train only top_bn", and keep others # fixed. __C.NET.TRAIN_TOP_BN = False # Dropout # -1 (<0) => Use the network default. Else, use this value __C.NET.DROPOUT = -1.0 # # MISC # # For reproducibility __C.RNG_SEED = 42 # A small number that's used many times __C.EPS = 1e-14 # Root directory of project __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) # Data directory __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) # Model directory __C.EXP_DIR = 'expt_outputs/' __C.DATASET_NAME = 'mpii' __C.DATASET_DIR = 'data/mpii/mpii_tfrecords' # Set the following if using the train_test files from non-std location __C.DATASET_LIST_DIR = '' __C.MODEL_NAME = 'inception_v3' __C.NUM_READERS = 4 __C.NUM_PREPROCESSING_THREADS = 4 __C.GPUS = '2' __C.HEATMAP_MARKER_WD_RATIO = 0.1 __C.MAX_INPUT_IMAGE_SIZE = 512 # to avoid arbitrarily huge input images # ['one-label'/'multi-label%d'] __C.INPUT_FILE_STYLE_LABEL = '' def get_output_dir(config_file_name): """Return the directory where experimental artifacts are placed. If the directory does not exist, it is created. A canonical path is built using the name from an imdb and a network (if not None). 
""" outdir = osp.abspath(osp.join(__C.EXP_DIR, osp.basename(config_file_name))) if not os.path.exists(outdir): os.makedirs(outdir) return outdir def _merge_a_into_b(a, b): """Merge config dictionary a into config dictionary b, clobbering the options in b whenever they are also specified in a. """ if type(a) is not edict: return for k, v in a.iteritems(): # a must specify keys that are in b if not b.has_key(k): raise KeyError('{} is not a valid config key'.format(k)) # the types must match, too old_type = type(b[k]) if old_type is not type(v): if isinstance(b[k], np.ndarray): v = np.array(v, dtype=b[k].dtype) else: raise ValueError(('Type mismatch ({} vs. {}) ' 'for config key: {}').format(type(b[k]), type(v), k)) # recursively merge dicts if type(v) is edict: try: _merge_a_into_b(a[k], b[k]) except: print('Error under config key: {}'.format(k)) raise else: b[k] = v def cfg_from_file(filename): """Load a config file and merge it into the default options.""" import yaml with open(filename, 'r') as f: yaml_cfg = edict(yaml.load(f)) _merge_a_into_b(yaml_cfg, __C) def cfg_from_list(cfg_list): """Set config keys via list (e.g., from command line).""" from ast import literal_eval assert len(cfg_list) % 2 == 0 for k, v in zip(cfg_list[0::2], cfg_list[1::2]): key_list = k.split('.') d = __C for subkey in key_list[:-1]: assert d.has_key(subkey) d = d[subkey] subkey = key_list[-1] assert d.has_key(subkey) try: value = literal_eval(v) except: # handle the case when v is a string literal value = v assert type(value) == type(d[subkey]), \ 'type {} does not match original type {}'.format( type(value), type(d[subkey])) d[subkey] = value ================================================ FILE: src/custom_ops/Makefile ================================================ BOOST_DIR := /home/rgirdhar/Software/basic/boost/install2/ BOOST_LIB_DIR := $(BOOST_DIR)/lib BOOST_INC_DIR := $(BOOST_DIR)/include TF_INC := $(shell python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())') 
# Embed the Boost library directory as an rpath so the built ops look for
# Boost there instead of on the system path. The comment is kept off the
# assignment line so no trailing whitespace ends up in the value.
LDFLAGS := -Wl,-rpath,$(BOOST_LIB_DIR)

# `all` does not name a file; declaring it phony keeps a stray file called
# `all` from turning the build into a no-op.
.PHONY: all
all: pose_to_heatmap.so zero_out_channels.so render_pose.so render_objects.so

pose_to_heatmap.so: pose_to_heatmap.cc
	g++ -std=c++11 $(LDFLAGS) -shared -I$(BOOST_INC_DIR) `pkg-config --cflags --libs opencv` pose_to_heatmap.cc -o pose_to_heatmap.so -fPIC -I $(TF_INC) -O2 -L$(BOOST_LIB_DIR) -lboost_system -lboost_filesystem -lboost_thread

zero_out_channels.so: zero_out_channels.cc
	g++ -std=c++11 $(LDFLAGS) -shared zero_out_channels.cc -o zero_out_channels.so -fPIC -I $(TF_INC) -O2

# render_pose.cc includes pose_utils.hpp, hence the extra prerequisite.
render_pose.so: render_pose.cc pose_utils.hpp
	g++ -std=c++11 $(LDFLAGS) -shared -I$(BOOST_INC_DIR) `pkg-config --cflags --libs opencv` render_pose.cc -o render_pose.so -fPIC -I $(TF_INC) -O2 -L$(BOOST_LIB_DIR) -lboost_system -lboost_filesystem -lboost_thread

render_objects.so: render_objects.cc
	g++ -std=c++11 $(LDFLAGS) -shared -I$(BOOST_INC_DIR) `pkg-config --cflags --libs opencv` render_objects.cc -o render_objects.so -fPIC -I $(TF_INC) -O2 -L$(BOOST_LIB_DIR) -lboost_system -lboost_filesystem -lboost_thread
# from render_pose.cc
# Maps an MPII joint index (key) to the joint index it is copied from in the
# JSON pose (value); this is the inverse of `coco_to_mpii` in render_pose.cc.
# MPII joints 6 and 7 have no entry and therefore stay at -1.
mpii_to_coco = OrderedDict([
  (9, 0),
  (8, 1),
  (12, 2),
  (11, 3),
  (10, 4),
  (13, 5),
  (14, 6),
  (15, 7),
  (2, 8),
  (1, 9),
  (0, 10),
  (3, 11),
  (4, 12),
  (5, 13),
])


def read_json_pose_fn(fpath):
  """Read a JSON pose file and return the poses in flattened MPII layout.

  The file must hold a dict with a 'bodies' list; each body has a flat
  'joints' list of (x, y, score) triplets. Every value <= 0 is replaced by -1
  before the joints are re-ordered through `mpii_to_coco`.

  Args:
    fpath: Path of the JSON file.

  Returns:
    A 1-D int64 array with 16*3 values per body, bodies concatenated. If the
    file cannot be opened or parsed, a single all -1 pose (48 values) is
    returned instead of raising.
  """
  try:
    with open(fpath, 'r') as fin:
      data = json.load(fin)
  except Exception:
    # Deliberately best-effort: an unreadable pose file yields a dummy pose.
    print('Unable to open file {}'.format(fpath))
    return -np.ones((16*3,)).astype('int64')
  # Materialize the dict views: on Python 3, np.array(d.keys()) produces a
  # 0-d object array that cannot be used for indexing. Built once, not per
  # body.
  mpii_idx = np.array(list(mpii_to_coco.keys()))
  coco_idx = np.array(list(mpii_to_coco.values()))
  res = []
  for body in data['bodies']:
    mpii_joints = -np.ones((16, 3))
    joints = np.reshape(np.array(body['joints']), (-1, 3))
    joints[joints <= 0] = -1
    mpii_joints[mpii_idx, :] = joints[coco_idx, :]
    res += mpii_joints.reshape((-1,)).tolist()
  return np.array(res).astype('int64')
def read_sparse_label_fn(sparse_label, nclasses):
  """Expand a sparse 'index:value,index:value' label into a dense vector.

  Args:
    sparse_label: Comma-separated 'class_index:value' pairs, as str or bytes
      (tf.py_func hands string tensors over as bytes on Python 3). Empty
      items, including an entirely empty label, are ignored.
    nclasses: Length of the dense output vector.

  Returns:
    A 1-D int32 array of length `nclasses`, zero wherever no pair was given.
    Negative values are clamped to 0.
  """
  res = np.zeros((nclasses,), dtype='int32')
  if isinstance(sparse_label, bytes):
    sparse_label = sparse_label.decode('utf-8')
  # Split each item once instead of re-parsing the string for indices and
  # values separately.
  pairs = [el.split(':') for el in sparse_label.split(',') if el.strip()]
  if not pairs:
    return res
  indices = np.array([int(pair[0]) for pair in pairs])
  values = np.array([int(pair[1]) for pair in pairs])
  res[indices] = values
  res[res < 0] = 0  # get rid of -1 label for now
  return res
The pose label should be 16 keypoints, with X,Y,is_visible int num_keypoints = out_channels_; assert(pose_label.size() % (3 * num_keypoints) == 0); int n_rects = pose_label.size() / (3 * num_keypoints); // Create output tensors TensorShape out_shape {out_ht, out_wd, out_channels_}; Tensor* output_tensor = NULL; OP_REQUIRES_OK( context, context->allocate_output( 0, out_shape, &output_tensor)); auto output = output_tensor->tensor(); TensorShape out_shape_valid {out_channels_}; Tensor* output_tensor_valid = NULL; OP_REQUIRES_OK( context, context->allocate_output( 1, out_shape_valid, &output_tensor_valid)); auto output_valid = output_tensor_valid->tensor(); int elts_per_pose = num_keypoints * 3; for (int i = 0; i < num_keypoints; i++) { cv::Mat channel(out_ht, out_wd, CV_32FC1, 0.0); output_valid(i) = false; for (int rid = 0; rid < n_rects; rid++) { // for each rectangle int x = pose_label(rid * elts_per_pose + i * 3) * out_wd / im_wd; int y = pose_label(rid * elts_per_pose + i * 3 + 1) * out_ht / im_ht; int is_visible = pose_label(rid * elts_per_pose + i * 3 + 2); // ignore this if (pose_label(rid * elts_per_pose + i * 3) >= 0 && pose_label(rid * elts_per_pose + i * 3 + 1) >= 0) { output_valid(i) = true; circle(channel, cv::Point(x, y), (int) out_wd * marker_wd_ratio_, cv::Scalar(1.0, 1.0, 1.0), -1); if (do_gauss_blur_) GaussianBlur(channel, channel, cv::Size(7, 7), 0); } } for (int r = 0; r < channel.rows; r++) { for (int c = 0; c < channel.cols; c++) { output(r, c, i) = channel.at(r, c); } } } } private: int out_channels_; float marker_wd_ratio_; bool do_gauss_blur_; }; REGISTER_KERNEL_BUILDER(Name("PoseToHeatmap").Device(DEVICE_CPU), PoseToHeatmapOp); ================================================ FILE: src/custom_ops/pose_utils.hpp ================================================ #include #include #include #include // Very important to add the following #define // boost json parser depends on boost::spirit // which is not thread safe by default. 
// Draws the limbs of every pose onto a float image of size out_ht x out_wd.
//
// poses:     one vector of (x, y, confidence) joints per body, in the
//            coordinate frame of an image of size max_ht x max_wd.
// marker_wd: line thickness passed to cv::line.
// out_type:  RENDER_POSE_OUT_TYPE_RGB draws all limbs into one 3-channel
//            image using joint_color; RENDER_POSE_OUT_TYPE_SPLITCHANNEL
//            draws limb i into channel i with value 1. Any other value is
//            reported on stderr and rendered as RGB.
//
// A limb is skipped when either endpoint has confidence below 0.1, or when
// its joint index does not exist in the pose.
// NOTE(review): the template arguments were missing from the extracted
// source; tuple<float, float, float> is inferred from read_pose_xml and the
// float confidence comparison - confirm against the real header.
Mat render_pose(vector<vector<tuple<float, float, float>>> poses,
                int out_ht, int out_wd,
                int max_ht, int max_wd,
                int marker_wd,
                int out_type=RENDER_POSE_OUT_TYPE_RGB) {
  const int nLimbs = limbSeq.size() / 2;
  int nchannels = 3;
  if (out_type == RENDER_POSE_OUT_TYPE_SPLITCHANNEL) {
    nchannels = nLimbs;
  } else if (out_type != RENDER_POSE_OUT_TYPE_RGB) {
    cerr << "render_pose: Unknown output type." << endl;
  }
  Mat output(out_ht, out_wd, CV_32FC(nchannels), 0.0);
  vector<Mat> output_channels;
  if (nchannels != 3) {
    split(output, output_channels);
  }
  // The scale factors do not depend on the body or the limb.
  const float scal_ht = out_ht * 1.0 / max_ht;
  const float scal_wd = out_wd * 1.0 / max_wd;
  // assert(limbSeq.size() / 2 == joint_color.size() / 3);
  for (size_t body_id = 0; body_id < poses.size(); body_id++) {
    const auto &body = poses[body_id];
    const int njoints = body.size();
    for (int i = 0; i < nLimbs; i++) {
      // limbSeq stores 1-based joint ids.
      const int j1 = limbSeq[2*i] - 1;
      const int j2 = limbSeq[2*i+1] - 1;
      if (j1 < 0 || j2 < 0 || j1 >= njoints || j2 >= njoints) {
        continue;  // pose has fewer joints than this limb needs
      }
      const auto &pt1 = body[j1];
      const auto &pt2 = body[j2];
      float pt1_conf = get<2>(pt1);
      float pt2_conf = get<2>(pt2);
      if (pt1_conf < 0.1 || pt2_conf < 0.1) {
        continue;
      }
      Mat render_img;
      Scalar color;
      if (nchannels == 3) {
        render_img = output;
        color = CV_RGB(joint_color[i*3],
                       joint_color[i*3+1],
                       joint_color[i*3+2]);
      } else {
        // Mat assignment shares the pixel data, so this draws straight into
        // the channel that is merged back below.
        render_img = output_channels[i];
        color = Scalar(1);
      }
      line(
          render_img,
          Point(get<0>(pt1) * scal_wd, get<1>(pt1) * scal_ht),
          Point(get<0>(pt2) * scal_wd, get<1>(pt2) * scal_ht),
          color, marker_wd);
    }
  }
  if (nchannels != 3) {
    merge(output_channels, output);
  }
  return output;
}
<< endl; } return poses; } ================================================ FILE: src/custom_ops/render_objects.cc ================================================ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/op_kernel.h" #include #include #include using namespace tensorflow; using namespace std; REGISTER_OP("RenderObjects") .Attr("out_channels: int = 80") .Input("objects_label: string") .Input("im_ht: int64") .Input("im_wd: int64") .Input("out_wd: int64") // out_height decided using this and aspect ratio of image .Output("image: float"); void read_detections( string objects_label, vector> &detections) { istringstream ss(objects_label); int ob_label, id; // ignore the id float conf, xmin, ymin, xmax, ymax; detections.clear(); while (ss >> id >> ob_label >> conf >> xmin >> ymin >> xmax >> ymax) { detections.push_back(make_tuple(ob_label, conf, xmin, ymin, xmax, ymax)); } } class RenderObjectsOp : public OpKernel { public: explicit RenderObjectsOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK( context, context->GetAttr("out_channels", &out_channels_)); } void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& objects_label_tensor = context->input(0); auto objects_label = objects_label_tensor.flat()(0); const Tensor& im_ht_tensor = context->input(1); auto im_ht = im_ht_tensor.flat()(0); const Tensor& im_wd_tensor = context->input(2); auto im_wd = im_wd_tensor.flat()(0); const Tensor& out_wd_tensor = context->input(3); auto out_wd = out_wd_tensor.flat()(0); int out_ht = ((im_ht * out_wd * 1.0) / im_wd); // Create output tensors TensorShape out_shape {out_ht, out_wd, out_channels_}; Tensor* output_tensor = NULL; OP_REQUIRES_OK( context, context->allocate_output( 0, out_shape, &output_tensor)); auto output = output_tensor->tensor(); vector> detections; read_detections(objects_label, detections); for (int i = 0; i < out_wd; i++) { 
for (int j = 0; j < out_ht; j++) { for (int k = 0; k < out_channels_; k++) { output(j, i, k) = 0; } } } if (out_channels_ != 3) { // i.e. not doing a RGB output for (unsigned int i = 0; i < detections.size(); i++) { int xmin = get<2>(detections[i]) * out_wd; int ymin = get<3>(detections[i]) * out_ht; int xmax = get<4>(detections[i]) * out_wd; int ymax = get<5>(detections[i]) * out_ht; int ob_label = get<0>(detections[i]); float conf = get<1>(detections[i]); for (int c = max(0, (int) xmin); c < min(xmax, (int) out_wd); c++) { for (int r = max(0, (int) ymin); r < min(ymax, (int) out_ht); r++) { output(r, c, ob_label) = conf; } } } } else { cerr << "render_objects: unable to render RGB currently." << endl; } } private: int out_channels_; }; REGISTER_KERNEL_BUILDER(Name("RenderObjects").Device(DEVICE_CPU), RenderObjectsOp); ================================================ FILE: src/custom_ops/render_pose.cc ================================================ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/op_kernel.h" #include #include #include #include "pose_utils.hpp" using namespace tensorflow; using namespace std; namespace pt = boost::property_tree; REGISTER_OP("RenderPose") .Attr("marker_wd_ratio: float = 0.01") // ratio of output image width .Attr("out_type: int = 1") // RENDER_POSE_OUT_TYPE_RGB or RENDER_POSE_OUT_TYPE_SPLITCHANNEL .Input("pose_label: int64") .Input("im_ht: int64") .Input("im_wd: int64") .Input("out_wd: int64") .Output("image: float"); class RenderPoseOp : public OpKernel { public: explicit RenderPoseOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK( context, context->GetAttr("marker_wd_ratio", &marker_wd_ratio_)); OP_REQUIRES_OK( context, context->GetAttr("out_type", &out_type_)); } void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& pose_label_tensor = context->input(0); auto pose_label = 
pose_label_tensor.flat(); const Tensor& im_ht_tensor = context->input(1); auto im_ht = im_ht_tensor.flat()(0); const Tensor& im_wd_tensor = context->input(2); auto im_wd = im_wd_tensor.flat()(0); const Tensor& out_wd_tensor = context->input(3); auto out_wd = out_wd_tensor.flat()(0); int out_ht = ((im_ht * out_wd * 1.0) / im_wd); int num_keypoints = 16; // MPII poses assert(pose_label.size() % (3 * num_keypoints) == 0); int n_people = pose_label.size() / (3 * num_keypoints); vector>> poses; int elts_per_pose = 3 * num_keypoints; for (int i = 0; i < n_people; i++) { vector> person; for (int j = 0; j < num_keypoints; j++) { int x = pose_label(elts_per_pose * i + 3 * j); int y = pose_label(elts_per_pose * i + 3 * j + 1); int is_visible = pose_label(elts_per_pose * i + 3 * j + 2); // TODO (rgirdhar): Maybe this needs be fixed if (x == -1 && y == -1) { is_visible = 0; } else { is_visible = 1; } person.push_back(make_tuple(x, y, is_visible)); } poses.push_back(convert_pose_mpii_to_coco(person)); } cv::Mat render = render_pose( poses, out_ht, out_wd, im_ht, im_wd, out_wd * marker_wd_ratio_, out_type_); // Create an output tensor TensorShape out_shape {out_ht, out_wd, render.channels()}; Tensor* output_tensor = NULL; OP_REQUIRES_OK( context, context->allocate_output( 0, out_shape, &output_tensor)); auto output = output_tensor->tensor(); for (int i = 0; i < render.rows; i++) { for (int j = 0; j < render.cols; j++) { float *pixel = render.ptr(i, j); for (int k = 0; k < render.channels(); k++) { output(i, j, k) = pixel[render.channels()-k-1]; } } } } private: vector> convert_pose_mpii_to_coco( vector> poses) { // Using the coco definition from https://github.com/CMU-Perceptual-Computing-Lab/caffe_rtpose // Using the MPII definition from http://human-pose.mpi-inf.mpg.de/#download vector> res; auto dummy = make_tuple(0, 0, 0); // for the parts I don't have in MPII map coco_to_mpii = { {0, 9}, // Nose, head_top (approx) {1, 8}, {2, 12}, {3, 11}, {4, 10}, {5, 13}, {6, 14}, {7, 
# Manual smoke test for the pose_to_heatmap op: renders two poses, saves the
# channel-averaged heatmap to temp.jpg and drops into pdb for inspection.
with tf.Session(''):
  # Each pose is 16 keypoints of (x, y, is_visible); (-1, -1) marks a
  # keypoint without a label.
  pose = [50, 50, 1] * 3 +\
         [0, 0, 1] * 2 +\
         [-1, -1, 1] * 11
  pose += [90, 90, 1] * 3 +\
          [0, 0, 1] * 2 +\
          [-1, -1, 1] * 11
  # Positional inputs are pose_label, im_ht, im_wd, out_wd.
  T, T_valid = pose_to_heatmap(
    pose, 100, 200, 100,
    out_channels=16
  )
  A = T.eval()
  A_valid = T_valid.eval()
  plt.imsave('temp.jpg', np.mean(A, axis=-1))
  # Function-call form prints the same on Python 2 and parses on Python 3.
  print(A_valid)
  import pdb
  pdb.set_trace()
  a = 1
= np.ones((1, 3, 3, 5)) channels = [True, False, True, True, True] B = zero_out_channels(A, channels) print B C = B.eval() assert(np.all(C[:, :, :, 0] == 1)) assert(np.all(C[:, :, :, 1] == 0)) assert(np.all(C[:, :, :, 2] == 1)) assert(np.all(C[:, :, :, 3] == 1)) import pdb pdb.set_trace() a = 1 ================================================ FILE: src/custom_ops/zero_out_channels.cc ================================================ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/op_kernel.h" using namespace tensorflow; REGISTER_OP("ZeroOutChannels") .Attr("T: {float32, float64, int32, int64}") .Input("to_zero: T") // must be 4-dim images .Input("channels: bool") // list of true/false, false=>zero out that channel .Output("zeroed: T") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); return Status::OK(); }); template class ZeroOutChannelsOp : public OpKernel { public: explicit ZeroOutChannelsOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& input_tensor = context->input(0); const Tensor& input_tensor_channel = context->input(1); auto input = input_tensor.tensor(); auto input_channel = input_tensor_channel.flat(); assert(input_tensor.shape().dims() == 4); int num_channels = input_tensor.shape().dim_size(3); assert(num_channels == input_tensor_channel.shape().dim_size(0)); Tensor *output = NULL; OP_REQUIRES_OK( context, context->allocate_output(0, input_tensor.shape(), &output)); auto output_flat = output->tensor(); for (int i = 0; i < input_tensor.shape().dim_size(0); i++) { for (int j = 0; j < input_tensor.shape().dim_size(1); j++) { for (int k = 0; k < input_tensor.shape().dim_size(2); k++) { for (int l = 0; l < input_tensor.shape().dim_size(3); l++) { if (input_channel(l) == false) { output_flat(i, j, k, l) = 0; } else { output_flat(i, 
j, k, l) = input(i, j, k, l); } } } } } } }; REGISTER_KERNEL_BUILDER( Name("ZeroOutChannels") .Device(DEVICE_CPU) .TypeConstraint("T"), ZeroOutChannelsOp); REGISTER_KERNEL_BUILDER( Name("ZeroOutChannels") .Device(DEVICE_CPU) .TypeConstraint("T"), ZeroOutChannelsOp); REGISTER_KERNEL_BUILDER( Name("ZeroOutChannels") .Device(DEVICE_CPU) .TypeConstraint("T"), ZeroOutChannelsOp); REGISTER_KERNEL_BUILDER( Name("ZeroOutChannels") .Device(DEVICE_CPU) .TypeConstraint("T"), ZeroOutChannelsOp); ================================================ FILE: src/datasets/__init__.py ================================================ ================================================ FILE: src/datasets/charades.py ================================================ """Provides data for the HMDB51 dataset. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from datasets.video_data_utils import gen_dataset import tensorflow as tf _CHARADES_TRINITY_LIST_DIR = '/data/rgirdhar/Data2/Projects/2016/002_VideoRepresentation/StandardData/001_Charades/v1/Lists/train_test_lists/' _CHARADES_TRINITY_POSE_LABEL_DIR = '/scratch/rgirdhar/Datasets/Video/004_Charades/Processed/002_Pose_CPM_v2/' def get_split(split_name, dataset_dir, file_pattern=None, reader=None, **kwargs): _NUM_CLASSES = 157 # There are no pose labels, but need to keep this to load models from MPII # trained # Also, now the processing can still avoided by having no loss on pose _NUM_POSE_KEYPOINTS = 16 # Need to do this otherwise the lambda function defined below will not work # It evaluates the kwargs['..'] also when evaluated if 'dataset_list_dir' not in kwargs: dataset_list_dir = _CHARADES_TRINITY_LIST_DIR else: dataset_list_dir = kwargs['dataset_list_dir'] _LIST_FN = lambda split, id: \ '%s/%s_split%d.txt' % ( dataset_list_dir, split, id) kwargs['num_pose_keypoints'] = _NUM_POSE_KEYPOINTS kwargs['num_classes'] = _NUM_CLASSES kwargs['list_fn'] = _LIST_FN with 
def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None,
                **kwargs):  # added by rgirdhar: allow other options
  """Look up a dataset module by name and return its requested split.

  Args:
    name: String, the name of the dataset; must be a key of `datasets_map`.
    split_name: A train/test split name.
    dataset_dir: The directory where the dataset files are stored.
    file_pattern: The file pattern to use for matching the dataset source
      files.
    reader: The subclass of tf.ReaderBase. If left as `None`, then the default
      reader defined by each dataset is used.
    **kwargs: Extra options forwarded unchanged to the module's `get_split`.

  Returns:
    Whatever the selected module's `get_split` returns. The charades, hico
    and hmdb51 modules return a `(dataset, num_pose_keypoints)` tuple;
    presumably mpii and jhmdb21 do the same - verify against those modules.

  Raises:
    ValueError: If the dataset `name` is unknown.
  """
  dataset_module = datasets_map.get(name)
  if dataset_module is None:
    raise ValueError('Name of dataset unknown %s' % name)
  return dataset_module.get_split(
    split_name, dataset_dir, file_pattern, reader, **kwargs)
""" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values])) def image_to_tfexample(image_data, image_format, height, width, class_id): return tf.train.Example(features=tf.train.Features(feature={ 'image/encoded': bytes_feature(image_data), 'image/format': bytes_feature(image_format), 'image/class/label': int64_feature(class_id), 'image/height': int64_feature(height), 'image/width': int64_feature(width), })) def download_and_uncompress_tarball(tarball_url, dataset_dir): """Downloads the `tarball_url` and uncompresses it locally. Args: tarball_url: The URL of a tarball file. dataset_dir: The directory where the temporary files are stored. """ filename = tarball_url.split('/')[-1] filepath = os.path.join(dataset_dir, filename) def _progress(count, block_size, total_size): sys.stdout.write('\r>> Downloading %s %.1f%%' % ( filename, float(count * block_size) / float(total_size) * 100.0)) sys.stdout.flush() filepath, _ = urllib.request.urlretrieve(tarball_url, filepath, _progress) print() statinfo = os.stat(filepath) print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') tarfile.open(filepath, 'r:gz').extractall(dataset_dir) def write_label_file(labels_to_class_names, dataset_dir, filename=LABELS_FILENAME): """Writes a file with the list of class names. Args: labels_to_class_names: A map of (integer) labels to class names. dataset_dir: The directory in which the labels file should be written. filename: The filename where the class names are written. """ labels_filename = os.path.join(dataset_dir, filename) with tf.gfile.Open(labels_filename, 'w') as f: for label in labels_to_class_names: class_name = labels_to_class_names[label] f.write('%d:%s\n' % (label, class_name)) def has_labels(dataset_dir, filename=LABELS_FILENAME): """Specifies whether or not the dataset directory contains a label map file. Args: dataset_dir: The directory in which the labels file is found. filename: The filename where the class names are written. 
def read_label_file(dataset_dir, filename=LABELS_FILENAME):
  """Reads the labels file and returns a mapping from ID to class name.

  Each non-empty line must look like `<integer label>:<class name>`; only the
  first ':' separates the two, so class names may contain ':' themselves.

  Args:
    dataset_dir: The directory in which the labels file is found.
    filename: The filename where the class names are written.

  Returns:
    A map from a label (integer) to class name.

  Raises:
    ValueError: If a non-empty line has no ':' or a non-integer label.
  """
  labels_filename = os.path.join(dataset_dir, filename)
  with tf.gfile.Open(labels_filename, 'r') as f:
    contents = f.read()
  # Decode only when raw bytes come back; calling decode() on text that is
  # already a `str` raises AttributeError on Python 3.
  if isinstance(contents, bytes):
    contents = contents.decode()

  labels_to_class_names = {}
  for line in contents.split('\n'):
    if not line:
      continue
    index = line.index(':')
    labels_to_class_names[int(line[:index])] = line[index+1:]
  return labels_to_class_names
def get_split(split_name, dataset_dir, file_pattern=None, reader=None, **kwargs):
  """Builds the HMDB51 dataset through `gen_dataset`.

  Args:
    split_name: split to load; forwarded to `gen_dataset`.
    dataset_dir: frame directory; forwarded to `gen_dataset`.
    file_pattern: forwarded to `gen_dataset`.
    reader: forwarded to `gen_dataset`.
    **kwargs: extra `gen_dataset` keyword arguments. The optional key
      `dataset_list_dir` overrides the default directory of the
      `<split>_split<id>.txt` list files and is consumed here.

  Returns:
    A tuple `(dataset, number of pose keypoints)`.
  """
  _NUM_CLASSES = 51
  # There are no pose labels, but this needs to be kept to load models trained
  # on MPII. The pose processing can still be avoided by having no pose loss.
  _NUM_POSE_KEYPOINTS = 16
  # Pop (rather than read) the list directory: gen_dataset has no
  # `dataset_list_dir` parameter, so leaving it in kwargs makes the call below
  # fail with a TypeError whenever a caller supplies it.
  dataset_list_dir = kwargs.pop('dataset_list_dir', _HMDB51_TRINITY_LIST_DIR)
  _LIST_FN = lambda split, split_id: \
      '%s/%s_split%d.txt' % (dataset_list_dir, split, split_id)
  kwargs['num_pose_keypoints'] = _NUM_POSE_KEYPOINTS
  kwargs['num_classes'] = _NUM_CLASSES
  kwargs['list_fn'] = _LIST_FN
  return gen_dataset(split_name, dataset_dir, file_pattern,
                     reader,
                     pose_dataset_dir=_HMDB51_TRINITY_POSE_LABEL_DIR,
                     objects_dataset_dir=_HMDB51_TRINITY_OBJECTS_LABEL_DIR,
                     **kwargs), _NUM_POSE_KEYPOINTS
def _get_frame_sublist(start_frame, duration, num_samples, num_consec_frames,
                       randomFromSegmentStyle=None):
  """Picks `num_samples` frame offsets following the segmental architecture.

  The step between segments is (duration - num_consec_frames) / num_samples,
  cast to int32 and floored at 1. With `randomFromSegmentStyle` each offset is
  drawn uniformly inside its segment; otherwise the segment start is used,
  capped at `duration - 1`. When the flag is None it defaults to random
  sampling only if a single sample is requested.

  Returns:
    A Python list of `num_samples` scalar tensors.
  """
  step = tf.cast((duration - tf.constant(num_consec_frames)) /
                 (tf.constant(num_samples)), 'int32')
  step = tf.maximum(step, 1)

  if randomFromSegmentStyle is None:
    # A single uniformly placed frame would not make sense, so go random then.
    randomFromSegmentStyle = (num_samples == 1)

  # The following will be logged as many times as the number of read threads.
  if randomFromSegmentStyle:
    tf.logging.info('Reading in random segment style')
  else:
    tf.logging.info('IMP NOTE:: Reading uniform frames')

  offsets = []
  for seg in range(num_samples):
    if randomFromSegmentStyle:
      low = tf.minimum(start_frame + step * seg,
                       duration-num_consec_frames-1)
      high = tf.minimum(start_frame + step * (seg+1),
                        duration-num_consec_frames)
      offset = tf.random_uniform([1], low, high, dtype='int32')[0]
    else:
      offset = tf.minimum(start_frame + step * seg, duration - 1)
    offsets.append(offset)

  for offset in offsets:
    offset.set_shape(())
  return offsets
def _read_from_disk_spatial(fpath, nframes, num_samples=25, start_frame=0,
                            file_prefix='', file_zero_padding=4, file_index=1,
                            dataset_dir='', frame_sublist=None,
                            randomFromSegmentStyle=None):
  """Reads the encoded RGB frames selected for one video.

  File names are built as
  `<dataset_dir>/<fpath>/[<file_prefix>_]<zero-padded frame id>.jpg`, where the
  frame id is the sampled offset plus `file_index`.

  Returns:
    A list with one file-contents string tensor per sampled frame.
  """
  if frame_sublist is None:
    frame_sublist = _get_frame_sublist(start_frame, nframes, num_samples, 1,
                                       randomFromSegmentStyle)
  # Pure Python strings, independent of the loop below.
  prefix = file_prefix + '_' if file_prefix else ''
  root = dataset_dir + '/'

  encoded_frames = []
  with tf.variable_scope('read_rgb_video'):
    for sample_idx in range(num_samples):
      with tf.variable_scope('read_rgb_image'):
        path_parts = [
            tf.constant(root),
            fpath,
            tf.constant('/'),
            prefix,
            tf.as_string(frame_sublist[sample_idx] + file_index,
                         width=file_zero_padding, fill='0'),
            tf.constant('.jpg')]
        encoded_frames.append(tf.read_file(tf.string_join(path_parts)))
  return encoded_frames
def _read_from_disk_pose(
    fpath, nframes, num_samples=25, pose_frames=5,
    start_frame=0, file_prefix='', file_zero_padding=4, file_index=1,
    dataset_dir='', frame_sublist=None, randomFromSegmentStyle=None,
    file_ext='.jpg'):
  """Reads `pose_frames` consecutive pose files for every sampled offset.

  File names are built as
  `<dataset_dir>/<fpath>/[<file_prefix>_]<zero-padded frame id><file_ext>`.
  Frame ids beyond `nframes-1` are clamped to `nframes-1` before `file_index`
  is added, so short videos do not overshoot the file list.

  Returns:
    A list (one entry per sample) of lists (one entry per consecutive frame)
    of tensors produced by `read_file_safe`.
  """
  from custom_ops.custom_ops_factory import read_file_safe
  if frame_sublist is None:
    frame_sublist = _get_frame_sublist(start_frame, nframes, num_samples,
                                       pose_frames, randomFromSegmentStyle)
  prefix = file_prefix + '_' if file_prefix else ''
  root = dataset_dir + '/'

  per_sample = []
  with tf.variable_scope('read_pose_video'):
    for sample_idx in range(num_samples):
      with tf.variable_scope('read_pose_image'):
        stack = []
        for delta in range(pose_frames):
          raw_id = frame_sublist[sample_idx] + delta
          # To protect for small videos, avoid overshooting the filelist.
          frame_id = tf.cond(
              tf.greater(raw_id, nframes-1),
              lambda: nframes-1,
              lambda: raw_id)
          impath = tf.string_join([
              tf.constant(root),
              fpath,
              tf.constant('/'),
              prefix,
              tf.as_string(frame_id + file_index, width=file_zero_padding,
                           fill='0'),
              tf.constant(file_ext)])
          stack.append(read_file_safe(impath))
        per_sample.append(stack)
  return per_sample
modality.startswith('flow'): img = decode_flow(img_str) elif modality.startswith('rgb+flow'): with tf.name_scope('decode_rgbNflow'): img_rgb = decode_rgb(img_str[..., 0]) img_flow = decode_flow(img_str[..., 1:]) img = [img_rgb[0], img_flow[0]] elif modality.startswith('posejson'): img = decode_poseJson(img_str) elif modality.startswith('pose'): img = decode_flow(img_str, perImageChannels=3) im_ht = tf.reduce_max([tf.shape(el)[-3] for el in img]) im_wd = tf.reduce_max([tf.shape(el)[-2] for el in img]) img = [tf.image.resize_images(el, [IM_HT, IM_WD]) for el in img] return img, im_ht, im_wd ================================================ FILE: src/datasets/jhmdb21.py ================================================ """Provides data for the JHMDB21 dataset. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from datasets.video_data_utils import gen_dataset _JHMDB21_TRINITY_LIST_DIR = '/data/rgirdhar/Data2/Projects/2016/002_VideoRepresentation/StandardData/002_JHMDB/Processed/Lists/train_test_lists/' _JHMDB21_TRINITY_POSE_LABEL_DIR = '/data/rgirdhar/Data2/Projects/2016/002_VideoRepresentation/StandardData/002_JHMDB/Processed/Features/001_CPM_Pose/' _JHMDB21_TRINITY_OBJECTS_LABEL_DIR = '' def get_split(split_name, dataset_dir, file_pattern=None, reader=None, **kwargs): # dataset_list_dir=_JHMDB21_TRINITY_LIST_DIR, # modality='rgb', num_samples=1, # split_id=1, **kwargs): _NUM_CLASSES = 21 # There are no pose labels, but need to keep this to load models from MPII # trained # Also, now the processing can still avoided by having no loss on pose _NUM_POSE_KEYPOINTS = 16 _LIST_FN = lambda split, id: \ '%s/%s_split%d.txt' % ( kwargs['dataset_list_dir'] if 'dataset_list_dir' in kwargs else _JHMDB21_TRINITY_LIST_DIR, split, id) kwargs['num_pose_keypoints'] = _NUM_POSE_KEYPOINTS kwargs['num_classes'] = _NUM_CLASSES kwargs['list_fn'] = _LIST_FN return gen_dataset(split_name, dataset_dir, file_pattern, reader, 
pose_dataset_dir=_JHMDB21_TRINITY_POSE_LABEL_DIR, objects_dataset_dir=_JHMDB21_TRINITY_OBJECTS_LABEL_DIR, **kwargs), _NUM_POSE_KEYPOINTS # modality, num_samples, split_id, # _NUM_CLASSES, _LIST_FN, **kwargs), _NUM_POSE_KEYPOINTS ================================================ FILE: src/datasets/mpii.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import glob import tensorflow as tf slim = tf.contrib.slim _FILE_PATTERN = 'mpii_%s_*.tfrecord' SPLITS_TO_SIZES = {'trainval_train': 8219, 'trainval_val': 6988, 'trainval': 15207, # 8219 + 6988 'test': 5709} _NUM_CLASSES = 393 # activities _NUM_POSE_KEYPOINTS = 16 _ITEMS_TO_DESCRIPTIONS = { 'image': 'A color image of varying size.', 'label': 'A pose representation, [x1,y1,is_visible1,...]', } def _tfrecord_file_pattern_to_list(pattern): res = glob.glob(pattern) return sorted(res) def get_split(split_name, dataset_dir, file_pattern=None, reader=None): """Gets a dataset tuple with instructions for reading flowers. Args: split_name: A train/validation split name. dataset_dir: The base directory of the dataset sources. file_pattern: The file pattern to use when matching the dataset sources. It is assumed that the pattern contains a '%s' string so that the split name can be inserted. reader: The TensorFlow reader type. Returns: A `Dataset` namedtuple. Raises: ValueError: if `split_name` is not a valid train/validation split. """ if split_name not in SPLITS_TO_SIZES: raise ValueError('split name %s was not recognized.' 
% split_name) if not file_pattern: file_pattern = _FILE_PATTERN file_pattern = os.path.join(dataset_dir, file_pattern % split_name) # The following is important to ensure the files are read in order, because # otherwise test time output can be generated in any random order file_pattern = _tfrecord_file_pattern_to_list(file_pattern) # Allowing None in the signature so that dataset_factory can use the default. if reader is None: reader = tf.TFRecordReader keys_to_features = { 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), 'image/format': tf.FixedLenFeature((), tf.string, default_value='png'), 'image/class/pose': tf.VarLenFeature(dtype=tf.int64), 'image/class/action_label': tf.FixedLenFeature( (), tf.int64, default_value=tf.zeros([], dtype=tf.int64)), 'image/height': tf.FixedLenFeature( (), tf.int64, default_value=tf.zeros([], dtype=tf.int64)), 'image/width': tf.FixedLenFeature( (), tf.int64, default_value=tf.zeros([], dtype=tf.int64)), } items_to_handlers = { 'image': slim.tfexample_decoder.Image(), 'pose': slim.tfexample_decoder.Tensor('image/class/pose'), 'action_label': slim.tfexample_decoder.Tensor('image/class/action_label'), 'im_ht': slim.tfexample_decoder.Tensor('image/height'), 'im_wd': slim.tfexample_decoder.Tensor('image/width'), } decoder = slim.tfexample_decoder.TFExampleDecoder( keys_to_features, items_to_handlers) labels_to_names = None return slim.dataset.Dataset( data_sources=file_pattern, reader=reader, decoder=decoder, num_samples=SPLITS_TO_SIZES[split_name], items_to_descriptions=_ITEMS_TO_DESCRIPTIONS, num_classes=_NUM_CLASSES, labels_to_names=labels_to_names), _NUM_POSE_KEYPOINTS ================================================ FILE: src/datasets/video_data_utils.py ================================================ """Provides data for the UCF101 dataset. 
""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import numpy as np import tensorflow as tf import sys from datasets import dataset_utils from datasets.image_read_utils import _read_from_disk_spatial, \ _decode_from_string, _read_from_disk_temporal, _get_frame_sublist, \ _read_from_disk_pose from tensorflow.python.platform import tf_logging as logging from custom_ops.custom_ops_factory import read_json_pose, read_sparse_label slim = tf.contrib.slim class PreReadTextLineReader(tf.ReaderBase): @staticmethod def read(lines_queue): # just return the line from this queue. # The queue will be randomized if training and not if not. # Standard tf.TextLineReader will open the file and return line by line, we # don't want that, but want to randomize the whole file. Hence, this solves # this by first reading the whole file into the queue and then just picking # stuff from the queue. video_information = lines_queue.dequeue() return [video_information, video_information] # make the video_info as the # key for this datapoint as # well def decode_train_file_line(line, input_file_style='3-col', input_file_style_label='one-label'): start_frame = 0 if input_file_style == '3-col': fpath, nframes, label = tf.decode_csv( line, record_defaults=[[''], [-1], ['']], field_delim=' ') elif input_file_style == '4-col': fpath, start_frame, nframes, label = tf.decode_csv( line, record_defaults=[[''], [-1], [-1], ['']], field_delim=' ') else: raise ValueError('Unknown input file style: {0}'.format( input_file_style)) if input_file_style_label == 'one-label': label = tf.string_to_number(label, out_type=tf.int32) label.set_shape(()) elif input_file_style_label.startswith('multi-label'): nclasses = int(input_file_style_label[len('multi-label'):]) label = read_sparse_label(label, nclasses) label.set_shape((nclasses,)) return fpath, start_frame, nframes, label def getReaderFn(num_samples, modality='rgb', dataset_dir='', 
randomFromSegmentStyle=None, input_file_style='3-col', input_file_style_label='one-label'): def readerFn(): class reader_func(tf.ReaderBase): @staticmethod # def read(filename_queue): def read(value): # value = filename_queue.dequeue() fpath, start_frame, nframes, label = decode_train_file_line( value, input_file_style, input_file_style_label) # TODO(rgirdhar): Release the file_prefix='', file_zero_padding=4, # file_index=1 options to the bash script # TODO: Fix the optical_flow_frame number... optical_flow_frames = 1 frame_sublist = _get_frame_sublist(0, nframes, num_samples, optical_flow_frames, randomFromSegmentStyle=randomFromSegmentStyle) # frame_sublist = tf.Print(frame_sublist, frame_sublist, "frame sublist:") if modality == 'rgb': assert(len(dataset_dir) >= 1) image_buffer = _read_from_disk_spatial( fpath, nframes, num_samples=num_samples, start_frame=start_frame, file_prefix='image', file_zero_padding=5, file_index=1, dataset_dir=dataset_dir[0], frame_sublist=frame_sublist, randomFromSegmentStyle=randomFromSegmentStyle) elif modality.startswith('flow'): assert(len(dataset_dir) >= 1) optical_flow_frames = int(modality[4:]) image_buffer = _read_from_disk_temporal( fpath, nframes, num_samples=num_samples, start_frame=start_frame, optical_flow_frames=optical_flow_frames, file_prefix='flow', file_zero_padding=5, file_index=1, dataset_dir=dataset_dir[0], frame_sublist=frame_sublist, randomFromSegmentStyle=randomFromSegmentStyle) elif modality.startswith('rgb+flow'): assert(len(dataset_dir) >= 2) # in this case, fix the step for both the streams to ensure correspondence optical_flow_frames = int(modality[8:]) rgb_image_buffer = _read_from_disk_spatial( fpath, nframes, num_samples=num_samples, start_frame=start_frame, file_prefix='image', file_zero_padding=5, file_index=1, dataset_dir=dataset_dir[0], frame_sublist=frame_sublist) flow_image_buffer = _read_from_disk_temporal( fpath, nframes, num_samples=num_samples, start_frame=start_frame, 
optical_flow_frames=optical_flow_frames, file_prefix='flow', file_zero_padding=5, file_index=1, dataset_dir=dataset_dir[1], frame_sublist=frame_sublist) image_buffer = zip(rgb_image_buffer, flow_image_buffer) image_buffer = [[el[0]] + el[1] for el in image_buffer] elif modality.startswith('pose'): assert(len(dataset_dir) >= 1) if modality.startswith('posejson'): pose_frames = int(modality[8:]) file_ext = '.json' elif modality.startswith('pose'): pose_frames = int(modality[4:]) file_ext = '.jpg' image_buffer = _read_from_disk_pose( fpath, nframes, num_samples=num_samples, start_frame=start_frame, pose_frames=pose_frames, file_prefix='image', file_zero_padding=5, file_index=1, dataset_dir=dataset_dir[0], frame_sublist=frame_sublist, randomFromSegmentStyle=randomFromSegmentStyle, file_ext=file_ext) else: logging.error('Unknown modality %s\n' % modality) raise ValueError() return [image_buffer, label, fpath, frame_sublist, start_frame] return reader_func return readerFn def decoderFn( reader, num_samples=1, modality='rgb', dataset_dir='', randomFromSegmentStyle=True, num_pose_keypoints=16, pose_dataset_dir=None, num_object_catagories=80, objects_dataset_dir=None): class decoder_func(slim.data_decoder.DataDecoder): @staticmethod def list_items(): return ['image', 'action_label', 'pose', 'im_ht', 'im_wd', 'objects'] @staticmethod def decode(data, items): out = {} # Arguments: # data: Can be 3-col or 4-col CSV. A 3-col would look like "filepath # nframes class_id", 4-col will be similar for Charades like dataset # items: The different items to be returned. 
def count_frames_file(fpath, frameLevel=True):
  """Counts the samples described by a list file.

  Args:
    fpath: path of a text file with one whitespace-separated record per line.
    frameLevel: if True, sum the integer found in the second column of every
      record; otherwise count the records.

  Returns:
    The integer total.
  """
  res = 0
  with open(fpath, 'r') as fin:
    for line in fin:
      fields = line.split()
      if not fields:
        # A blank line describes no sample; indexing its columns would fail.
        continue
      res += int(fields[1]) if frameLevel else 1
  return res


def gen_dataset(split_name, dataset_dir, file_pattern=None, reader=None,
                pose_dataset_dir=None, objects_dataset_dir=None,
                modality='rgb', num_samples=1,
                split_id=1, num_classes=0, list_fn=None,
                input_file_style='3-col', randomFromSegmentStyle=None,
                num_pose_keypoints=16, num_object_catagories=80,
                input_file_style_label='one-label'):
  """Builds a slim Dataset whose data sources are the lines of a list file.

  Args:
    split_name: 'train' or 'test'.
    dataset_dir: data directory; passed on wrapped in a one-element list.
    file_pattern: not used by this function.
    reader: optional zero-argument factory whose result is handed to
      `decoderFn`; defaults to `getReaderFn(...)`.
    pose_dataset_dir: forwarded to `decoderFn`.
    objects_dataset_dir: forwarded to `decoderFn`.
    modality: forwarded to the reader and decoder.
    num_samples: frames sampled per video; when it is 1 the dataset size is
      the summed frame count of the list file, otherwise the number of records.
    split_id: forwarded to `list_fn`.
    num_classes: number of classes reported by the Dataset.
    list_fn: callable `(split_name, split_id) -> list file path`.
    input_file_style: forwarded to the default reader.
    randomFromSegmentStyle: forwarded to the reader and decoder.
    num_pose_keypoints: forwarded to `decoderFn`.
    num_object_catagories: forwarded to `decoderFn`.
    input_file_style_label: ['one-label'/'multi-label%d' % integer]

  Returns:
    A `slim.dataset.Dataset`.

  Raises:
    ValueError: if `split_name` is not 'train' or 'test'.
  """
  # Validate first so a bad split name is reported before any file is opened.
  if split_name not in ('train', 'test'):
    raise ValueError('split name %s was not recognized.' % split_name)

  _ITEMS_TO_DESCRIPTIONS = {
    'image': 'A [? x ? x 3] color image.',
    'label': 'A single integer between 0 and %d' % num_classes,
  }
  LIST_FILE = list_fn(split_name, split_id)
  logging.info('Using file %s' % LIST_FILE)
  # Only the requested split is counted, so the list file of the other split
  # neither has to exist nor gets scanned.
  split_size = count_frames_file(LIST_FILE, frameLevel=(num_samples == 1))
  with open(LIST_FILE, 'r') as fin:
    # Don't randomize here: testing runs without randomizing, and the queue
    # randomizes for training anyway. Blank lines are dropped so the sources
    # agree with what count_frames_file counted.
    data_sources = [line for line in fin.read().splitlines() if line.strip()]

  # Allowing None in the signature so that dataset_factory can use the default.
  if not reader:
    reader = getReaderFn(num_samples, modality, [dataset_dir],
                         randomFromSegmentStyle, input_file_style,
                         input_file_style_label)

  labels_to_names = None

  return slim.dataset.Dataset(
      data_sources=data_sources,
      reader=lambda: PreReadTextLineReader,
      decoder=decoderFn(reader(), num_samples, modality, [dataset_dir],
                        randomFromSegmentStyle, num_pose_keypoints,
                        pose_dataset_dir, num_object_catagories,
                        objects_dataset_dir),
      num_samples=split_size,
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=num_classes,
      labels_to_names=labels_to_names)
def calc_pr_ovr_noref(counts, out):
  """
  [P, R, score, ap] = calc_pr_ovr_noref(counts, out)
  Input    :
    counts : number of occurrences of this word in the ith image
             (binarized here: any positive count is a positive example)
    out    : score for this image
  Output   :
    P, R   : precision and recall, in order of decreasing score
    score  : score which corresponds to the particular precision and recall
    ap     : average precision, as returned by voc_ap
  """
  # binarize counts
  counts = (np.asarray(counts) > 0).astype(np.float64)
  out = np.asarray(out)

  # Rank the samples by decreasing score.
  order = np.argsort(out)[::-1]
  score = out[order].astype(np.float64)

  # After binarization every sample is either a true or a false positive, so
  # the false-positive indicator is simply the complement (no Python loop, and
  # no dependence on the Python 2-only xrange).
  tp = counts[order]
  fp = 1.0 - tp

  cum_tp = np.cumsum(tp)
  P = cum_tp / (cum_tp + np.cumsum(fp))
  numinst = np.sum(counts)
  R = cum_tp / numinst
  ap = voc_ap(R, P)
  return P, R, score, ap
""" rec = rec.reshape(rec.size,1); prec = prec.reshape(prec.size,1) z = np.zeros((1,1)); o = np.ones((1,1)); mrec = np.vstack((z, rec, o)) mpre = np.vstack((z, prec, z)) for i in range(len(mpre)-2, -1, -1): mpre[i] = max(mpre[i], mpre[i+1]) I = np.where(mrec[1:] != mrec[0:-1])[0]+1; ap = 0; for i in I: ap = ap + (mrec[i] - mrec[i-1])*mpre[i]; return ap def compute_precision_score_mapping(thresh, prec, score): ind = np.argsort(thresh); thresh = thresh[ind]; prec = prec[ind]; for i in xrange(1, len(prec)): prec[i] = max(prec[i], prec[i-1]); indexes = np.unique(thresh, return_index=True)[1] indexes = np.sort(indexes); thresh = thresh[indexes] prec = prec[indexes] thresh = np.vstack((min(-1000, min(thresh)-1), thresh[:, np.newaxis], max(1000, max(thresh)+1))); prec = np.vstack((prec[0], prec[:, np.newaxis], prec[-1])); f = interp1d(thresh[:,0], prec[:,0]) val = f(score) return val def human_agreement(gt, K): """ function [prec, recall] = human_agreement(gt, K) """ c = np.zeros((K+1,1), dtype=np.float64) # namespace = globals().copy() # namespace.update(locals()) # code.interact(local=namespace) for i in xrange(len(gt)): if gt[i] K+1: # print 'warning: ' # maxRun = K+1; # for i in xrange(maxRun): # c[gt[i]] += 1; c = c/np.sum(c); ind = np.array(range(len(c)))[:, np.newaxis] n_tp = sum(ind*(ind-1)*c)/K; n_fp = c[1]/K; numinst = np.sum(c * (K-1) * ind) / K; prec = n_tp / (n_tp+n_fp); recall = n_tp / numinst; return prec, recall #follows from http://arxiv.org/pdf/1312.4894v2.pdf (Sec 4.2) def compute_warpstyle_pr(gtLabel, predMat, topK): assert gtLabel.shape == predMat.shape, 'gt {}; pred {}'.format(gtLabel.shape, predMat.shape) gtLabel = gtLabel.astype(np.float64) predMat = predMat.astype(np.float64) numTags = gtLabel.shape[1]; numIm = gtLabel.shape[0]; #first look at topK predictions per image topPreds = np.zeros_like(predMat); for imInd in range(numIm): topKInds = im_utils.maxk(predMat[imInd,...], topK); topPreds[imInd, topKInds] = 1; # tb.print_stack();namespace = 
def print_benchmark_latex(evalFile, vocab = None, sortBy = "words", \
  printWords = False, printPos = True, printAgg = False, possOrder=None):
  # Prints two LaTeX table rows of per-POS means (in percent): first the mean
  # AP per POS tag followed by the overall mean, then the mean precision at
  # human recall per POS tag followed by the overall mean.
  # NaN entries are overwritten with 0 in the loaded arrays before averaging.
  # sortBy, printWords, printPos and printAgg are not used in this function.
  # NOTE: written for Python 2 (print statements; a trailing comma keeps the
  # row on one line).
  #evalFile has the following ['details', 'agg', 'vocab', 'imdb']
  evalData = sg_utils.load_variables(evalFile);
  if vocab==None:
    vocab = evalData['vocab'];
  if 'details' in evalData:
    details = evalData['details'];
  else:
    details = evalData;
  ap = details['ap'];
  prec_at_human_rec = details['prec_at_human_rec'];
  # NOTE(review): human_prec reads the same key as prec_at_human_rec, so both
  # names refer to the same data - confirm whether another key was intended.
  human_prec = details['prec_at_human_rec'];
  words = vocab['words'];
  ind = 0;
  if possOrder is None:
    possOrder = ['NN', 'VB', 'JJ', 'DT', 'PRP', 'IN', 'other']
  print ' '.join(possOrder);
  # Row 1: mean AP per POS tag.
  for pos in possOrder:
    ind = [i for i,x in enumerate(vocab['poss']) if pos == x]
    ind = np.asarray(ind,dtype=np.int32)
    if any( np.isnan(ap[0,ind] )):
      #print 'nan numbers ... skipping them for mean'
      print 'nan numbers ... setting them to zero for mean stats'
      ap[0, ind[np.where(np.isnan(ap[0, ind]))]] = 0;
    print '%.1f &'%(100*np.mean(ap[0,ind])),
  print '%.1f & &'%(100*np.mean(ap[0, :]))
  # Row 2: mean precision at human recall per POS tag.
  for pos in possOrder:
    ind = [i for i,x in enumerate(vocab['poss']) if pos == x]
    ind = np.asarray(ind,dtype=np.int32)
    if any( np.isnan(prec_at_human_rec[0,ind] )) or \
        any( np.isnan(human_prec[0,ind] )) :
      #print 'nan numbers ... skipping them for mean'
      print 'nan numbers ... setting them to zero for mean stats'
      prec_at_human_rec[0, ind[np.where(np.isnan(prec_at_human_rec[0, ind]))]] = 0;
      human_prec[0, ind[np.where(np.isnan(human_prec[0, ind]))]] = 0;
    print '%.1f &'%(100*np.mean(prec_at_human_rec[0,ind])),
  print '%.1f \\\\'%(100*np.mean(prec_at_human_rec[0, :]))
def compute_map(all_logits, all_labels):
  """Compute one-vs-rest average precision per class and its mean.

  Args:
    all_logits: 2-D array indexed [sample, class]; column `cid` is used as the
      score of class `cid`.
    all_labels: array of class ids, compared element-wise against each `cid`.

  Returns:
    A tuple `(mAP, APs)`. `APs` lists the average precision of every class
    that has at least one positive sample, in class order; classes without
    positives are reported on stdout and skipped. `mAP` is the mean of `APs`,
    or NaN when `APs` is empty.
  """
  num_classes = all_logits.shape[1]
  APs = []
  for cid in range(num_classes):
    this_logits = all_logits[:, cid]
    this_labels = (all_labels == cid).astype('float32')
    if np.sum(this_labels) == 0:
      print('No positive videos for class {}. Ignoring...'.format(cid))
      continue
    _, _, _, ap = calc_pr_ovr_noref(this_labels, this_logits)
    APs.append(ap)
  # np.mean([]) would emit a RuntimeWarning and return NaN implicitly; make
  # the "nothing to average" result explicit instead.
  mAP = np.mean(APs) if APs else float('nan')
  return mAP, APs
def mkdir_p(dpath):
  """Create directory `dpath` with any missing parents, like `mkdir -p`.

  An already existing directory is not an error. Any other failure (e.g. no
  permission, or a path component that is a regular file) is raised as
  OSError instead of being silently ignored.
  """
  try:
    os.makedirs(dpath)
  except OSError:
    # makedirs also raises when the leaf already exists; only that case is
    # benign.
    if not os.path.isdir(dpath):
      raise
num_pose_keypoints=num_pose_keypoints, is_training=False, cfg=cfg) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## provider = slim.dataset_data_provider.DatasetDataProvider( dataset, shuffle=False, num_epochs=1, common_queue_capacity=2 * cfg.TEST.BATCH_SIZE, common_queue_min=cfg.TEST.BATCH_SIZE) [image, action_label] = get_input(provider, cfg, ['image', 'action_label']) # label -= FLAGS.labels_offset ##################################### # Select the preprocessing function # ##################################### preprocessing_name = cfg.MODEL_NAME image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) eval_image_size = cfg.TRAIN.IMAGE_SIZE or network_fn.default_image_size image = image_preprocessing_fn( image, eval_image_size, eval_image_size, resize_side_min=cfg.TRAIN.RESIZE_SIDE, resize_side_max=cfg.TRAIN.RESIZE_SIDE) # additional preprocessing as required if 'flips' in args.preprocs: tf.logging.info('Flipping all images while testing!') image = tf.stack([ tf.image.flip_left_right(el) for el in tf.unstack(image)]) images, action_labels = tf.train.batch( [image, action_label], batch_size=cfg.TEST.BATCH_SIZE, # following is because if there are more, the order of batch can be # different due to different speed... so avoid that # http://stackoverflow.com/questions/35001027/does-batching-queue-tf-train-batch-not-preserve-order#comment57731040_35001027 # num_threads=1 if args.save else cfg.NUM_PREPROCESSING_THREADS, num_threads=1, # The above was too unsafe as sometimes I forgot --save # and it would just randomize the whole thing. # This is very important so # shifting to this by default. Better safe than sorry. 
allow_smaller_final_batch=True if cfg.TEST.VIDEO_FRAMES_PER_VIDEO == 1 else False, # because otherwise we need to # average logits over the frames, # and that needs first dimensions # to be fully defined capacity=5 * cfg.TEST.BATCH_SIZE) #################### # Define the model # #################### logits, end_points = network_fn(images) end_points['images'] = images if cfg.TEST.MOVING_AVERAGE_DECAY: variable_averages = tf.train.ExponentialMovingAverage( cfg.TEST.MOVING_AVERAGE_DECAY, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() predictions = tf.argmax(logits, 1) if cfg.TRAIN.LOSS_FN_ACTION.startswith('multi-label'): logits = tf.sigmoid(logits) else: logits = tf.nn.softmax(logits, -1) labels = tf.squeeze(action_labels) end_points['labels'] = labels # Define the metrics: names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 'Accuracy': slim.metrics.streaming_accuracy(predictions, labels), # 'Recall@5': slim.metrics.streaming_recall_at_k( # logits, labels, 5), }) # Print the summaries to screen. for name, value in names_to_values.iteritems(): summary_name = 'eval/%s' % name op = tf.summary.scalar(summary_name, value, collections=[]) op = tf.Print(op, [value], summary_name) tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) # TODO(sguada) use num_epochs=1 if cfg.TEST.MAX_NUM_BATCHES: num_batches = cfg.TEST.MAX_NUM_BATCHES else: # This ensures that we make a single pass over all of the data. 
num_batches = math.ceil(dataset.num_samples / float(cfg.TEST.BATCH_SIZE)) # just test the latest trained model checkpoint_path = cfg.TEST.CHECKPOINT_PATH or train_dir if tf.gfile.IsDirectory(checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) else: checkpoint_path = checkpoint_path checkpoint_step = int(checkpoint_path.split('-')[-1]) tf.logging.info('Evaluating %s' % checkpoint_path) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True summary_writer = tf.summary.FileWriter(logdir=train_dir) if cfg.TEST.EVAL_METRIC == 'mAP' or args.save or args.ept: from tensorflow.python.training import supervisor from tensorflow.python.framework import ops import h5py saver = tf.train.Saver(variables_to_restore) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=None, summary_op=None, summary_writer=summary_writer, global_step=None, saver=None) all_labels = [] end_points['logits'] = logits end_points_to_save = args.ept + ['logits'] end_points_to_save = list(set(end_points_to_save)) all_feats = dict([(ename, []) for ename in end_points_to_save]) with sv.managed_session( '', start_standard_services=False, config=config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) for j in tqdm(range(int(math.ceil(num_batches)))): feats = sess.run([ action_labels, [end_points[ename] for ename in end_points_to_save]]) all_labels.append(feats[0]) for ept_id, ename in enumerate(end_points_to_save): all_feats[ename].append(feats[1][ept_id]) APs = [] all_labels = np.concatenate(all_labels) if args.save or args.ept: res_outdir = os.path.join(train_dir, 'Features/') mkdir_p(res_outdir) outfpath = args.outfpath or os.path.join( res_outdir, 'features_ckpt_{}_{}.h5'.format( cfg.TEST.DATASET_SPLIT_NAME, checkpoint_step)) print('Saving the features/logits/labels to {}'.format(outfpath)) with h5py.File(outfpath, 'a') as fout: for ename in end_points_to_save: if ename in fout: 
def gen_losses(
    labels_action, logits_action, loss_type_action, num_action_classes,
    action_loss_wt,
    labels_pose, logits_pose, loss_type_pose, labels_pose_valid, pose_loss_wt,
    end_points, cfg):
  """Add the pose and action losses to the `tf.losses` collection.

  Nothing is returned; losses are registered through `tf.losses.*` and the
  per-pixel pose loss mask is stored in `end_points['PoseLossMask']`.

  Args:
    labels_action: action labels (class ids for 'softmax-xentropy'/'l2',
      multi-hot for the 'multi-label*' losses).
    logits_action: action logits.
    loss_type_action: one of 'softmax-xentropy', 'l2', 'multi-label',
      'multi-label-2' or '' (no action loss).
    num_action_classes: number of classes for one-hot encoding.
    action_loss_wt: weight for the 'softmax-xentropy' and 'l2' action losses.
    labels_pose: target pose heatmaps; resized to the logits' spatial size
      when the static shapes differ.
    logits_pose: predicted pose heatmaps; the pose loss is skipped when the
      last dimension is 0.
    loss_type_pose: 'l2' or '' (pose loss disabled).
    labels_pose_valid: boolean validity flags, unstacked on the last axis in
      step with the heatmap channels; invalid channels incur no loss.
    pose_loss_wt: weight applied to the summed pose loss.
    end_points: dict that receives 'PoseLossMask'.
    cfg: config; `cfg.TRAIN.LOSS_FN_POSE_SAMPLED` selects the sampled loss.

  Raises:
    ValueError: for an unknown pose or action loss type.
  """
  with tf.name_scope('LossFn'):
    if loss_type_pose and logits_pose.get_shape().as_list()[-1] > 0:
      with tf.name_scope('PoseLoss'):
        # Loss over the pose
        if labels_pose.get_shape().as_list() != \
            logits_pose.get_shape().as_list():
          tf.logging.info('Sizes of logits {} and labels {} are different. '
                          'Change the cfg.FINAL_POSE_HMAP_SIDE to avoid '
                          'a resize operation.'.format(
                            logits_pose.get_shape().as_list(),
                            labels_pose.get_shape().as_list()))
          labels_pose = tf.image.resize_images(
            labels_pose, logits_pose.get_shape().as_list()[-3:-1])
        # Unknown channels must not incur any loss. Zeroing them out with a
        # custom op would need a gradient definition, so the channels are
        # handled one by one instead.
        with tf.name_scope('ValidPoseLoss'):
          channels_valid = tf.unstack(labels_pose_valid, axis=-1)
          channels_logits = tf.unstack(logits_pose, axis=-1)
          channels_labels = tf.unstack(labels_pose, axis=-1)
          loss_elements = []
          pose_loss_mask = []
          # The names are bound in the same order as the zipped lists. They
          # used to be swapped, which made the "positive pixels" part of the
          # sampling mask depend on the predictions rather than the targets.
          for valid, lgt, lbl in zip(
              channels_valid, channels_logits, channels_labels):
            if cfg.TRAIN.LOSS_FN_POSE_SAMPLED:
              # To make it harder: balance the many empty pixels against the
              # few positive ones.
              neg_areas = tf.equal(lbl, 0)
              pos_areas = tf.greater(lbl, 0)
              total_area = lbl.shape.num_elements()
              pos_area_ratio = tf.reduce_sum(
                tf.to_float(pos_areas)) / total_area
              # select that much of neg area
              neg_areas_selected = tf.to_float(tf.less(
                tf.random_uniform(tf.shape(lbl), 0, 1.0),
                pos_area_ratio)) * tf.to_float(neg_areas)
              # keep all positive (ground-truth) pixels
              mask = tf.to_float(tf.greater(
                neg_areas_selected + tf.to_float(pos_areas), 0))
              lgt = lgt * mask  # just do loss over this subset
              lbl = lbl * mask
              loss_val = 0.5 * tf.reduce_mean(
                tf.square(lbl - lgt), axis=[1, 2])
            else:
              mask = tf.ones(tf.shape(lbl))
              loss_val = 0.5 * tf.reduce_sum(
                tf.square(lbl - lgt), axis=[1, 2]) / tf.reduce_sum(mask)
            pose_loss_mask.append(tf.expand_dims(mask, -1))
            if loss_type_pose == 'l2':
              # samples whose keypoint is not labelled contribute zero
              L = tf.reduce_mean(tf.where(
                valid, loss_val, [0] * valid.get_shape().as_list()[0]))
            elif loss_type_pose == '':
              L = 0
            else:
              raise ValueError('Invalid loss {}'.format(loss_type_pose))
            loss_elements.append(L)
          end_points['PoseLossMask'] = tf.concat(pose_loss_mask, axis=-1)
          tot_loss = tf.reduce_sum(loss_elements, name='ValidPoseEucLoss')
          tf.losses.add_loss(tot_loss * pose_loss_wt)
    with tf.name_scope('ActionLoss'):
      # TODO (rgirdhar): Add the option of having -1 label, so ignore that one
      if loss_type_action == 'softmax-xentropy':
        tf.losses.softmax_cross_entropy(
          onehot_labels=slim.one_hot_encoding(
            labels_action, num_action_classes),
          logits=logits_action,
          weights=action_loss_wt)
      elif loss_type_action == 'l2':
        tf.losses.mean_squared_error(
          labels=slim.one_hot_encoding(
            labels_action, num_action_classes),
          predictions=logits_action,
          weights=action_loss_wt)
      elif loss_type_action == 'multi-label':
        labels_action = tf.to_float(labels_action)
        loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
          targets=labels_action, logits=logits_action, pos_weight=10))
        tf.losses.add_loss(loss)
      elif loss_type_action == 'multi-label-2':
        tf.losses.sigmoid_cross_entropy(
          multi_class_labels=labels_action, logits=logits_action)
      elif loss_type_action == '':
        tf.logging.info('No loss on action')
      else:
        raise ValueError('Unrecognized loss {}'.format(loss_type_action))
def _replay_augmentation(H, aug_info):
  """Re-apply to heatmap `H` the crop and flip recorded for the image.

  `aug_info` supplies 'image_shape' (shape of the image the crop was taken
  from), 'crop_info' (entries 0/1 are used as the crop start and 2/3 as the
  crop size, scaled by the vertical/horizontal size ratio respectively) and
  'whether_flip' (boolean tensor).
  """
  with tf.name_scope('ReplayAugmentation'):
    crop = aug_info['crop_info']
    image_shape = aug_info['image_shape']
    # The heatmap may have a different resolution than the image, so the
    # crop box is rescaled before it is applied.
    scale_x = tf.to_float(tf.shape(H)[-2]) / tf.to_float(image_shape[-2])
    scale_y = tf.to_float(tf.shape(H)[-3]) / tf.to_float(image_shape[-3])
    depth = tf.shape(H)[-1]
    begin = [tf.to_float(crop[0]) * scale_y, tf.to_float(crop[1]) * scale_x]
    size = [tf.to_float(crop[2]) * scale_y, tf.to_float(crop[3]) * scale_x]
    # 1. Crop (all channels are kept)
    H = tf.slice(
      H,
      tf.concat([tf.to_int32(begin), [0,]], axis=-1),
      tf.concat([tf.to_int32(size), [depth,]], axis=-1))
    # 2. Flip
    H = tf.cond(
      aug_info['whether_flip'],
      lambda: tf.image.flip_left_right(H),
      lambda: H)
  return H


def _get_other_items(provider, stuff, existing_items, new_items):
  """Return the values for `new_items`, reusing already fetched ones.

  `stuff[i]` is the value previously fetched for `existing_items[i]`. Items
  that are not in `existing_items` are requested from `provider` one at a
  time, in the order they appear in `new_items`.
  """
  def _lookup(name):
    if name not in existing_items:
      return provider.get([name])[0]
    return stuff[existing_items.index(name)]

  return [_lookup(name) for name in new_items]
# if x1 and y1 are both -1, that point is not visible/labeled image = _resize_if_needed(image, cfg.MAX_INPUT_IMAGE_SIZE) if cfg.INPUT.INPUT_IMAGE_FORMAT.startswith('rendered-pose') or \ cfg.INPUT.INPUT_IMAGE_FORMAT.startswith('pose-glimpse'): pose_label, orig_im_ht, orig_im_wd = _get_other_items( provider, stuff, items, ['pose', 'im_ht', 'im_wd']) # pose_label = tf.Print(pose_label, [pose_label], "Pose Label: ") pose_label_was_list = True if not isinstance(pose_label, list): pose_label_was_list = False pose_label = [pose_label] if cfg.INPUT.INPUT_IMAGE_FORMAT.startswith('rendered-pose'): rendered_pose = tf.stack([render_pose( pose_label[i], orig_im_ht, orig_im_wd, # TODO: the following tf.shape is going to read the image irrespective # of whether needed or not to compute shape. However the code isn't # slow so not worrying about it at the moment. But fix it. tf.cast(tf.shape(image)[-2], tf.int64), out_type=cfg.INPUT.INPUT_IMAGE_FORMAT_POSE_RENDER_TYPE) for i in range(len(pose_label))]) rendered_pose = tf.image.resize_images( rendered_pose, tf.shape(image)[-3:-1]) if not pose_label_was_list: rendered_pose = rendered_pose[0] else: image_glimpse = tf.stack([extract_glimpse( image, pose_label[i], orig_im_ht, orig_im_wd, cfg.TRAIN.IMAGE_SIZE if cfg.INPUT.POSE_GLIMPSE_RESIZE else -1, cfg.INPUT.POSE_GLIMPSE_CONTEXT_RATIO, cfg.INPUT.POSE_GLIMPSE_PARTS_KEEP) for i in range(len(pose_label))]) if cfg.INPUT.INPUT_IMAGE_FORMAT.startswith('rendered-objects'): objects_label, orig_im_ht, orig_im_wd = _get_other_items( provider, stuff, items, ['objects', 'im_ht', 'im_wd']) # pose_label = tf.Print(pose_label, [pose_label], "Pose Label: ") rendered_objects = tf.stack([render_objects( objects_label[i], orig_im_ht, orig_im_wd, cfg.TRAIN.IMAGE_SIZE, out_channels=80) for i in range(len(objects_label))]) # Final output if cfg.INPUT.INPUT_IMAGE_FORMAT == 'rendered-pose': image = rendered_pose # debugging # image = tf.tile(tf.reduce_mean( # image, axis=-1, keep_dims=True), [1, 1, 1, 3]) 
def train_preprocess_pipeline(provider, cfg, network_fn, num_pose_keypoints,
                              image_preprocessing_fn):
  """Build the training input tensors: frames, pose heatmaps and labels.

  Reads image, pose, original image size and action label through
  `get_input`, runs `image_preprocessing_fn` on the frames and, when a pose
  loss is configured, renders pose heatmaps and replays the same crop/flip on
  them.

  Args:
    provider: data provider passed on to `get_input`.
    cfg: config object (TRAIN.*, INPUT.VIDEO.MODALITY, EPS, ... are read).
    network_fn: only `default_image_size` is used, as a fallback when
      `cfg.TRAIN.IMAGE_SIZE` is falsy.
    num_pose_keypoints: number of heatmap channels per frame; 0 disables the
      pose branch.
    image_preprocessing_fn: preprocessing function; it is given an empty dict
      as `preproc_info`, which is later handed to `_replay_augmentation`
      (presumably filled with the crop/flip that was applied -- confirm
      against the preprocessing module).

  Returns:
    `(image, pose_label_hmap, pose_label_valid, action_label)`. The two pose
    tensors are empty dummies when the pose branch is disabled.
  """
  [image, pose_label, orig_im_ht, orig_im_wd, action_label] = get_input(
    provider, cfg, ['image', 'pose', 'im_ht', 'im_wd', 'action_label'])
  # for consistency between video and image datasets, convert image datasets to
  # 1-frame videos
  if image.get_shape().ndims == 3:
    image = tf.expand_dims(image, 0)
    pose_label = [pose_label]
  train_image_size = cfg.TRAIN.IMAGE_SIZE or network_fn.default_image_size
  # joint preprocessing
  combined_preproc_flag = False
  with tf.name_scope('CombinedPreproc'):
    # Heatmaps are only needed when there are keypoints and a pose loss.
    if num_pose_keypoints > 0 and not cfg.TRAIN.LOSS_FN_POSE == '':
      combined_preproc_flag = True
      all_pose_label_hmap = []
      all_pose_label_valid = []
      # one heatmap stack per frame
      for pl in pose_label:
        pose_label_hmap, pose_label_valid = pose_to_heatmap(
          pl, orig_im_ht, orig_im_wd,
          # small enough for preproc, big enough to see
          max(200, cfg.TRAIN.FINAL_POSE_HMAP_SIDE),
          out_channels=num_pose_keypoints,
          # if needed, do using a conv layer with fixed kernel
          # would be faster on GPU
          do_gauss_blur=False,
          marker_wd_ratio=cfg.HEATMAP_MARKER_WD_RATIO)  # larger => large targets
        all_pose_label_hmap.append(pose_label_hmap)
        all_pose_label_valid.append(pose_label_valid)
      # concat on last axis for now (for preproc), will stack it (like the
      # valid labels) after that.
      pose_label_hmap = tf.concat(all_pose_label_hmap, axis=-1)
      pose_label_valid = tf.stack(all_pose_label_valid)

    # rgirdhar NOTE: This is the most expensive CPU part. My perf was super
    # slow with the output image sizes being 450x, because it'd first resize
    # the smallest dimension to 512 or so, and then take a 450 crop from that.
    # Doing that over RGB+heatmap channels was super slow, and is fixed when
    # using small sizes (now, 256 & 224 works well). Another issue was the
    # number of INTRA and INTER PARALLELIZATION THREADS, set in the train.py
    # which sped up a lot. Also saves from the machines getting stuck by
    # controlling the number of threads while giving better performance. For
    # me, the Inter=12 and Intra=4 worked well.
    preproc_info = {}
    # since images is 4D vector, need to reshape to pass it through preproc
    frames_per_video = image.get_shape().as_list()[0]
    # frames are laid side by side on the channel axis so that every frame
    # goes through the same preprocessing call
    image = tf.concat(tf.unstack(image), axis=-1)
    image = image_preprocessing_fn(
      image, train_image_size, train_image_size,
      resize_side_min=cfg.TRAIN.RESIZE_SIDE,
      resize_side_max=cfg.TRAIN.RESIZE_SIDE,
      preproc_info=preproc_info,
      modality=cfg.INPUT.VIDEO.MODALITY)  # works for image too, rgb by def
    # split the channel axis back into one entry per frame
    image = tf.stack(tf.split(
      image, frames_per_video, axis=image.get_shape().ndims-1))
    if combined_preproc_flag:
      pose_label_hmap = _replay_augmentation(pose_label_hmap, preproc_info)
      pose_label_hmap = tf.image.convert_image_dtype(pose_label_hmap, tf.float32)
      # undo any value scaling that happened while preproc
      pose_label_hmap -= tf.reduce_min(pose_label_hmap)
      pose_label_hmap /= (tf.reduce_max(pose_label_hmap) + cfg.EPS)
      # reduce the size of heatmaps to reduce memory usage in queues
      pose_label_hmap = tf.image.resize_images(
        pose_label_hmap, [cfg.TRAIN.FINAL_POSE_HMAP_SIDE,
                          cfg.TRAIN.FINAL_POSE_HMAP_SIDE])
      # the channel count is lost by the dynamic crop; restore the static
      # shape so the split below knows how many channels there are
      pose_label_hmap.set_shape([
        pose_label_hmap.get_shape().as_list()[0],
        pose_label_hmap.get_shape().as_list()[1],
        num_pose_keypoints * frames_per_video])
      pose_label_hmap = tf.stack(tf.split(
        pose_label_hmap, frames_per_video,
        axis=pose_label_hmap.get_shape().ndims-1))
    else:
      pose_label_hmap = tf.zeros((0,))  # dummy value, not used
      pose_label_valid = tf.zeros((0,))  # dummy value, not used
  return image, pose_label_hmap, pose_label_valid, action_label
' % ( final_key_name, target_shape, pretrained_wts.shape)) if pretrained_shape_squeezed[-2] != target_shape_squeezed[-2]: logging.info('Trying repeating channels to make it similar.') pretrained_wts = np.repeat( np.mean(pretrained_wts, axis=-2, keepdims=True), repeats=target_shape_squeezed[-2], axis=-2) if np.all(target_shape_squeezed == pretrained_wts.shape): logging.info('Success! Copying the hacked weights.') go_ahead = True else: logging.info('Couldnot match the weights still.') else: go_ahead = True if go_ahead: init_weights_final[final_key_name] = \ pretrained_wts vars_restored.append(final_key_name) init_weights = init_weights_final for v in vars_to_restore_names: if v not in vars_restored: logging.fatal('No weights found for %s' % v) if not ignore_missing_vars: raise ValueError() all_ops.append(slim.assign_from_values_fn(init_weights)) else: all_ops.append(assign_from_checkpoint_fn( checkpoint_path, checkpoint_variables, ignore_missing_vars=ignore_missing_vars, resize_variables=True)) def combined(sess): for op in all_ops: op(sess) return combined def assign_from_checkpoint_fn(model_path, var_list, ignore_missing_vars=False, reshape_variables=False, resize_variables=False): """Modified function from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/framework/python/ops/variables.py Mod by rgirdhar to allow for repeating the channels dimension in case a layer does not match. It's useful for setting the first layer in flow models for videos. Does this only when resize_variables is True. """ """Returns a function that assigns specific variables from a checkpoint. If ignore_missing_vars is True and no variables are found in the checkpoint it returns None. Args: model_path: The full path to the model checkpoint. To get latest checkpoint use `model_path = tf.train.latest_checkpoint(checkpoint_dir)` var_list: A list of `Variable` objects or a dictionary mapping names in the checkpoint to the corresponding variables to initialize. 
If empty or `None`, it would return `no_op(), None`. ignore_missing_vars: Boolean, if True it would ignore variables missing in the checkpoint with a warning instead of failing. reshape_variables: Boolean, if True it would automatically reshape variables which are of different shape then the ones stored in the checkpoint but which have the same number of elements. resize_variables: Boolean, if True it would repeat the channels to match the target variable dimensions Returns: A function that takes a single argument, a `tf.Session`, that applies the assignment operation. If no matching variables were found in the checkpoint then `None` is returned. Raises: ValueError: If var_list is empty. """ if not var_list: raise ValueError('var_list cannot be empty') reader = pywrap_tensorflow.NewCheckpointReader(model_path) if isinstance(var_list, dict): var_dict = var_list else: var_dict = {var.op.name: var for var in var_list} available_vars = {} for var in var_dict: if reader.has_tensor(var): go_ahead = False V = reader.get_tensor(var) ckpt_shape = list(V.shape) target_shape = var_dict[var].get_shape().as_list() if np.all(ckpt_shape == target_shape): go_ahead = True else: if resize_variables: logging.warning('Resizing to assign to variable {} to {} from {}'.format( var, var_dict[var].get_shape().as_list(), V.shape)) V = np.repeat( np.mean(V, axis=-2, keepdims=True), repeats=target_shape[-2], axis=-2) ckpt_shape = list(V.shape) if np.all(ckpt_shape == target_shape): logging.info('Was able to match shape, so restoring the var :-)') go_ahead = True else: logging.error('Was not able to match shape, not restoring it!!!') go_ahead = False else: logging.error('Found a shape mismatch. 
Set resize_var to true to ' 'do a hacky shape copy.') if go_ahead: available_vars[var] = V else: logging.warning( 'Variable %s missing in checkpoint %s', var, model_path) if not ignore_missing_vars: raise ValueError() return slim.assign_from_values_fn(available_vars) def get_special_assigns(special_assign_vars): init_wts = {} special_assign_vars = special_assign_vars.split(',') for i in range(len(special_assign_vars) / 2): var_name = special_assign_vars[2*i] file_path = special_assign_vars[2*i+1] with h5py.File(file_path, 'r') as fin: init_wts[var_name] = fin['feat'].value logging.info('Special Assign: %s with a %s array' % ( var_name, init_wts[var_name].shape)) return slim.assign_from_values_fn(init_wts) ================================================ FILE: src/restore/var_name_mapper.py ================================================ def map(var_name_mapping): map_fn = lambda x: x if var_name_mapping == 'placenet365-vgg': map_fn = placenet365_vgg_fn elif var_name_mapping == 'cuhk-action-vgg': map_fn = cuhk_action_vgg elif var_name_mapping == 'cuhk-action-tsn': map_fn = cuhk_action_tsn elif var_name_mapping == 'xiaolonw_action_vgg_hmdb': map_fn = xiaolonw_action_vgg_hmdb else: raise ValueError('Invalid var name mapping') return map_fn def placenet365_vgg_fn(var_name): final_name = var_name if final_name.split('/')[0].startswith('conv'): final_name = \ final_name.split('/')[0].split('_')[0] + '/' + final_name elif final_name.split('/')[0] == 'fc8a': final_name = final_name.replace('fc8a', 'fc8') return 'vgg_16/' + final_name + ':0' def cuhk_action_vgg(var_name): final_name = var_name if final_name.split('/')[0].startswith('conv'): final_name = \ final_name.split('/')[0].split('_')[0] + '/' + final_name elif final_name.split('/')[0].startswith('fc8'): final_name = final_name.replace(final_name.split('/')[0], 'fc8') return 'vgg_16/' + final_name + ':0' def xiaolonw_action_vgg_hmdb(var_name): final_name = var_name if final_name.split('/')[0].startswith('conv'): 
def cuhk_action_tsn(var_name):
  """Maps a checkpoint variable name to its `InceptionV2_TSN/...:0` name.

  The first path component selects the layer and the last one the parameter:
    * layers ending in `_bn` become `<layer>/BatchNorm/<param>`, with
      scale/shift/mean/variance renamed to
      gamma/beta/moving_mean/moving_variance;
    * `fc-action` becomes `Logits/Conv/<param>`;
    * everything else becomes `<layer>/Conv/<param>`.
  For layers starting with `inception` or `conv`, the character right after
  the `inception_xx` / `convx` prefix is then replaced by a `/`.

  Args:
    var_name: Variable name as stored in the checkpoint, '/'-separated.

  Returns:
    The mapped name, prefixed with `InceptionV2_TSN/` and suffixed with `:0`.
  """
  pieces = var_name.split('/')
  layer, param = pieces[0], pieces[-1]

  if layer.endswith('_bn'):
    bn_param_names = {
        'scale': 'gamma',
        'shift': 'beta',
        'mean': 'moving_mean',
        'variance': 'moving_variance',
    }
    # Unknown parameter names pass through unchanged.
    mapped = layer[:-3] + '/BatchNorm/' + bn_param_names.get(param, param)
  elif layer == 'fc-action':
    mapped = 'Logits/Conv/' + param
  else:
    mapped = layer + '/Conv/' + param

  # Split the leading block name off from the rest of the layer name.
  head = mapped.split('/')[0]
  if head.startswith('inception'):
    cut = len('inception_xx')
  elif head.startswith('conv'):
    cut = len('convx')
  else:
    cut = None
  if cut is not None:
    mapped = mapped[:cut] + '/' + mapped[cut + 1:]

  return 'InceptionV2_TSN/' + mapped + ':0'
def _configure_optimizer(learning_rate):
  """Configures the optimizer used for training.

  The optimizer type is selected by `cfg.TRAIN.OPTIMIZER` ('adam',
  'momentum', 'rmsprop' or 'sgd'); its hyper-parameters are read from the
  matching `cfg.TRAIN.*` entries.

  Args:
    learning_rate: A scalar or `Tensor` learning rate.

  Returns:
    An instance of an optimizer.

  Raises:
    ValueError: if cfg.TRAIN.OPTIMIZER is not recognized.
  """
  if cfg.TRAIN.OPTIMIZER == 'adam':
    optimizer = tf.train.AdamOptimizer(
        learning_rate,
        beta1=cfg.TRAIN.ADAM_BETA1,
        beta2=cfg.TRAIN.ADAM_BETA2,
        epsilon=cfg.TRAIN.OPT_EPSILON)
  elif cfg.TRAIN.OPTIMIZER == 'momentum':
    optimizer = tf.train.MomentumOptimizer(
        learning_rate,
        momentum=cfg.TRAIN.MOMENTUM,
        name='Momentum')
  elif cfg.TRAIN.OPTIMIZER == 'rmsprop':
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate,
        decay=cfg.TRAIN.RMSPROP_DECAY,
        momentum=cfg.TRAIN.MOMENTUM,
        epsilon=cfg.TRAIN.OPT_EPSILON)
  elif cfg.TRAIN.OPTIMIZER == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  else:
    # Format the message with `%`; passing the value as a second exception
    # argument leaves the `%s` placeholder unfilled in the reported error.
    raise ValueError(
        'Optimizer [%s] was not recognized' % cfg.TRAIN.OPTIMIZER)
  return optimizer
def _get_variables_to_train():
  """Returns the variables the optimizer should update.

  When `cfg.TRAIN.TRAINABLE_SCOPES` is the empty string every trainable
  variable is returned. Otherwise the setting is read as a comma-separated
  list of scopes, and the trainable variables collected under each scope are
  concatenated in the order the scopes are listed.

  Returns:
    A list of variables to train by the optimizer.
  """
  if cfg.TRAIN.TRAINABLE_SCOPES == '':
    return tf.trainable_variables()

  selected = []
  for scope_name in cfg.TRAIN.TRAINABLE_SCOPES.split(','):
    selected.extend(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                          scope_name.strip()))
  return selected
def _train_step(sess, train_op, global_step, train_step_kwargs):
  """Function that takes a gradient step and specifies whether to stop.

  With `cfg.TRAIN.ITER_SIZE == 1` a single `sess.run` of `train_op` is one
  step. Otherwise `train_op` is indexed by sub-iteration: entries
  `0 .. ITER_SIZE-2` are run first, and the last entry is run together with
  `global_step` to obtain the loss.

  Args:
    sess: The current session.
    train_op: An `Operation` that evaluates the gradients and returns the
      total loss, or (when ITER_SIZE > 1) a container of such ops indexed by
      sub-iteration number.
    global_step: A `Tensor` representing the global training step.
    train_step_kwargs: A dictionary of keyword arguments. The keys read here
      are 'should_trace', 'logdir', 'summary_writer', 'should_log' and
      'should_stop'.

  Returns:
    The total loss and a boolean indicating whether or not to stop training.

  Raises:
    ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
  """
  start_time = time.time()

  trace_run_options = None
  run_metadata = None
  if 'should_trace' in train_step_kwargs:
    if 'logdir' not in train_step_kwargs:
      raise ValueError('logdir must be present in train_step_kwargs when '
                       'should_trace is present')
    if sess.run(train_step_kwargs['should_trace']):
      # Imported here because the module never imports config_pb2; without
      # this the tracing path fails with a NameError.
      from tensorflow.core.protobuf import config_pb2
      trace_run_options = config_pb2.RunOptions(
          trace_level=config_pb2.RunOptions.FULL_TRACE)
      run_metadata = config_pb2.RunMetadata()

  if cfg.TRAIN.ITER_SIZE == 1:
    # To Debug, uncomment here and observe the end_points_debug
    total_loss, np_global_step = sess.run([train_op, global_step],
                                          options=trace_run_options,
                                          run_metadata=run_metadata)
  else:
    # Run all but the last sub-iteration; only the last one is run with
    # global_step and the tracing options.
    for j in range(cfg.TRAIN.ITER_SIZE-1):
      sess.run([train_op[j]])
    total_loss, np_global_step = sess.run([
      train_op[cfg.TRAIN.ITER_SIZE-1], global_step],
      options=trace_run_options,
      run_metadata=run_metadata)
  time_elapsed = time.time() - start_time

  if run_metadata is not None:
    # Same reason as above: file_io is not imported at module level.
    from tensorflow.python.lib.io import file_io
    tl = timeline.Timeline(run_metadata.step_stats)
    trace = tl.generate_chrome_trace_format()
    trace_filename = os.path.join(train_step_kwargs['logdir'],
                                  'tf_trace-%d.json' % np_global_step)
    tf.logging.info('Writing trace to %s', trace_filename)
    file_io.write_string_to_file(trace_filename, trace)
    if 'summary_writer' in train_step_kwargs:
      train_step_kwargs['summary_writer'].add_run_metadata(
          run_metadata, 'run_metadata-%d' % np_global_step)

  if 'should_log' in train_step_kwargs:
    if sess.run(train_step_kwargs['should_log']):
      tf.logging.info('%s: global step %d: loss = %.4f (%.2f sec/step)',
                      datetime.now(), np_global_step, total_loss, time_elapsed)

  if 'should_stop' in train_step_kwargs:
    should_stop = sess.run(train_step_kwargs['should_stop'])
  else:
    should_stop = False

  return total_loss, should_stop
'': kwargs['dataset_list_dir'] = cfg.DATASET_LIST_DIR if cfg.INPUT_FILE_STYLE_LABEL != '': kwargs['input_file_style_label'] = cfg.INPUT_FILE_STYLE_LABEL dataset, num_pose_keypoints = dataset_factory.get_dataset( cfg.DATASET_NAME, cfg.TRAIN.DATASET_SPLIT_NAME, cfg.DATASET_DIR, **kwargs) #################### # Select the network # #################### network_fn = nets_factory.get_network_fn( cfg.MODEL_NAME, num_classes=(dataset.num_classes), num_pose_keypoints=num_pose_keypoints, weight_decay=cfg.TRAIN.WEIGHT_DECAY, is_training=True, cfg=cfg) # advanced network creation controlled with cfg.NET ##################################### # Select the preprocessing function # ##################################### preprocessing_name = cfg.MODEL_NAME image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=True) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## with tf.device(deploy_config.inputs_device()): provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=cfg.NUM_READERS, common_queue_capacity=20 * cfg.TRAIN.BATCH_SIZE, common_queue_min=10 * cfg.TRAIN.BATCH_SIZE) [image, pose_label_hmap, pose_label_valid, action_label] = train_preprocess_pipeline( provider, cfg, network_fn, num_pose_keypoints, image_preprocessing_fn) # input_data = [preprocess_pipeline( # provider, cfg, network_fn, num_pose_keypoints, image_preprocessing_fn) # for _ in range(cfg.NUM_PREPROCESSING_THREADS)] images, pose_labels_hmap, pose_labels_valid, action_labels = tf.train.batch( [image, pose_label_hmap, pose_label_valid, action_label], # input_data, batch_size=cfg.TRAIN.BATCH_SIZE, num_threads=cfg.NUM_PREPROCESSING_THREADS, capacity=5 * cfg.TRAIN.BATCH_SIZE) batch_queue = slim.prefetch_queue.prefetch_queue( [images, pose_labels_hmap, pose_labels_valid, action_labels], capacity=5 * 
deploy_config.num_clones * cfg.TRAIN.ITER_SIZE) #################### # Define the model # #################### def clone_fn(batch_queue): """Allows data parallelism by creating multiple clones of network_fn.""" images, labels_pose, labels_pose_valid, labels_action = batch_queue.dequeue() # due to the multi-frame/video thing, need to squeeze first 2 dimensions labels_pose = tf.concat(tf.unstack(labels_pose), axis=0) labels_pose_valid = tf.concat(tf.unstack(labels_pose_valid), axis=0) logits, end_points = network_fn(images) pose_logits = end_points['PoseLogits'] ############################# # Specify the loss function # ############################# # if 'AuxLogits' in end_points: # slim.losses.softmax_cross_entropy( # end_points['AuxLogits'], labels, # label_smoothing=cfg.TRAIN.LABEL_SMOOTHING, weight=0.4, scope='aux_loss') # slim.losses.softmax_cross_entropy( # logits, labels, label_smoothing=cfg.TRAIN.LABEL_SMOOTHING, weight=1.0) end_points['Images'] = images end_points['PoseLabels'] = labels_pose end_points['ActionLabels'] = labels_action end_points['ActionLogits'] = logits tf.logging.info('PoseLogits shape is {}.'.format(pose_logits.get_shape().as_list())) gen_losses(labels_action, logits, cfg.TRAIN.LOSS_FN_ACTION, dataset.num_classes, cfg.TRAIN.LOSS_FN_ACTION_WT, labels_pose, pose_logits, cfg.TRAIN.LOSS_FN_POSE, labels_pose_valid, cfg.TRAIN.LOSS_FN_POSE_WT, end_points, cfg) return end_points # Gather initial summaries. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) first_clone_scope = deploy_config.clone_scope(0) # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by network_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) # Add summaries for end_points. 
end_points = clones[0].outputs # store the end points in a global variable for debugging in train_step global end_points_debug end_points_debug = end_points for end_point in end_points: x = end_points[end_point] summaries.add(tf.summary.histogram('activations/' + end_point, x)) # summaries.add(tf.summary.scalar(tf.nn.zero_fraction(x), # name='sparsity/' + end_point)) sum_img = tf.concat(tf.unstack(end_points['Images']), axis=0) if sum_img.get_shape().as_list()[-1] not in [1, 3, 4]: sum_img = tf.reduce_sum(sum_img, axis=-1, keep_dims=True) sum_img = sum_img - tf.reduce_min(sum_img) sum_img = sum_img / (tf.reduce_max(sum_img) + cfg.EPS) summaries.add(tf.summary.image('images', sum_img)) for epname in cfg.TRAIN.OTHER_IMG_SUMMARIES_TO_ADD: if epname in end_points: summaries.add(tf.summary.image('image_vis/' + epname, end_points[epname])) summaries = summaries.union(_summarize_heatmaps( 'labels', end_points['PoseLabels'], sum_img)) summaries = summaries.union(_summarize_heatmaps( 'pose', end_points['PoseLogits'], sum_img)) if 'PoseLossMask' in end_points: summaries = summaries.union(_summarize_heatmaps( 'loss_mask/pose', end_points['PoseLossMask'], sum_img)) # Add summaries for losses. for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): summaries.add(tf.summary.scalar(tensor=loss, name='losses/%s' % loss.op.name)) # Add summaries for variables. for variable in slim.get_model_variables(): summaries.add(tf.summary.histogram(variable.op.name, variable)) ################################# # Configure the moving averages # ################################# if cfg.TRAIN.MOVING_AVERAGE_VARIABLES: moving_average_variables = slim.get_model_variables() variable_averages = tf.train.ExponentialMovingAverage( cfg.TRAIN.MOVING_AVERAGE_VARIABLES, global_step) else: moving_average_variables, variable_averages = None, None ######################################### # Configure the optimization procedure. 
# ######################################### with tf.device(deploy_config.optimizer_device()): learning_rate = _configure_learning_rate(dataset.num_samples, num_clones, global_step) optimizer = _configure_optimizer(learning_rate) summaries.add(tf.summary.scalar(tensor=learning_rate, name='learning_rate')) # if cfg.sync_replicas: # # If sync_replicas is enabled, the averaging will be done in the chief # # queue runner. # optimizer = tf.train.SyncReplicasOptimizer( # opt=optimizer, # replicas_to_aggregate=, # variable_averages=variable_averages, # variables_to_average=moving_average_variables, # replica_id=tf.constant(cfg.task, tf.int32, shape=()), # total_num_replicas=cfg.worker_replicas) # elif cfg.moving_average_decay: # # Update ops executed locally by trainer. # update_ops.append(variable_averages.apply(moving_average_variables)) # Variables to train. variables_to_train = _get_variables_to_train() tf.logging.info('Training the following variables: {}'.format( ', '.join([var.op.name for var in variables_to_train]))) # and returns a train_tensor and summary_op total_loss, clones_gradients = model_deploy.optimize_clones( clones, optimizer, var_list=variables_to_train, clip_gradients=cfg.TRAIN.CLIP_GRADIENTS) # Add total_loss to summary. summaries.add(tf.summary.scalar(tensor=total_loss, name='total_loss')) # Create gradient updates. 
train_ops = {} if cfg.TRAIN.ITER_SIZE == 1: grad_updates = optimizer.apply_gradients(clones_gradients, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) train_tensor = control_flow_ops.with_dependencies([update_op], total_loss, name='train_op') train_ops = train_tensor else: with tf.name_scope('AccumulateGradients'): # copied as is from my previous code gvs = [(grad, var) for grad, var in clones_gradients] varnames = [var.name for grad, var in gvs] varname_to_var = {var.name: var for grad, var in gvs} varname_to_grad = {var.name: grad for grad, var in gvs} varname_to_ref_grad = {} for vn in varnames: grad = varname_to_grad[vn] print("accumulating ... ", (vn, grad.get_shape())) with tf.variable_scope("ref_grad"): with tf.device(deploy_config.variables_device()): ref_var = slim.local_variable( np.zeros(grad.get_shape(),dtype=np.float32), name=vn[:-2]) varname_to_ref_grad[vn] = ref_var all_assign_ref_op = [ref.assign(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()] all_assign_add_ref_op = [ref.assign_add(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()] assign_gradients_ref_op = tf.group(*all_assign_ref_op) accmulate_gradients_op = tf.group(*all_assign_add_ref_op) with tf.control_dependencies([accmulate_gradients_op]): final_gvs = [(varname_to_ref_grad[var.name] / float(cfg.TRAIN.ITER_SIZE), var) for grad, var in gvs] apply_gradients_op = optimizer.apply_gradients(final_gvs, global_step=global_step) update_ops.append(apply_gradients_op) update_op = tf.group(*update_ops) train_tensor = control_flow_ops.with_dependencies([update_op], total_loss, name='train_op') for i in range(cfg.TRAIN.ITER_SIZE): if i == 0: train_ops[i] = assign_gradients_ref_op elif i < cfg.TRAIN.ITER_SIZE - 1: # because apply_gradients also computes # (see control_dependency), so # no need of running an extra iteration train_ops[i] = accmulate_gradients_op else: train_ops[i] = train_tensor # Add the summaries from the first 
clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True config.intra_op_parallelism_threads = 4 # to avoid too many threads # The following seems optimal... though not sure config.inter_op_parallelism_threads = max( cfg.NUM_PREPROCESSING_THREADS, 12) ########################### # Kicks off the training. # ########################### slim.learning.train( train_ops, train_step_fn=_train_step, logdir=train_dir, master='', is_chief=True, init_fn=_get_init_fn(train_dir), summary_op=summary_op, number_of_steps=cfg.TRAIN.MAX_NUMBER_OF_STEPS, log_every_n_steps=cfg.TRAIN.LOG_EVERY_N_STEPS, save_summaries_secs=cfg.TRAIN.SAVE_SUMMARIES_SECS, save_interval_secs=cfg.TRAIN.SAVE_INTERVAL_SECS, sync_optimizer=None, session_config=config) if __name__ == '__main__': main() ================================================ FILE: utils/convert_mpii_result_for_eval.m ================================================ function convert(h5_file) outfpath = [h5_file '.mat']; TOTAL_ACT_IDS = 983; % = max([RELEASE.act.act_id]) DATA_DIR='../src/data/mpii/mpii_tfrecords'; sample_ids = dlmread(fullfile(DATA_DIR, 'test_ids.txt')); % scores = zeros(numel(sample_ids), numel(class_ids)); class_ids = {}; cid = 0; fid = fopen(fullfile(DATA_DIR, 'classes.txt'), 'r'); while ~feof(fid) line = fgetl(fid); cid = cid + 1; parts = strsplit(line, ';'); class_ids{cid} = parts{1}; % nums = cellfun(@str2num, strsplit(parts{2}, ',')); % cls_to_ids{cid} = nums; end scores = h5read(h5_file, '/logits')'; % for cid = 1 : numel(cls_to_ids) % targets = cls_to_ids{cid}; % for i = 1 : numel(targets) % scores(:, targets(i)) = logits(:, cid); % end % end save(outfpath, 'sample_ids', 'class_ids', 
# Runs the MATLAB function convert_mpii_result_for_eval on the given HDF5
# result file (first argument), at low scheduling priority.
if [ $# -lt 1 ]; then
  echo "Usage $0 <h5_file>" >&2
  # Stop here: falling through would launch MATLAB with an empty file name.
  exit 1
fi
nice -n 19 matlab -nodisplay -r "cd ../utils/; convert_mpii_result_for_eval('$1'); exit;"
""" return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values])) def image_to_tfexample(image_data, image_format, height, width, pose, # [x,y,is_vis,...] action_label): assert(len(pose) % (_NUM_JOINTS * 3) == 0) return tf.train.Example(features=tf.train.Features(feature={ 'image/encoded': bytes_feature(image_data), 'image/format': bytes_feature(image_format), 'image/class/pose': int64_feature([int(el) for el in pose]), 'image/class/action_label': int64_feature(action_label), 'image/height': int64_feature(height), 'image/width': int64_feature(width), })) def _get_dataset_filename(dataset_dir, split_name, shard_id): output_filename = 'mpii_%s_%05d-of-%05d.tfrecord' % ( split_name, shard_id, _NUM_SHARDS) return os.path.join(dataset_dir, output_filename) def _convert_dataset(split_name, list_to_write, dataset_dir): num_per_shard = int(math.ceil(len(list_to_write) / float(_NUM_SHARDS))) with tf.Graph().as_default(): image_reader = ImageReader() with tf.Session('') as sess: for shard_id in range(_NUM_SHARDS): output_filename = _get_dataset_filename( dataset_dir, split_name, shard_id) with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer: start_ndx = shard_id * num_per_shard end_ndx = min((shard_id+1) * num_per_shard, len(list_to_write)) for i in range(start_ndx, end_ndx): sys.stdout.write('\r>> Converting image %d/%d shard %d' % ( i+1, len(list_to_write), shard_id)) sys.stdout.flush() # Read the filename: fname = os.path.join(_IMG_DIR, list_to_write[i][0]) action_label = list_to_write[i][1] poses = list_to_write[i][2] all_joints = [] for pose in poses: joints = dict((el[0], [el[1], el[2], el[3]]) for el in pose) final_pose = [] for i in range(_NUM_JOINTS): if i in joints: final_pose.append(joints[i]) else: final_pose.append([-1, -1, 0]) final_pose = [item for sublist in final_pose for item in sublist] all_joints += final_pose assert(len(all_joints) % 16 == 0) image_data = tf.gfile.FastGFile(fname, 'r').read() height, width = 
image_reader.read_image_dims(sess, image_data) example = image_to_tfexample( image_data, 'jpg', height, width, all_joints, action_label) tfrecord_writer.write(example.SerializeToString()) sys.stdout.write('\n') sys.stdout.flush() def _get_action_class(cname, D, act_id): try: if cname not in D: D[cname] = (len(D.keys()), set([act_id])) # act_id is the actual MPII action id else: D[cname][1].add(act_id) # It's pretty crazy that same action will have multiple action IDs return D[cname][0] except Exception, e: print('Invalid class name {}. setting -1. {}'.format(cname, e)) return -1 def main(): T = scipy.io.loadmat(_MPII_MAT_FILE, squeeze_me=True, struct_as_record=False) annots = T['RELEASE'].annolist is_train = T['RELEASE'].img_train action_label = T['RELEASE'].act splits = ['train', 'val', 'test'] lists_to_write = {} img_id_in_split = {} all_imnames = [] for spl in splits: lists_to_write[spl] = [] img_id_in_split[spl] = [] splits_filenames = {} filename_to_split = {} actclassname_to_id = {} for spl in splits: with open(_SPLITS_PATH.format(spl), 'r') as fin: splits_filenames[spl] = fin.read().splitlines() filename_to_split.update(dict(zip( splits_filenames[spl], [spl] * len(splits_filenames[spl])))) for aid,annot in enumerate(annots): imname = annot.image.name all_imnames.append(imname) try: this_split = filename_to_split[imname[:-4]] except: continue # ignore this image points_fmted = [] # put all points one after the other if 'annorect' in dir(annot): all_rects = annot.annorect if isinstance(all_rects, scipy.io.matlab.mio5_params.mat_struct): all_rects = np.array([all_rects]) for rect in all_rects: points_rect = [] try: points = rect.annopoints.point except: continue if isinstance(points, scipy.io.matlab.mio5_params.mat_struct): points = np.array([points]) for point in points: try: is_visible = point.is_visible if point.is_visible in [1,0] else 0 except: is_visible = 0 points_rect.append((point.id, point.x, point.y, is_visible)) points_fmted.append(points_rect) 
[el.sort() for el in points_fmted] # the following assert is not true, so putting -1 when writing it out # assert(all([len(el) == 16 for el in points_fmted])) image_obj = (annot.image.name, _get_action_class(action_label[aid].act_name, actclassname_to_id, action_label[aid].act_id), points_fmted) if os.path.exists(os.path.join(_IMG_DIR, imname)): lists_to_write[this_split].append(image_obj) img_id_in_split[this_split].append(aid+1) # 1-indexed cls_ids = sorted(actclassname_to_id.items(), key=operator.itemgetter(1)) print('Total classes found: {}'.format(len(cls_ids))) #write out the dictionary of classnames with open(os.path.join(dataset_dir, 'classes.txt'), 'w') as fout: fout.write('\n'.join([el[0] + ';' + ','.join([ str(e) for e in list(el[1][1])]) for el in cls_ids])) if not tf.gfile.Exists(dataset_dir): tf.gfile.MakeDirs(dataset_dir) # Only randomize the train set random.seed(_RANDOM_SEED) train_ids = range(len(lists_to_write['train'])) random.shuffle(train_ids) lists_to_write['train'] = [lists_to_write['train'][i] for i in train_ids] img_id_in_split['train'] = [img_id_in_split['train'][i] for i in train_ids] with open(os.path.join(dataset_dir, 'imnames.txt'), 'w') as fout: fout.write('\n'.join(all_imnames)) for spl in splits: with open(os.path.join( dataset_dir, '{}_ids.txt'.format(spl)), 'w') as fout: fout.write('\n'.join([str(el) for el in img_id_in_split[spl]])) spl_name = spl if spl in ['train', 'val']: spl_name = 'trainval_' + spl # would be useful when training on tr+val print('Writing {} images for split {}.'.format( len(lists_to_write[spl]), spl)) _convert_dataset(spl_name, lists_to_write[spl], dataset_dir) print('\nFinished converting the MPII dataset!') if __name__ == '__main__': main()