Repository: rbgirshick/py-faster-rcnn Branch: master Commit: 781a917b378d Files: 142 Total size: 531.8 KB Directory structure: gitextract_dlq41wq8/ ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── data/ │ ├── .gitignore │ ├── README.md │ ├── pylintrc │ └── scripts/ │ ├── fetch_faster_rcnn_models.sh │ ├── fetch_imagenet_models.sh │ └── fetch_selective_search_data.sh ├── experiments/ │ ├── README.md │ ├── cfgs/ │ │ ├── faster_rcnn_alt_opt.yml │ │ └── faster_rcnn_end2end.yml │ ├── logs/ │ │ └── .gitignore │ └── scripts/ │ ├── fast_rcnn.sh │ ├── faster_rcnn_alt_opt.sh │ └── faster_rcnn_end2end.sh ├── lib/ │ ├── Makefile │ ├── datasets/ │ │ ├── VOCdevkit-matlab-wrapper/ │ │ │ ├── get_voc_opts.m │ │ │ ├── voc_eval.m │ │ │ └── xVOCap.m │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── ds_utils.py │ │ ├── factory.py │ │ ├── imdb.py │ │ ├── pascal_voc.py │ │ ├── tools/ │ │ │ └── mcg_munge.py │ │ └── voc_eval.py │ ├── fast_rcnn/ │ │ ├── __init__.py │ │ ├── bbox_transform.py │ │ ├── config.py │ │ ├── nms_wrapper.py │ │ ├── test.py │ │ └── train.py │ ├── nms/ │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── cpu_nms.pyx │ │ ├── gpu_nms.hpp │ │ ├── gpu_nms.pyx │ │ ├── nms_kernel.cu │ │ └── py_cpu_nms.py │ ├── pycocotools/ │ │ ├── UPSTREAM_REV │ │ ├── __init__.py │ │ ├── _mask.pyx │ │ ├── coco.py │ │ ├── cocoeval.py │ │ ├── license.txt │ │ ├── mask.py │ │ ├── maskApi.c │ │ └── maskApi.h │ ├── roi_data_layer/ │ │ ├── __init__.py │ │ ├── layer.py │ │ ├── minibatch.py │ │ └── roidb.py │ ├── rpn/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── anchor_target_layer.py │ │ ├── generate.py │ │ ├── generate_anchors.py │ │ ├── proposal_layer.py │ │ └── proposal_target_layer.py │ ├── setup.py │ ├── transform/ │ │ ├── __init__.py │ │ └── torch_image_transform_layer.py │ └── utils/ │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ └── timer.py ├── models/ │ ├── README.md │ ├── coco/ │ │ ├── VGG16/ │ │ │ ├── fast_rcnn/ │ │ │ │ ├── solver.prototxt │ │ │ │ ├── test.prototxt │ │ │ │ └── train.prototxt │ │ │ └── faster_rcnn_end2end/ │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ │ └── VGG_CNN_M_1024/ │ │ ├── fast_rcnn/ │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ │ └── faster_rcnn_end2end/ │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ └── pascal_voc/ │ ├── VGG16/ │ │ ├── fast_rcnn/ │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ │ ├── faster_rcnn_alt_opt/ │ │ │ ├── faster_rcnn_test.pt │ │ │ ├── rpn_test.pt │ │ │ ├── stage1_fast_rcnn_solver30k40k.pt │ │ │ ├── stage1_fast_rcnn_train.pt │ │ │ ├── stage1_rpn_solver60k80k.pt │ │ │ ├── stage1_rpn_train.pt │ │ │ ├── stage2_fast_rcnn_solver30k40k.pt │ │ │ ├── stage2_fast_rcnn_train.pt │ │ │ ├── stage2_rpn_solver60k80k.pt │ │ │ └── stage2_rpn_train.pt │ │ └── faster_rcnn_end2end/ │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── VGG_CNN_M_1024/ │ │ ├── fast_rcnn/ │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ │ ├── faster_rcnn_alt_opt/ │ │ │ ├── faster_rcnn_test.pt │ │ │ ├── rpn_test.pt │ │ │ ├── stage1_fast_rcnn_solver30k40k.pt │ │ │ ├── stage1_fast_rcnn_train.pt │ │ │ ├── stage1_rpn_solver60k80k.pt │ │ │ ├── stage1_rpn_train.pt │ │ │ ├── stage2_fast_rcnn_solver30k40k.pt │ │ │ ├── stage2_fast_rcnn_train.pt │ │ │ ├── stage2_rpn_solver60k80k.pt │ │ │ └── stage2_rpn_train.pt │ │ └── faster_rcnn_end2end/ │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ └── ZF/ │ ├── fast_rcnn/ │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── faster_rcnn_alt_opt/ │ │ ├── faster_rcnn_test.pt │ │ ├── rpn_test.pt │ │ ├── stage1_fast_rcnn_solver30k40k.pt │ │ ├── stage1_fast_rcnn_train.pt │ │ ├── stage1_rpn_solver60k80k.pt │ │ ├── stage1_rpn_train.pt │ │ ├── stage2_fast_rcnn_solver30k40k.pt │ │ ├── stage2_fast_rcnn_train.pt │ │ ├── stage2_rpn_solver60k80k.pt │ │ └── stage2_rpn_train.pt │ └── faster_rcnn_end2end/ │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt └── tools/ ├── README.md ├── _init_paths.py ├── compress_net.py ├── demo.py ├── eval_recall.py ├── reval.py ├── rpn_generate.py ├── test_net.py ├── train_faster_rcnn_alt_opt.py ├── train_net.py └── train_svms.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc .ipynb_checkpoints lib/build lib/pycocotools/_mask.c lib/pycocotools/_mask.so ================================================ FILE: .gitmodules ================================================ [submodule "caffe-fast-rcnn"] path = caffe-fast-rcnn url = https://github.com/rbgirshick/caffe-fast-rcnn.git branch = fast-rcnn ================================================ FILE: LICENSE ================================================ Faster R-CNN The MIT License (MIT) Copyright (c) 2015 Microsoft Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ************************************************************************ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION This project, Faster R-CNN, incorporates material from the project(s) listed below (collectively, "Third Party Code"). Microsoft is not the original author of the Third Party Code. The original copyright notice and license under which Microsoft received such Third Party Code are set out below. This Third Party Code is licensed to you under their original license terms set forth below. Microsoft reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. 1. Caffe, (https://github.com/BVLC/caffe/) COPYRIGHT All contributions by the University of California: Copyright (c) 2014, 2015, The Regents of the University of California (Regents) All rights reserved. All other contributions: Copyright (c) 2014, 2015, the respective contributors All rights reserved. Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. The BSD 2-Clause License Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** ================================================ FILE: README.md ================================================ # py-faster-rcnn has been deprecated. Please see [Detectron](https://github.com/facebookresearch/Detectron), which includes an implementation of [Mask R-CNN](https://arxiv.org/abs/1703.06870). ### Disclaimer The official Faster R-CNN code (written in MATLAB) is available [here](https://github.com/ShaoqingRen/faster_rcnn). If your goal is to reproduce the results in our NIPS 2015 paper, please use the [official code](https://github.com/ShaoqingRen/faster_rcnn). This repository contains a Python *reimplementation* of the MATLAB code. This Python implementation is built on a fork of [Fast R-CNN](https://github.com/rbgirshick/fast-rcnn). There are slight differences between the two implementations. In particular, this Python port - is ~10% slower at test-time, because some operations execute on the CPU in Python layers (e.g., 220ms / image vs. 200ms / image for VGG16) - gives similar, but not exactly the same, mAP as the MATLAB version - is *not compatible* with models trained using the MATLAB code due to the minor implementation differences - **includes approximate joint training** that is 1.5x faster than alternating optimization (for VGG16) -- see these [slides](https://www.dropbox.com/s/xtr4yd4i5e0vw8g/iccv15_tutorial_training_rbg.pdf?dl=0) for more information # *Faster* R-CNN: Towards Real-Time Object Detection with Region Proposal Networks By Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun (Microsoft Research) This Python implementation contains contributions from Sean Bell (Cornell) written during an MSR internship. Please see the official [README.md](https://github.com/ShaoqingRen/faster_rcnn/blob/master/README.md) for more details. Faster R-CNN was initially described in an [arXiv tech report](http://arxiv.org/abs/1506.01497) and was subsequently published in NIPS 2015. ### License Faster R-CNN is released under the MIT License (refer to the LICENSE file for details). ### Citing Faster R-CNN If you find Faster R-CNN useful in your research, please consider citing: @inproceedings{renNIPS15fasterrcnn, Author = {Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun}, Title = {Faster {R-CNN}: Towards Real-Time Object Detection with Region Proposal Networks}, Booktitle = {Advances in Neural Information Processing Systems ({NIPS})}, Year = {2015} } ### Contents 1. [Requirements: software](#requirements-software) 2. [Requirements: hardware](#requirements-hardware) 3. [Basic installation](#installation-sufficient-for-the-demo) 4. [Demo](#demo) 5. [Beyond the demo: training and testing](#beyond-the-demo-installation-for-training-and-testing-models) 6. [Usage](#usage) ### Requirements: software **NOTE** If you are having issues compiling and you are using a recent version of CUDA/cuDNN, please consult [this issue](https://github.com/rbgirshick/py-faster-rcnn/issues/509?_pjax=%23js-repo-pjax-container#issuecomment-284133868) for a workaround 1. Requirements for `Caffe` and `pycaffe` (see: [Caffe installation instructions](http://caffe.berkeleyvision.org/installation.html)) **Note:** Caffe *must* be built with support for Python layers! ```make # In your Makefile.config, make sure to have this line uncommented WITH_PYTHON_LAYER := 1 # Unrelatedly, it's also recommended that you use CUDNN USE_CUDNN := 1 ``` You can download my [Makefile.config](https://dl.dropboxusercontent.com/s/6joa55k64xo2h68/Makefile.config?dl=0) for reference. 2. Python packages you might not have: `cython`, `python-opencv`, `easydict` 3. [Optional] MATLAB is required for **official** PASCAL VOC evaluation only. The code now includes unofficial Python evaluation code. ### Requirements: hardware 1. For training smaller networks (ZF, VGG_CNN_M_1024) a good GPU (e.g., Titan, K20, K40, ...) with at least 3G of memory suffices 2. For training Fast R-CNN with VGG16, you'll need a K40 (~11G of memory) 3. For training the end-to-end version of Faster R-CNN with VGG16, 3G of GPU memory is sufficient (using CUDNN) ### Installation (sufficient for the demo) 1. Clone the Faster R-CNN repository ```Shell # Make sure to clone with --recursive git clone --recursive https://github.com/rbgirshick/py-faster-rcnn.git ``` 2. We'll call the directory that you cloned Faster R-CNN into `FRCN_ROOT` *Ignore notes 1 and 2 if you followed step 1 above.* **Note 1:** If you didn't clone Faster R-CNN with the `--recursive` flag, then you'll need to manually clone the `caffe-fast-rcnn` submodule: ```Shell git submodule update --init --recursive ``` **Note 2:** The `caffe-fast-rcnn` submodule needs to be on the `faster-rcnn` branch (or equivalent detached state). This will happen automatically *if you followed step 1 instructions*. 3. Build the Cython modules ```Shell cd $FRCN_ROOT/lib make ``` 4. Build Caffe and pycaffe ```Shell cd $FRCN_ROOT/caffe-fast-rcnn # Now follow the Caffe installation instructions here: # http://caffe.berkeleyvision.org/installation.html # If you're experienced with Caffe and have all of the requirements installed # and your Makefile.config in place, then simply do: make -j8 && make pycaffe ``` 5. Download pre-computed Faster R-CNN detectors ```Shell cd $FRCN_ROOT ./data/scripts/fetch_faster_rcnn_models.sh ``` This will populate the `$FRCN_ROOT/data` folder with `faster_rcnn_models`. See `data/README.md` for details. These models were trained on VOC 2007 trainval. ### Demo *After successfully completing [basic installation](#installation-sufficient-for-the-demo)*, you'll be ready to run the demo. To run the demo ```Shell cd $FRCN_ROOT ./tools/demo.py ``` The demo performs detection using a VGG16 network trained for detection on PASCAL VOC 2007. ### Beyond the demo: installation for training and testing models 1. Download the training, validation, test data and VOCdevkit ```Shell wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar ``` 2. Extract all of these tars into one directory named `VOCdevkit` ```Shell tar xvf VOCtrainval_06-Nov-2007.tar tar xvf VOCtest_06-Nov-2007.tar tar xvf VOCdevkit_08-Jun-2007.tar ``` 3. It should have this basic structure ```Shell $VOCdevkit/ # development kit $VOCdevkit/VOCcode/ # VOC utility code $VOCdevkit/VOC2007 # image sets, annotations, etc. # ... and several other directories ... ``` 4. Create symlinks for the PASCAL VOC dataset ```Shell cd $FRCN_ROOT/data ln -s $VOCdevkit VOCdevkit2007 ``` Using symlinks is a good idea because you will likely want to share the same PASCAL dataset installation between multiple projects. 5. [Optional] follow similar steps to get PASCAL VOC 2010 and 2012 6. [Optional] If you want to use COCO, please see some notes under `data/README.md` 7. Follow the next sections to download pre-trained ImageNet models ### Download pre-trained ImageNet models Pre-trained ImageNet models can be downloaded for the three networks described in the paper: ZF and VGG16. ```Shell cd $FRCN_ROOT ./data/scripts/fetch_imagenet_models.sh ``` VGG16 comes from the [Caffe Model Zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo), but is provided here for your convenience. ZF was trained at MSRA. ### Usage To train and test a Faster R-CNN detector using the **alternating optimization** algorithm from our NIPS 2015 paper, use `experiments/scripts/faster_rcnn_alt_opt.sh`. Output is written underneath `$FRCN_ROOT/output`. ```Shell cd $FRCN_ROOT ./experiments/scripts/faster_rcnn_alt_opt.sh [GPU_ID] [NET] [--set ...] # GPU_ID is the GPU you want to train on # NET in {ZF, VGG_CNN_M_1024, VGG16} is the network arch to use # --set ... allows you to specify fast_rcnn.config options, e.g. # --set EXP_DIR seed_rng1701 RNG_SEED 1701 ``` ("alt opt" refers to the alternating optimization training algorithm described in the NIPS paper.) To train and test a Faster R-CNN detector using the **approximate joint training** method, use `experiments/scripts/faster_rcnn_end2end.sh`. Output is written underneath `$FRCN_ROOT/output`. ```Shell cd $FRCN_ROOT ./experiments/scripts/faster_rcnn_end2end.sh [GPU_ID] [NET] [--set ...] # GPU_ID is the GPU you want to train on # NET in {ZF, VGG_CNN_M_1024, VGG16} is the network arch to use # --set ... allows you to specify fast_rcnn.config options, e.g. # --set EXP_DIR seed_rng1701 RNG_SEED 1701 ``` This method trains the RPN module jointly with the Fast R-CNN network, rather than alternating between training the two. It results in faster (~ 1.5x speedup) training times and similar detection accuracy. See these [slides](https://www.dropbox.com/s/xtr4yd4i5e0vw8g/iccv15_tutorial_training_rbg.pdf?dl=0) for more details. Artifacts generated by the scripts in `tools` are written in this directory. Trained Fast R-CNN networks are saved under: ``` output/// ``` Test outputs are saved under: ``` output//// ``` ================================================ FILE: data/.gitignore ================================================ selective_search* imagenet_models* fast_rcnn_models* VOCdevkit* cache ================================================ FILE: data/README.md ================================================ This directory holds (*after you download them*): - Caffe models pre-trained on ImageNet - Faster R-CNN models - Symlinks to datasets To download Caffe models (ZF, VGG16) pre-trained on ImageNet, run: ``` ./data/scripts/fetch_imagenet_models.sh ``` This script will populate `data/imagenet_models`. To download Faster R-CNN models trained on VOC 2007, run: ``` ./data/scripts/fetch_faster_rcnn_models.sh ``` This script will populate `data/faster_rcnn_models`. In order to train and test with PASCAL VOC, you will need to establish symlinks. From the `data` directory (`cd data`): ``` # For VOC 2007 ln -s /your/path/to/VOC2007/VOCdevkit VOCdevkit2007 # For VOC 2012 ln -s /your/path/to/VOC2012/VOCdevkit VOCdevkit2012 ``` Install the MS COCO dataset at /path/to/coco ``` ln -s /path/to/coco coco ``` For COCO with Fast R-CNN, place object proposals under `coco_proposals` (inside the `data` directory). You can obtain proposals on COCO from Jan Hosang at https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/object-recognition-and-scene-understanding/how-good-are-detection-proposals-really/. For COCO, using MCG is recommended over selective search. MCG boxes can be downloaded from http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/. Use the tool `lib/datasets/tools/mcg_munge.py` to convert the downloaded MCG data into the same file layout as those from Jan Hosang. Since you'll likely be experimenting with multiple installs of Fast/er R-CNN in parallel, you'll probably want to keep all of this data in a shared place and use symlinks. On my system I create the following symlinks inside `data`: Annotations for the 5k image 'minival' subset of COCO val2014 that I like to use can be found at https://dl.dropboxusercontent.com/s/o43o90bna78omob/instances_minival2014.json.zip?dl=0. Annotations for COCO val2014 (set) minus minival (~35k images) can be found at https://dl.dropboxusercontent.com/s/s3tw5zcg7395368/instances_valminusminival2014.json.zip?dl=0. ``` # data/cache holds various outputs created by the datasets package ln -s /data/fast_rcnn_shared/cache # move the imagenet_models to shared location and symlink to them ln -s /data/fast_rcnn_shared/imagenet_models # move the selective search data to a shared location and symlink to them # (only applicable to Fast R-CNN training) ln -s /data/fast_rcnn_shared/selective_search_data ln -s /data/VOC2007/VOCdevkit VOCdevkit2007 ln -s /data/VOC2012/VOCdevkit VOCdevkit2012 ``` ================================================ FILE: data/pylintrc ================================================ [TYPECHECK] ignored-modules = numpy, numpy.random, cv2 ================================================ FILE: data/scripts/fetch_faster_rcnn_models.sh ================================================ #!/bin/bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" cd $DIR FILE=faster_rcnn_models.tgz URL=https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0 CHECKSUM=ac116844f66aefe29587214272054668 if [ -f $FILE ]; then echo "File already exists. Checking md5..." os=`uname -s` if [ "$os" = "Linux" ]; then checksum=`md5sum $FILE | awk '{ print $1 }'` elif [ "$os" = "Darwin" ]; then checksum=`cat $FILE | md5` fi if [ "$checksum" = "$CHECKSUM" ]; then echo "Checksum is correct. No need to download." exit 0 else echo "Checksum is incorrect. Need to download again." fi fi echo "Downloading Faster R-CNN demo models (695M)..." wget $URL -O $FILE echo "Unzipping..." tar zxvf $FILE echo "Done. Please run this command again to verify that checksum = $CHECKSUM." ================================================ FILE: data/scripts/fetch_imagenet_models.sh ================================================ #!/bin/bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" cd $DIR FILE=imagenet_models.tgz URL=https://dl.dropbox.com/s/gstw7122padlf0l/imagenet_models.tgz?dl=0 CHECKSUM=ed34ca912d6782edfb673a8c3a0bda6d if [ -f $FILE ]; then echo "File already exists. Checking md5..." os=`uname -s` if [ "$os" = "Linux" ]; then checksum=`md5sum $FILE | awk '{ print $1 }'` elif [ "$os" = "Darwin" ]; then checksum=`cat $FILE | md5` fi if [ "$checksum" = "$CHECKSUM" ]; then echo "Checksum is correct. No need to download." exit 0 else echo "Checksum is incorrect. Need to download again." fi fi echo "Downloading pretrained ImageNet models (1G)..." wget $URL -O $FILE echo "Unzipping..." tar zxvf $FILE echo "Done. Please run this command again to verify that checksum = $CHECKSUM." ================================================ FILE: data/scripts/fetch_selective_search_data.sh ================================================ #!/bin/bash DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" cd $DIR FILE=selective_search_data.tgz URL=https://dl.dropboxusercontent.com/s/orrt7o6bp6ae0tc/selective_search_data.tgz?dl=0 CHECKSUM=7078c1db87a7851b31966b96774cd9b9 if [ -f $FILE ]; then echo "File already exists. Checking md5..." os=`uname -s` if [ "$os" = "Linux" ]; then checksum=`md5sum $FILE | awk '{ print $1 }'` elif [ "$os" = "Darwin" ]; then checksum=`cat $FILE | md5` fi if [ "$checksum" = "$CHECKSUM" ]; then echo "Checksum is correct. No need to download." exit 0 else echo "Checksum is incorrect. Need to download again." fi fi echo "Downloading precomputed selective search boxes (0.5G)..." wget $URL -O $FILE echo "Unzipping..." tar zxvf $FILE echo "Done. Please run this command again to verify that checksum = $CHECKSUM." ================================================ FILE: experiments/README.md ================================================ Scripts are under `experiments/scripts`. Each script saves a log file under `experiments/logs`. Configuration override files used in the experiments are stored in `experiments/cfgs`. ================================================ FILE: experiments/cfgs/faster_rcnn_alt_opt.yml ================================================ EXP_DIR: faster_rcnn_alt_opt TRAIN: BG_THRESH_LO: 0.0 TEST: HAS_RPN: True ================================================ FILE: experiments/cfgs/faster_rcnn_end2end.yml ================================================ EXP_DIR: faster_rcnn_end2end TRAIN: HAS_RPN: True IMS_PER_BATCH: 1 BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True RPN_POSITIVE_OVERLAP: 0.7 RPN_BATCHSIZE: 256 PROPOSAL_METHOD: gt BG_THRESH_LO: 0.0 TEST: HAS_RPN: True ================================================ FILE: experiments/logs/.gitignore ================================================ *.txt* ================================================ FILE: experiments/scripts/fast_rcnn.sh ================================================ #!/bin/bash # Usage: # ./experiments/scripts/fast_rcnn.sh GPU NET DATASET [options args to {train,test}_net.py] # DATASET is either pascal_voc or coco. # # Example: # ./experiments/scripts/fast_rcnn.sh 0 VGG_CNN_M_1024 pascal_voc \ # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" set -x set -e export PYTHONUNBUFFERED="True" GPU_ID=$1 NET=$2 NET_lc=${NET,,} DATASET=$3 array=( $@ ) len=${#array[@]} EXTRA_ARGS=${array[@]:3:$len} EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} case $DATASET in pascal_voc) TRAIN_IMDB="voc_2007_trainval" TEST_IMDB="voc_2007_test" PT_DIR="pascal_voc" ITERS=40000 ;; coco) TRAIN_IMDB="coco_2014_train" TEST_IMDB="coco_2014_minival" PT_DIR="coco" ITERS=280000 ;; *) echo "No dataset given" exit ;; esac LOG="experiments/logs/fast_rcnn_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" exec &> >(tee -a "$LOG") echo Logging output to "$LOG" time ./tools/train_net.py --gpu ${GPU_ID} \ --solver models/${PT_DIR}/${NET}/fast_rcnn/solver.prototxt \ --weights data/imagenet_models/${NET}.v2.caffemodel \ --imdb ${TRAIN_IMDB} \ --iters ${ITERS} \ ${EXTRA_ARGS} set +x NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` set -x time ./tools/test_net.py --gpu ${GPU_ID} \ --def models/${PT_DIR}/${NET}/fast_rcnn/test.prototxt \ --net ${NET_FINAL} \ --imdb ${TEST_IMDB} \ ${EXTRA_ARGS} ================================================ FILE: experiments/scripts/faster_rcnn_alt_opt.sh ================================================ #!/bin/bash # Usage: # ./experiments/scripts/faster_rcnn_alt_opt.sh GPU NET DATASET [options args to {train,test}_net.py] # DATASET is only pascal_voc for now # # Example: # ./experiments/scripts/faster_rcnn_alt_opt.sh 0 VGG_CNN_M_1024 pascal_voc \ # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" set -x set -e export PYTHONUNBUFFERED="True" GPU_ID=$1 NET=$2 NET_lc=${NET,,} DATASET=$3 array=( $@ ) len=${#array[@]} EXTRA_ARGS=${array[@]:3:$len} EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} case $DATASET in pascal_voc) TRAIN_IMDB="voc_2007_trainval" TEST_IMDB="voc_2007_test" PT_DIR="pascal_voc" ITERS=40000 ;; coco) echo "Not implemented: use experiments/scripts/faster_rcnn_end2end.sh for coco" exit ;; *) echo "No dataset given" exit ;; esac LOG="experiments/logs/faster_rcnn_alt_opt_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" exec &> >(tee -a "$LOG") echo Logging output to "$LOG" time ./tools/train_faster_rcnn_alt_opt.py --gpu ${GPU_ID} \ --net_name ${NET} \ --weights data/imagenet_models/${NET}.v2.caffemodel \ --imdb ${TRAIN_IMDB} \ --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ ${EXTRA_ARGS} set +x NET_FINAL=`grep "Final model:" ${LOG} | awk '{print $3}'` set -x time ./tools/test_net.py --gpu ${GPU_ID} \ --def models/${PT_DIR}/${NET}/faster_rcnn_alt_opt/faster_rcnn_test.pt \ --net ${NET_FINAL} \ --imdb ${TEST_IMDB} \ --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ ${EXTRA_ARGS} ================================================ FILE: experiments/scripts/faster_rcnn_end2end.sh ================================================ #!/bin/bash # Usage: # ./experiments/scripts/faster_rcnn_end2end.sh GPU NET DATASET [options args to {train,test}_net.py] # DATASET is either pascal_voc or coco. # # Example: # ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG_CNN_M_1024 pascal_voc \ # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" set -x set -e export PYTHONUNBUFFERED="True" GPU_ID=$1 NET=$2 NET_lc=${NET,,} DATASET=$3 array=( $@ ) len=${#array[@]} EXTRA_ARGS=${array[@]:3:$len} EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} case $DATASET in pascal_voc) TRAIN_IMDB="voc_2007_trainval" TEST_IMDB="voc_2007_test" PT_DIR="pascal_voc" ITERS=70000 ;; coco) # This is a very long and slow training schedule # You can probably use fewer iterations and reduce the # time to the LR drop (set in the solver to 350,000 iterations). TRAIN_IMDB="coco_2014_train" TEST_IMDB="coco_2014_minival" PT_DIR="coco" ITERS=490000 ;; *) echo "No dataset given" exit ;; esac LOG="experiments/logs/faster_rcnn_end2end_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" exec &> >(tee -a "$LOG") echo Logging output to "$LOG" time ./tools/train_net.py --gpu ${GPU_ID} \ --solver models/${PT_DIR}/${NET}/faster_rcnn_end2end/solver.prototxt \ --weights data/imagenet_models/${NET}.v2.caffemodel \ --imdb ${TRAIN_IMDB} \ --iters ${ITERS} \ --cfg experiments/cfgs/faster_rcnn_end2end.yml \ ${EXTRA_ARGS} set +x NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` set -x time ./tools/test_net.py --gpu ${GPU_ID} \ --def models/${PT_DIR}/${NET}/faster_rcnn_end2end/test.prototxt \ --net ${NET_FINAL} \ --imdb ${TEST_IMDB} \ --cfg experiments/cfgs/faster_rcnn_end2end.yml \ ${EXTRA_ARGS} ================================================ FILE: lib/Makefile ================================================ all: python setup.py build_ext --inplace rm -rf build ================================================ FILE: lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m ================================================ function VOCopts = get_voc_opts(path) tmp = pwd; cd(path); try addpath('VOCcode'); VOCinit; catch rmpath('VOCcode'); cd(tmp); error(sprintf('VOCcode directory not found under %s', path)); end rmpath('VOCcode'); cd(tmp); ================================================ FILE: lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m ================================================ function res = voc_eval(path, comp_id, test_set, output_dir) VOCopts = get_voc_opts(path); VOCopts.testset = test_set; for i = 1:length(VOCopts.classes) cls = VOCopts.classes{i}; res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); end fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); fprintf('Results:\n'); aps = [res(:).ap]'; fprintf('%.1f\n', aps * 100); fprintf('%.1f\n', mean(aps) * 100); fprintf('~~~~~~~~~~~~~~~~~~~~\n'); function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) test_set = VOCopts.testset; year = VOCopts.dataset(4:end); addpath(fullfile(VOCopts.datadir, 'VOCcode')); res_fn = sprintf(VOCopts.detrespath, comp_id, cls); recall = []; prec = []; ap = 0; ap_auc = 0; do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); if do_eval % Bug in VOCevaldet requires that tic has been called first tic; [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); ap_auc = xVOCap(recall, prec); % force plot limits ylim([0 1]); xlim([0 1]); print(gcf, '-djpeg', '-r0', ... [output_dir '/' cls '_pr.jpg']); end fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); res.recall = recall; res.prec = prec; res.ap = ap; res.ap_auc = ap_auc; save([output_dir '/' cls '_pr.mat'], ... 'res', 'recall', 'prec', 'ap', 'ap_auc'); rmpath(fullfile(VOCopts.datadir, 'VOCcode')); ================================================ FILE: lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m ================================================ function ap = xVOCap(rec,prec) % From the PASCAL VOC 2011 devkit mrec=[0 ; rec ; 1]; mpre=[0 ; prec ; 0]; for i=numel(mpre)-1:-1:1 mpre(i)=max(mpre(i),mpre(i+1)); end i=find(mrec(2:end)~=mrec(1:end-1))+1; ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); ================================================ FILE: lib/datasets/__init__.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- ================================================ FILE: lib/datasets/coco.py ================================================ # -------------------------------------------------------- # Fast/er R-CNN # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- from datasets.imdb import imdb import datasets.ds_utils as ds_utils from fast_rcnn.config import cfg import os.path as osp import sys import os import numpy as np import scipy.sparse import scipy.io as sio import cPickle import json import uuid # COCO API from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from pycocotools import mask as COCOmask def _filter_crowd_proposals(roidb, crowd_thresh): """ Finds proposals that are inside crowd regions and marks them with overlap = -1 (for all gt rois), which means they will be excluded from training. """ for ix, entry in enumerate(roidb): overlaps = entry['gt_overlaps'].toarray() crowd_inds = np.where(overlaps.max(axis=1) == -1)[0] non_gt_inds = np.where(entry['gt_classes'] == 0)[0] if len(crowd_inds) == 0 or len(non_gt_inds) == 0: continue iscrowd = [int(True) for _ in xrange(len(crowd_inds))] crowd_boxes = ds_utils.xyxy_to_xywh(entry['boxes'][crowd_inds, :]) non_gt_boxes = ds_utils.xyxy_to_xywh(entry['boxes'][non_gt_inds, :]) ious = COCOmask.iou(non_gt_boxes, crowd_boxes, iscrowd) bad_inds = np.where(ious.max(axis=1) > crowd_thresh)[0] overlaps[non_gt_inds[bad_inds], :] = -1 roidb[ix]['gt_overlaps'] = scipy.sparse.csr_matrix(overlaps) return roidb class coco(imdb): def __init__(self, image_set, year): imdb.__init__(self, 'coco_' + year + '_' + image_set) # COCO specific config options self.config = {'top_k' : 2000, 'use_salt' : True, 'cleanup' : True, 'crowd_thresh' : 0.7, 'min_size' : 2} # name, paths self._year = year self._image_set = image_set self._data_path = osp.join(cfg.DATA_DIR, 'coco') # load COCO API, classes, class <-> id mappings self._COCO = COCO(self._get_ann_file()) cats = self._COCO.loadCats(self._COCO.getCatIds()) self._classes = tuple(['__background__'] + [c['name'] for c in cats]) self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes))) self._class_to_coco_cat_id = dict(zip([c['name'] for c in cats], self._COCO.getCatIds())) self._image_index = self._load_image_set_index() # Default to roidb handler self.set_proposal_method('selective_search') self.competition_mode(False) # Some image sets are "views" (i.e. subsets) into others. # For example, minival2014 is a random 5000 image subset of val2014. # This mapping tells us where the view's images and proposals come from. self._view_map = { 'minival2014' : 'val2014', # 5k val2014 subset 'valminusminival2014' : 'val2014', # val2014 \setminus minival2014 } coco_name = image_set + year # e.g., "val2014" self._data_name = (self._view_map[coco_name] if self._view_map.has_key(coco_name) else coco_name) # Dataset splits that have ground-truth annotations (test splits # do not have gt annotations) self._gt_splits = ('train', 'val', 'minival') def _get_ann_file(self): prefix = 'instances' if self._image_set.find('test') == -1 \ else 'image_info' return osp.join(self._data_path, 'annotations', prefix + '_' + self._image_set + self._year + '.json') def _load_image_set_index(self): """ Load image ids. """ image_ids = self._COCO.getImgIds() return image_ids def _get_widths(self): anns = self._COCO.loadImgs(self._image_index) widths = [ann['width'] for ann in anns] return widths def image_path_at(self, i): """ Return the absolute path to image i in the image sequence. """ return self.image_path_from_index(self._image_index[i]) def image_path_from_index(self, index): """ Construct an image path from the image's "index" identifier. """ # Example image path for index=119993: # images/train2014/COCO_train2014_000000119993.jpg file_name = ('COCO_' + self._data_name + '_' + str(index).zfill(12) + '.jpg') image_path = osp.join(self._data_path, 'images', self._data_name, file_name) assert osp.exists(image_path), \ 'Path does not exist: {}'.format(image_path) return image_path def selective_search_roidb(self): return self._roidb_from_proposals('selective_search') def edge_boxes_roidb(self): return self._roidb_from_proposals('edge_boxes_AR') def mcg_roidb(self): return self._roidb_from_proposals('MCG') def _roidb_from_proposals(self, method): """ Creates a roidb from pre-computed proposals of a particular methods. """ top_k = self.config['top_k'] cache_file = osp.join(self.cache_path, self.name + '_{:s}_top{:d}'.format(method, top_k) + '_roidb.pkl') if osp.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) print '{:s} {:s} roidb loaded from {:s}'.format(self.name, method, cache_file) return roidb if self._image_set in self._gt_splits: gt_roidb = self.gt_roidb() method_roidb = self._load_proposals(method, gt_roidb) roidb = imdb.merge_roidbs(gt_roidb, method_roidb) # Make sure we don't use proposals that are contained in crowds roidb = _filter_crowd_proposals(roidb, self.config['crowd_thresh']) else: roidb = self._load_proposals(method, None) with open(cache_file, 'wb') as fid: cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) print 'wrote {:s} roidb to {:s}'.format(method, cache_file) return roidb def _load_proposals(self, method, gt_roidb): """ Load pre-computed proposals in the format provided by Jan Hosang: http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- computing/research/object-recognition-and-scene-understanding/how- good-are-detection-proposals-really/ For MCG, use boxes from http://www.eecs.berkeley.edu/Research/Projects/ CS/vision/grouping/mcg/ and convert the file layout using lib/datasets/tools/mcg_munge.py. """ box_list = [] top_k = self.config['top_k'] valid_methods = [ 'MCG', 'selective_search', 'edge_boxes_AR', 'edge_boxes_70'] assert method in valid_methods print 'Loading {} boxes'.format(method) for i, index in enumerate(self._image_index): if i % 1000 == 0: print '{:d} / {:d}'.format(i + 1, len(self._image_index)) box_file = osp.join( cfg.DATA_DIR, 'coco_proposals', method, 'mat', self._get_box_file(index)) raw_data = sio.loadmat(box_file)['boxes'] boxes = np.maximum(raw_data - 1, 0).astype(np.uint16) if method == 'MCG': # Boxes from the MCG website are in (y1, x1, y2, x2) order boxes = boxes[:, (1, 0, 3, 2)] # Remove duplicate boxes and very small boxes and then take top k keep = ds_utils.unique_boxes(boxes) boxes = boxes[keep, :] keep = ds_utils.filter_small_boxes(boxes, self.config['min_size']) boxes = boxes[keep, :] boxes = boxes[:top_k, :] box_list.append(boxes) # Sanity check im_ann = self._COCO.loadImgs(index)[0] width = im_ann['width'] height = im_ann['height'] ds_utils.validate_boxes(boxes, width=width, height=height) return self.create_roidb_from_box_list(box_list, gt_roidb) def gt_roidb(self): """ Return the database of ground-truth regions of interest. This function loads/saves from/to a cache file to speed up future calls. """ cache_file = osp.join(self.cache_path, self.name + '_gt_roidb.pkl') if osp.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) print '{} gt roidb loaded from {}'.format(self.name, cache_file) return roidb gt_roidb = [self._load_coco_annotation(index) for index in self._image_index] with open(cache_file, 'wb') as fid: cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) print 'wrote gt roidb to {}'.format(cache_file) return gt_roidb def _load_coco_annotation(self, index): """ Loads COCO bounding-box instance annotations. Crowd instances are handled by marking their overlaps (with all categories) to -1. This overlap value means that crowd "instances" are excluded from training. """ im_ann = self._COCO.loadImgs(index)[0] width = im_ann['width'] height = im_ann['height'] annIds = self._COCO.getAnnIds(imgIds=index, iscrowd=None) objs = self._COCO.loadAnns(annIds) # Sanitize bboxes -- some are invalid valid_objs = [] for obj in objs: x1 = np.max((0, obj['bbox'][0])) y1 = np.max((0, obj['bbox'][1])) x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) if obj['area'] > 0 and x2 >= x1 and y2 >= y1: obj['clean_bbox'] = [x1, y1, x2, y2] valid_objs.append(obj) objs = valid_objs num_objs = len(objs) boxes = np.zeros((num_objs, 4), dtype=np.uint16) gt_classes = np.zeros((num_objs), dtype=np.int32) overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) seg_areas = np.zeros((num_objs), dtype=np.float32) # Lookup table to map from COCO category ids to our internal class # indices coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], self._class_to_ind[cls]) for cls in self._classes[1:]]) for ix, obj in enumerate(objs): cls = coco_cat_id_to_class_ind[obj['category_id']] boxes[ix, :] = obj['clean_bbox'] gt_classes[ix] = cls seg_areas[ix] = obj['area'] if obj['iscrowd']: # Set overlap to -1 for all classes for crowd objects # so they will be excluded during training overlaps[ix, :] = -1.0 else: overlaps[ix, cls] = 1.0 ds_utils.validate_boxes(boxes, width=width, height=height) overlaps = scipy.sparse.csr_matrix(overlaps) return {'boxes' : boxes, 'gt_classes': gt_classes, 'gt_overlaps' : overlaps, 'flipped' : False, 'seg_areas' : seg_areas} def _get_box_file(self, index): # first 14 chars / first 22 chars / all chars + .mat # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat file_name = ('COCO_' + self._data_name + '_' + str(index).zfill(12) + '.mat') return osp.join(file_name[:14], file_name[:22], file_name) def _print_detection_eval_metrics(self, coco_eval): IoU_lo_thresh = 0.5 IoU_hi_thresh = 0.95 def _get_thr_ind(coco_eval, thr): ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & (coco_eval.params.iouThrs < thr + 1e-5))[0][0] iou_thr = coco_eval.params.iouThrs[ind] assert np.isclose(iou_thr, thr) return ind ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) # precision has dims (iou, recall, cls, area range, max dets) # area range index 0: all area ranges # max dets index 2: 100 per image precision = \ coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] ap_default = np.mean(precision[precision > -1]) print ('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' '~~~~').format(IoU_lo_thresh, IoU_hi_thresh) print '{:.1f}'.format(100 * ap_default) for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue # minus 1 because of __background__ precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] ap = np.mean(precision[precision > -1]) print '{:.1f}'.format(100 * ap) print '~~~~ Summary metrics ~~~~' coco_eval.summarize() def _do_detection_eval(self, res_file, output_dir): ann_type = 'bbox' coco_dt = self._COCO.loadRes(res_file) coco_eval = COCOeval(self._COCO, coco_dt) coco_eval.params.useSegm = (ann_type == 'segm') coco_eval.evaluate() coco_eval.accumulate() self._print_detection_eval_metrics(coco_eval) eval_file = osp.join(output_dir, 'detection_results.pkl') with open(eval_file, 'wb') as fid: cPickle.dump(coco_eval, fid, cPickle.HIGHEST_PROTOCOL) print 'Wrote COCO eval results to: {}'.format(eval_file) def _coco_results_one_category(self, boxes, cat_id): results = [] for im_ind, index in enumerate(self.image_index): dets = boxes[im_ind].astype(np.float) if dets == []: continue scores = dets[:, -1] xs = dets[:, 0] ys = dets[:, 1] ws = dets[:, 2] - xs + 1 hs = dets[:, 3] - ys + 1 results.extend( [{'image_id' : index, 'category_id' : cat_id, 'bbox' : [xs[k], ys[k], ws[k], hs[k]], 'score' : scores[k]} for k in xrange(dets.shape[0])]) return results def _write_coco_results_file(self, all_boxes, res_file): # [{"image_id": 42, # "category_id": 18, # "bbox": [258.15,41.29,348.26,243.78], # "score": 0.236}, ...] results = [] for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue print 'Collecting {} results ({:d}/{:d})'.format(cls, cls_ind, self.num_classes - 1) coco_cat_id = self._class_to_coco_cat_id[cls] results.extend(self._coco_results_one_category(all_boxes[cls_ind], coco_cat_id)) print 'Writing results json to {}'.format(res_file) with open(res_file, 'w') as fid: json.dump(results, fid) def evaluate_detections(self, all_boxes, output_dir): res_file = osp.join(output_dir, ('detections_' + self._image_set + self._year + '_results')) if self.config['use_salt']: res_file += '_{}'.format(str(uuid.uuid4())) res_file += '.json' self._write_coco_results_file(all_boxes, res_file) # Only do evaluation on non-test sets if self._image_set.find('test') == -1: self._do_detection_eval(res_file, output_dir) # Optionally cleanup results json file if self.config['cleanup']: os.remove(res_file) def competition_mode(self, on): if on: self.config['use_salt'] = False self.config['cleanup'] = False else: self.config['use_salt'] = True self.config['cleanup'] = True ================================================ FILE: lib/datasets/ds_utils.py ================================================ # -------------------------------------------------------- # Fast/er R-CNN # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import numpy as np def unique_boxes(boxes, scale=1.0): """Return indices of unique boxes.""" v = np.array([1, 1e3, 1e6, 1e9]) hashes = np.round(boxes * scale).dot(v) _, index = np.unique(hashes, return_index=True) return np.sort(index) def xywh_to_xyxy(boxes): """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) def xyxy_to_xywh(boxes): """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) def validate_boxes(boxes, width=0, height=0): """Check that a set of boxes are valid.""" x1 = boxes[:, 0] y1 = boxes[:, 1] x2 = boxes[:, 2] y2 = boxes[:, 3] assert (x1 >= 0).all() assert (y1 >= 0).all() assert (x2 >= x1).all() assert (y2 >= y1).all() assert (x2 < width).all() assert (y2 < height).all() def filter_small_boxes(boxes, min_size): w = boxes[:, 2] - boxes[:, 0] h = boxes[:, 3] - boxes[:, 1] keep = np.where((w >= min_size) & (h > min_size))[0] return keep ================================================ FILE: lib/datasets/factory.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Factory method for easily getting imdbs by name.""" __sets = {} from datasets.pascal_voc import pascal_voc from datasets.coco import coco import numpy as np # Set up voc__ using selective search "fast" mode for year in ['2007', '2012']: for split in ['train', 'val', 'trainval', 'test']: name = 'voc_{}_{}'.format(year, split) __sets[name] = (lambda split=split, year=year: pascal_voc(split, year)) # Set up coco_2014_ for year in ['2014']: for split in ['train', 'val', 'minival', 'valminusminival']: name = 'coco_{}_{}'.format(year, split) __sets[name] = (lambda split=split, year=year: coco(split, year)) # Set up coco_2015_ for year in ['2015']: for split in ['test', 'test-dev']: name = 'coco_{}_{}'.format(year, split) __sets[name] = (lambda split=split, year=year: coco(split, year)) def get_imdb(name): """Get an imdb (image database) by name.""" if not __sets.has_key(name): raise KeyError('Unknown dataset: {}'.format(name)) return __sets[name]() def list_imdbs(): """List all registered imdbs.""" return __sets.keys() ================================================ FILE: lib/datasets/imdb.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import os import os.path as osp import PIL from utils.cython_bbox import bbox_overlaps import numpy as np import scipy.sparse from fast_rcnn.config import cfg class imdb(object): """Image database.""" def __init__(self, name): self._name = name self._num_classes = 0 self._classes = [] self._image_index = [] self._obj_proposer = 'selective_search' self._roidb = None self._roidb_handler = self.default_roidb # Use this dict for storing dataset specific config options self.config = {} @property def name(self): return self._name @property def num_classes(self): return len(self._classes) @property def classes(self): return self._classes @property def image_index(self): return self._image_index @property def roidb_handler(self): return self._roidb_handler @roidb_handler.setter def roidb_handler(self, val): self._roidb_handler = val def set_proposal_method(self, method): method = eval('self.' + method + '_roidb') self.roidb_handler = method @property def roidb(self): # A roidb is a list of dictionaries, each with the following keys: # boxes # gt_overlaps # gt_classes # flipped if self._roidb is not None: return self._roidb self._roidb = self.roidb_handler() return self._roidb @property def cache_path(self): cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) if not os.path.exists(cache_path): os.makedirs(cache_path) return cache_path @property def num_images(self): return len(self.image_index) def image_path_at(self, i): raise NotImplementedError def default_roidb(self): raise NotImplementedError def evaluate_detections(self, all_boxes, output_dir=None): """ all_boxes is a list of length number-of-classes. Each list element is a list of length number-of-images. Each of those list elements is either an empty list [] or a numpy array of detection. all_boxes[class][image] = [] or np.array of shape #dets x 5 """ raise NotImplementedError def _get_widths(self): return [PIL.Image.open(self.image_path_at(i)).size[0] for i in xrange(self.num_images)] def append_flipped_images(self): num_images = self.num_images widths = self._get_widths() for i in xrange(num_images): boxes = self.roidb[i]['boxes'].copy() oldx1 = boxes[:, 0].copy() oldx2 = boxes[:, 2].copy() boxes[:, 0] = widths[i] - oldx2 - 1 boxes[:, 2] = widths[i] - oldx1 - 1 assert (boxes[:, 2] >= boxes[:, 0]).all() entry = {'boxes' : boxes, 'gt_overlaps' : self.roidb[i]['gt_overlaps'], 'gt_classes' : self.roidb[i]['gt_classes'], 'flipped' : True} self.roidb.append(entry) self._image_index = self._image_index * 2 def evaluate_recall(self, candidate_boxes=None, thresholds=None, area='all', limit=None): """Evaluate detection proposal recall metrics. Returns: results: dictionary of results with keys 'ar': average recall 'recalls': vector recalls at each IoU overlap threshold 'thresholds': vector of IoU overlap thresholds 'gt_overlaps': vector of all ground-truth overlaps """ # Record max overlap value for each gt box # Return vector of overlap values areas = { 'all': 0, 'small': 1, 'medium': 2, 'large': 3, '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7} area_ranges = [ [0**2, 1e5**2], # all [0**2, 32**2], # small [32**2, 96**2], # medium [96**2, 1e5**2], # large [96**2, 128**2], # 96-128 [128**2, 256**2], # 128-256 [256**2, 512**2], # 256-512 [512**2, 1e5**2], # 512-inf ] assert areas.has_key(area), 'unknown area range: {}'.format(area) area_range = area_ranges[areas[area]] gt_overlaps = np.zeros(0) num_pos = 0 for i in xrange(self.num_images): # Checking for max_overlaps == 1 avoids including crowd annotations # (...pretty hacking :/) max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1) gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0] gt_boxes = self.roidb[i]['boxes'][gt_inds, :] gt_areas = self.roidb[i]['seg_areas'][gt_inds] valid_gt_inds = np.where((gt_areas >= area_range[0]) & (gt_areas <= area_range[1]))[0] gt_boxes = gt_boxes[valid_gt_inds, :] num_pos += len(valid_gt_inds) if candidate_boxes is None: # If candidate_boxes is not supplied, the default is to use the # non-ground-truth boxes from this roidb non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0] boxes = self.roidb[i]['boxes'][non_gt_inds, :] else: boxes = candidate_boxes[i] if boxes.shape[0] == 0: continue if limit is not None and boxes.shape[0] > limit: boxes = boxes[:limit, :] overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float)) _gt_overlaps = np.zeros((gt_boxes.shape[0])) for j in xrange(gt_boxes.shape[0]): # find which proposal box maximally covers each gt box argmax_overlaps = overlaps.argmax(axis=0) # and get the iou amount of coverage for each gt box max_overlaps = overlaps.max(axis=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ind = max_overlaps.argmax() gt_ovr = max_overlaps.max() assert(gt_ovr >= 0) # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert(_gt_overlaps[j] == gt_ovr) # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) gt_overlaps = np.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = np.arange(0.5, 0.95 + 1e-5, step) recalls = np.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, 'gt_overlaps': gt_overlaps} def create_roidb_from_box_list(self, box_list, gt_roidb): assert len(box_list) == self.num_images, \ 'Number of boxes must match number of ground-truth images' roidb = [] for i in xrange(self.num_images): boxes = box_list[i] num_boxes = boxes.shape[0] overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: gt_boxes = gt_roidb[i]['boxes'] gt_classes = gt_roidb[i]['gt_classes'] gt_overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float)) argmaxes = gt_overlaps.argmax(axis=1) maxes = gt_overlaps.max(axis=1) I = np.where(maxes > 0)[0] overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] overlaps = scipy.sparse.csr_matrix(overlaps) roidb.append({ 'boxes' : boxes, 'gt_classes' : np.zeros((num_boxes,), dtype=np.int32), 'gt_overlaps' : overlaps, 'flipped' : False, 'seg_areas' : np.zeros((num_boxes,), dtype=np.float32), }) return roidb @staticmethod def merge_roidbs(a, b): assert len(a) == len(b) for i in xrange(len(a)): a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], b[i]['gt_classes'])) a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], b[i]['gt_overlaps']]) a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'], b[i]['seg_areas'])) return a def competition_mode(self, on): """Turn competition mode on or off.""" pass ================================================ FILE: lib/datasets/pascal_voc.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import os from datasets.imdb import imdb import datasets.ds_utils as ds_utils import xml.etree.ElementTree as ET import numpy as np import scipy.sparse import scipy.io as sio import utils.cython_bbox import cPickle import subprocess import uuid from voc_eval import voc_eval from fast_rcnn.config import cfg class pascal_voc(imdb): def __init__(self, image_set, year, devkit_path=None): imdb.__init__(self, 'voc_' + year + '_' + image_set) self._year = year self._image_set = image_set self._devkit_path = self._get_default_path() if devkit_path is None \ else devkit_path self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) self._classes = ('__background__', # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes))) self._image_ext = '.jpg' self._image_index = self._load_image_set_index() # Default to roidb handler self._roidb_handler = self.selective_search_roidb self._salt = str(uuid.uuid4()) self._comp_id = 'comp4' # PASCAL specific config options self.config = {'cleanup' : True, 'use_salt' : True, 'use_diff' : False, 'matlab_eval' : False, 'rpn_file' : None, 'min_size' : 2} assert os.path.exists(self._devkit_path), \ 'VOCdevkit path does not exist: {}'.format(self._devkit_path) assert os.path.exists(self._data_path), \ 'Path does not exist: {}'.format(self._data_path) def image_path_at(self, i): """ Return the absolute path to image i in the image sequence. """ return self.image_path_from_index(self._image_index[i]) def image_path_from_index(self, index): """ Construct an image path from the image's "index" identifier. """ image_path = os.path.join(self._data_path, 'JPEGImages', index + self._image_ext) assert os.path.exists(image_path), \ 'Path does not exist: {}'.format(image_path) return image_path def _load_image_set_index(self): """ Load the indexes listed in this dataset's image set file. """ # Example path to image set file: # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', self._image_set + '.txt') assert os.path.exists(image_set_file), \ 'Path does not exist: {}'.format(image_set_file) with open(image_set_file) as f: image_index = [x.strip() for x in f.readlines()] return image_index def _get_default_path(self): """ Return the default path where PASCAL VOC is expected to be installed. """ return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) def gt_roidb(self): """ Return the database of ground-truth regions of interest. This function loads/saves from/to a cache file to speed up future calls. """ cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) print '{} gt roidb loaded from {}'.format(self.name, cache_file) return roidb gt_roidb = [self._load_pascal_annotation(index) for index in self.image_index] with open(cache_file, 'wb') as fid: cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) print 'wrote gt roidb to {}'.format(cache_file) return gt_roidb def selective_search_roidb(self): """ Return the database of selective search regions of interest. Ground-truth ROIs are also included. This function loads/saves from/to a cache file to speed up future calls. """ cache_file = os.path.join(self.cache_path, self.name + '_selective_search_roidb.pkl') if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) print '{} ss roidb loaded from {}'.format(self.name, cache_file) return roidb if int(self._year) == 2007 or self._image_set != 'test': gt_roidb = self.gt_roidb() ss_roidb = self._load_selective_search_roidb(gt_roidb) roidb = imdb.merge_roidbs(gt_roidb, ss_roidb) else: roidb = self._load_selective_search_roidb(None) with open(cache_file, 'wb') as fid: cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) print 'wrote ss roidb to {}'.format(cache_file) return roidb def rpn_roidb(self): if int(self._year) == 2007 or self._image_set != 'test': gt_roidb = self.gt_roidb() rpn_roidb = self._load_rpn_roidb(gt_roidb) roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) else: roidb = self._load_rpn_roidb(None) return roidb def _load_rpn_roidb(self, gt_roidb): filename = self.config['rpn_file'] print 'loading {}'.format(filename) assert os.path.exists(filename), \ 'rpn data not found at: {}'.format(filename) with open(filename, 'rb') as f: box_list = cPickle.load(f) return self.create_roidb_from_box_list(box_list, gt_roidb) def _load_selective_search_roidb(self, gt_roidb): filename = os.path.abspath(os.path.join(cfg.DATA_DIR, 'selective_search_data', self.name + '.mat')) assert os.path.exists(filename), \ 'Selective search data not found at: {}'.format(filename) raw_data = sio.loadmat(filename)['boxes'].ravel() box_list = [] for i in xrange(raw_data.shape[0]): boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 keep = ds_utils.unique_boxes(boxes) boxes = boxes[keep, :] keep = ds_utils.filter_small_boxes(boxes, self.config['min_size']) boxes = boxes[keep, :] box_list.append(boxes) return self.create_roidb_from_box_list(box_list, gt_roidb) def _load_pascal_annotation(self, index): """ Load image and bounding boxes info from XML file in the PASCAL VOC format. """ filename = os.path.join(self._data_path, 'Annotations', index + '.xml') tree = ET.parse(filename) objs = tree.findall('object') if not self.config['use_diff']: # Exclude the samples labeled as difficult non_diff_objs = [ obj for obj in objs if int(obj.find('difficult').text) == 0] # if len(non_diff_objs) != len(objs): # print 'Removed {} difficult objects'.format( # len(objs) - len(non_diff_objs)) objs = non_diff_objs num_objs = len(objs) boxes = np.zeros((num_objs, 4), dtype=np.uint16) gt_classes = np.zeros((num_objs), dtype=np.int32) overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) # "Seg" area for pascal is just the box area seg_areas = np.zeros((num_objs), dtype=np.float32) # Load object bounding boxes into a data frame. for ix, obj in enumerate(objs): bbox = obj.find('bndbox') # Make pixel indexes 0-based x1 = float(bbox.find('xmin').text) - 1 y1 = float(bbox.find('ymin').text) - 1 x2 = float(bbox.find('xmax').text) - 1 y2 = float(bbox.find('ymax').text) - 1 cls = self._class_to_ind[obj.find('name').text.lower().strip()] boxes[ix, :] = [x1, y1, x2, y2] gt_classes[ix] = cls overlaps[ix, cls] = 1.0 seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) overlaps = scipy.sparse.csr_matrix(overlaps) return {'boxes' : boxes, 'gt_classes': gt_classes, 'gt_overlaps' : overlaps, 'flipped' : False, 'seg_areas' : seg_areas} def _get_comp_id(self): comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] else self._comp_id) return comp_id def _get_voc_results_file_template(self): # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' path = os.path.join( self._devkit_path, 'results', 'VOC' + self._year, 'Main', filename) return path def _write_voc_results_file(self, all_boxes): for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue print 'Writing {} VOC results file'.format(cls) filename = self._get_voc_results_file_template().format(cls) with open(filename, 'wt') as f: for im_ind, index in enumerate(self.image_index): dets = all_boxes[cls_ind][im_ind] if dets == []: continue # the VOCdevkit expects 1-based indices for k in xrange(dets.shape[0]): f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. format(index, dets[k, -1], dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1)) def _do_python_eval(self, output_dir = 'output'): annopath = os.path.join( self._devkit_path, 'VOC' + self._year, 'Annotations', '{:s}.xml') imagesetfile = os.path.join( self._devkit_path, 'VOC' + self._year, 'ImageSets', 'Main', self._image_set + '.txt') cachedir = os.path.join(self._devkit_path, 'annotations_cache') aps = [] # The PASCAL VOC metric changed in 2010 use_07_metric = True if int(self._year) < 2010 else False print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No') if not os.path.isdir(output_dir): os.mkdir(output_dir) for i, cls in enumerate(self._classes): if cls == '__background__': continue filename = self._get_voc_results_file_template().format(cls) rec, prec, ap = voc_eval( filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, use_07_metric=use_07_metric) aps += [ap] print('AP for {} = {:.4f}'.format(cls, ap)) with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f: cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) print('Mean AP = {:.4f}'.format(np.mean(aps))) print('~~~~~~~~') print('Results:') for ap in aps: print('{:.3f}'.format(ap)) print('{:.3f}'.format(np.mean(aps))) print('~~~~~~~~') print('') print('--------------------------------------------------------------') print('Results computed with the **unofficial** Python eval code.') print('Results should be very close to the official MATLAB eval code.') print('Recompute with `./tools/reval.py --matlab ...` for your paper.') print('-- Thanks, The Management') print('--------------------------------------------------------------') def _do_matlab_eval(self, output_dir='output'): print '-----------------------------------------------------' print 'Computing results with the official MATLAB eval code.' print '-----------------------------------------------------' path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 'VOCdevkit-matlab-wrapper') cmd = 'cd {} && '.format(path) cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) cmd += '-r "dbstop if error; ' cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ .format(self._devkit_path, self._get_comp_id(), self._image_set, output_dir) print('Running:\n{}'.format(cmd)) status = subprocess.call(cmd, shell=True) def evaluate_detections(self, all_boxes, output_dir): self._write_voc_results_file(all_boxes) self._do_python_eval(output_dir) if self.config['matlab_eval']: self._do_matlab_eval(output_dir) if self.config['cleanup']: for cls in self._classes: if cls == '__background__': continue filename = self._get_voc_results_file_template().format(cls) os.remove(filename) def competition_mode(self, on): if on: self.config['use_salt'] = False self.config['cleanup'] = False else: self.config['use_salt'] = True self.config['cleanup'] = True if __name__ == '__main__': from datasets.pascal_voc import pascal_voc d = pascal_voc('trainval', '2007') res = d.roidb from IPython import embed; embed() ================================================ FILE: lib/datasets/tools/mcg_munge.py ================================================ import os import sys """Hacky tool to convert file system layout of MCG boxes downloaded from http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/ so that it's consistent with those computed by Jan Hosang (see: http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- computing/research/object-recognition-and-scene-understanding/how- good-are-detection-proposals-really/) NB: Boxes from the MCG website are in (y1, x1, y2, x2) order. Boxes from Hosang et al. are in (x1, y1, x2, y2) order. """ def munge(src_dir): # stored as: ./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat # want: ./MCG/mat/COCO_val2014_0/COCO_val2014_000000141/COCO_val2014_000000141334.mat files = os.listdir(src_dir) for fn in files: base, ext = os.path.splitext(fn) # first 14 chars / first 22 chars / all chars + .mat # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat first = base[:14] second = base[:22] dst_dir = os.path.join('MCG', 'mat', first, second) if not os.path.exists(dst_dir): os.makedirs(dst_dir) src = os.path.join(src_dir, fn) dst = os.path.join(dst_dir, fn) print 'MV: {} -> {}'.format(src, dst) os.rename(src, dst) if __name__ == '__main__': # src_dir should look something like: # src_dir = 'MCG-COCO-val2014-boxes' src_dir = sys.argv[1] munge(src_dir) ================================================ FILE: lib/datasets/voc_eval.py ================================================ # -------------------------------------------------------- # Fast/er R-CNN # Licensed under The MIT License [see LICENSE for details] # Written by Bharath Hariharan # -------------------------------------------------------- import xml.etree.ElementTree as ET import os import cPickle import numpy as np def parse_rec(filename): """ Parse a PASCAL VOC xml file """ tree = ET.parse(filename) objects = [] for obj in tree.findall('object'): obj_struct = {} obj_struct['name'] = obj.find('name').text obj_struct['pose'] = obj.find('pose').text obj_struct['truncated'] = int(obj.find('truncated').text) obj_struct['difficult'] = int(obj.find('difficult').text) bbox = obj.find('bndbox') obj_struct['bbox'] = [int(bbox.find('xmin').text), int(bbox.find('ymin').text), int(bbox.find('xmax').text), int(bbox.find('ymax').text)] objects.append(obj_struct) return objects def voc_ap(rec, prec, use_07_metric=False): """ ap = voc_ap(rec, prec, [use_07_metric]) Compute VOC AP given precision and recall. If use_07_metric is true, uses the VOC 07 11 point method (default:False). """ if use_07_metric: # 11 point metric ap = 0. for t in np.arange(0., 1.1, 0.1): if np.sum(rec >= t) == 0: p = 0 else: p = np.max(prec[rec >= t]) ap = ap + p / 11. else: # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], rec, [1.])) mpre = np.concatenate(([0.], prec, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap def voc_eval(detpath, annopath, imagesetfile, classname, cachedir, ovthresh=0.5, use_07_metric=False): """rec, prec, ap = voc_eval(detpath, annopath, imagesetfile, classname, [ovthresh], [use_07_metric]) Top level function that does the PASCAL VOC evaluation. detpath: Path to detections detpath.format(classname) should produce the detection results file. annopath: Path to annotations annopath.format(imagename) should be the xml annotations file. imagesetfile: Text file containing the list of images, one image per line. classname: Category name (duh) cachedir: Directory for caching the annotations [ovthresh]: Overlap threshold (default = 0.5) [use_07_metric]: Whether to use VOC07's 11 point AP computation (default False) """ # assumes detections are in detpath.format(classname) # assumes annotations are in annopath.format(imagename) # assumes imagesetfile is a text file with each line an image name # cachedir caches the annotations in a pickle file # first load gt if not os.path.isdir(cachedir): os.mkdir(cachedir) cachefile = os.path.join(cachedir, 'annots.pkl') # read list of images with open(imagesetfile, 'r') as f: lines = f.readlines() imagenames = [x.strip() for x in lines] if not os.path.isfile(cachefile): # load annots recs = {} for i, imagename in enumerate(imagenames): recs[imagename] = parse_rec(annopath.format(imagename)) if i % 100 == 0: print 'Reading annotation for {:d}/{:d}'.format( i + 1, len(imagenames)) # save print 'Saving cached annotations to {:s}'.format(cachefile) with open(cachefile, 'w') as f: cPickle.dump(recs, f) else: # load with open(cachefile, 'r') as f: recs = cPickle.load(f) # extract gt objects for this class class_recs = {} npos = 0 for imagename in imagenames: R = [obj for obj in recs[imagename] if obj['name'] == classname] bbox = np.array([x['bbox'] for x in R]) difficult = np.array([x['difficult'] for x in R]).astype(np.bool) det = [False] * len(R) npos = npos + sum(~difficult) class_recs[imagename] = {'bbox': bbox, 'difficult': difficult, 'det': det} # read dets detfile = detpath.format(classname) with open(detfile, 'r') as f: lines = f.readlines() splitlines = [x.strip().split(' ') for x in lines] image_ids = [x[0] for x in splitlines] confidence = np.array([float(x[1]) for x in splitlines]) BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) # sort by confidence sorted_ind = np.argsort(-confidence) sorted_scores = np.sort(-confidence) BB = BB[sorted_ind, :] image_ids = [image_ids[x] for x in sorted_ind] # go down dets and mark TPs and FPs nd = len(image_ids) tp = np.zeros(nd) fp = np.zeros(nd) for d in range(nd): R = class_recs[image_ids[d]] bb = BB[d, :].astype(float) ovmax = -np.inf BBGT = R['bbox'].astype(float) if BBGT.size > 0: # compute overlaps # intersection ixmin = np.maximum(BBGT[:, 0], bb[0]) iymin = np.maximum(BBGT[:, 1], bb[1]) ixmax = np.minimum(BBGT[:, 2], bb[2]) iymax = np.minimum(BBGT[:, 3], bb[3]) iw = np.maximum(ixmax - ixmin + 1., 0.) ih = np.maximum(iymax - iymin + 1., 0.) inters = iw * ih # union uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) overlaps = inters / uni ovmax = np.max(overlaps) jmax = np.argmax(overlaps) if ovmax > ovthresh: if not R['difficult'][jmax]: if not R['det'][jmax]: tp[d] = 1. R['det'][jmax] = 1 else: fp[d] = 1. else: fp[d] = 1. # compute precision recall fp = np.cumsum(fp) tp = np.cumsum(tp) rec = tp / float(npos) # avoid divide by zero in case the first detection matches a difficult # ground truth prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) ap = voc_ap(rec, prec, use_07_metric) return rec, prec, ap ================================================ FILE: lib/fast_rcnn/__init__.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- ================================================ FILE: lib/fast_rcnn/bbox_transform.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import numpy as np def bbox_transform(ex_rois, gt_rois): ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights targets_dw = np.log(gt_widths / ex_widths) targets_dh = np.log(gt_heights / ex_heights) targets = np.vstack( (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() return targets def bbox_transform_inv(boxes, deltas): if boxes.shape[0] == 0: return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) boxes = boxes.astype(deltas.dtype, copy=False) widths = boxes[:, 2] - boxes[:, 0] + 1.0 heights = boxes[:, 3] - boxes[:, 1] + 1.0 ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights dx = deltas[:, 0::4] dy = deltas[:, 1::4] dw = deltas[:, 2::4] dh = deltas[:, 3::4] pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] pred_w = np.exp(dw) * widths[:, np.newaxis] pred_h = np.exp(dh) * heights[:, np.newaxis] pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) # x1 pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # y1 pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # x2 pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # y2 pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h return pred_boxes def clip_boxes(boxes, im_shape): """ Clip boxes to image boundaries. """ # x1 >= 0 boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) # y1 >= 0 boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) # x2 < im_shape[1] boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) # y2 < im_shape[0] boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) return boxes ================================================ FILE: lib/fast_rcnn/config.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Fast R-CNN config system. This file specifies default config options for Fast R-CNN. You should not change values in this file. Instead, you should write a config file (in yaml) and use cfg_from_file(yaml_file) to load it and override the default options. Most tools in $ROOT/tools take a --cfg option to specify an override file. - See tools/{train,test}_net.py for example code that uses cfg_from_file() - See experiments/cfgs/*.yml for example YAML config override files """ import os import os.path as osp import numpy as np # `pip install easydict` if you don't have it from easydict import EasyDict as edict __C = edict() # Consumers can get config by: # from fast_rcnn_config import cfg cfg = __C # # Training options # __C.TRAIN = edict() # Scales to use during training (can list multiple scales) # Each scale is the pixel size of an image's shortest side __C.TRAIN.SCALES = (600,) # Max pixel size of the longest side of a scaled input image __C.TRAIN.MAX_SIZE = 1000 # Images to use per minibatch __C.TRAIN.IMS_PER_BATCH = 2 # Minibatch size (number of regions of interest [ROIs]) __C.TRAIN.BATCH_SIZE = 128 # Fraction of minibatch that is labeled foreground (i.e. class > 0) __C.TRAIN.FG_FRACTION = 0.25 # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) __C.TRAIN.FG_THRESH = 0.5 # Overlap threshold for a ROI to be considered background (class = 0 if # overlap in [LO, HI)) __C.TRAIN.BG_THRESH_HI = 0.5 __C.TRAIN.BG_THRESH_LO = 0.1 # Use horizontally-flipped images during training? __C.TRAIN.USE_FLIPPED = True # Train bounding-box regressors __C.TRAIN.BBOX_REG = True # Overlap required between a ROI and ground-truth box in order for that ROI to # be used as a bounding-box regression training example __C.TRAIN.BBOX_THRESH = 0.5 # Iterations between snapshots __C.TRAIN.SNAPSHOT_ITERS = 10000 # solver.prototxt specifies the snapshot path prefix, this adds an optional # infix to yield the path: [_]_iters_XYZ.caffemodel __C.TRAIN.SNAPSHOT_INFIX = '' # Use a prefetch thread in roi_data_layer.layer # So far I haven't found this useful; likely more engineering work is required __C.TRAIN.USE_PREFETCH = False # Normalize the targets (subtract empirical mean, divide by empirical stddev) __C.TRAIN.BBOX_NORMALIZE_TARGETS = True # Deprecated (inside weights) __C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) # Normalize the targets using "precomputed" (or made up) means and stdevs # (BBOX_NORMALIZE_TARGETS must also be True) __C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = False __C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) __C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) # Train using these proposals __C.TRAIN.PROPOSAL_METHOD = 'selective_search' # Make minibatches from images that have similar aspect ratios (i.e. both # tall and thin or both short and wide) in order to avoid wasting computation # on zero-padding. __C.TRAIN.ASPECT_GROUPING = True # Use RPN to detect objects __C.TRAIN.HAS_RPN = False # IOU >= thresh: positive example __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 # IOU < thresh: negative example __C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 # If an anchor statisfied by positive and negative conditions set to negative __C.TRAIN.RPN_CLOBBER_POSITIVES = False # Max number of foreground examples __C.TRAIN.RPN_FG_FRACTION = 0.5 # Total number of examples __C.TRAIN.RPN_BATCHSIZE = 256 # NMS threshold used on RPN proposals __C.TRAIN.RPN_NMS_THRESH = 0.7 # Number of top scoring boxes to keep before apply NMS to RPN proposals __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 # Number of top scoring boxes to keep after applying NMS to RPN proposals __C.TRAIN.RPN_POST_NMS_TOP_N = 2000 # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) __C.TRAIN.RPN_MIN_SIZE = 16 # Deprecated (outside weights) __C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) # Give the positive RPN examples weight of p * 1 / {num positives} # and give negatives a weight of (1 - p) # Set to -1.0 to use uniform example weighting __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 # # Testing options # __C.TEST = edict() # Scales to use during testing (can list multiple scales) # Each scale is the pixel size of an image's shortest side __C.TEST.SCALES = (600,) # Max pixel size of the longest side of a scaled input image __C.TEST.MAX_SIZE = 1000 # Overlap threshold used for non-maximum suppression (suppress boxes with # IoU >= this threshold) __C.TEST.NMS = 0.3 # Experimental: treat the (K+1) units in the cls_score layer as linear # predictors (trained, eg, with one-vs-rest SVMs). __C.TEST.SVM = False # Test using bounding-box regressors __C.TEST.BBOX_REG = True # Propose boxes __C.TEST.HAS_RPN = False # Test using these proposals __C.TEST.PROPOSAL_METHOD = 'selective_search' ## NMS threshold used on RPN proposals __C.TEST.RPN_NMS_THRESH = 0.7 ## Number of top scoring boxes to keep before apply NMS to RPN proposals __C.TEST.RPN_PRE_NMS_TOP_N = 6000 ## Number of top scoring boxes to keep after applying NMS to RPN proposals __C.TEST.RPN_POST_NMS_TOP_N = 300 # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) __C.TEST.RPN_MIN_SIZE = 16 # # MISC # # The mapping from image coordinates to feature map coordinates might cause # some boxes that are distinct in image space to become identical in feature # coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor # for identifying duplicate boxes. # 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 __C.DEDUP_BOXES = 1./16. # Pixel mean values (BGR order) as a (1, 1, 3) array # We use the same pixel mean for all networks even though it's not exactly what # they were trained with __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) # For reproducibility __C.RNG_SEED = 3 # A small number that's used many times __C.EPS = 1e-14 # Root directory of project __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) # Data directory __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) # Model directory __C.MODELS_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'models', 'pascal_voc')) # Name (or path to) the matlab executable __C.MATLAB = 'matlab' # Place outputs under an experiments directory __C.EXP_DIR = 'default' # Use GPU implementation of non-maximum suppression __C.USE_GPU_NMS = True # Default GPU device id __C.GPU_ID = 0 def get_output_dir(imdb, net=None): """Return the directory where experimental artifacts are placed. If the directory does not exist, it is created. A canonical path is built using the name from an imdb and a network (if not None). """ outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) if net is not None: outdir = osp.join(outdir, net.name) if not os.path.exists(outdir): os.makedirs(outdir) return outdir def _merge_a_into_b(a, b): """Merge config dictionary a into config dictionary b, clobbering the options in b whenever they are also specified in a. """ if type(a) is not edict: return for k, v in a.iteritems(): # a must specify keys that are in b if not b.has_key(k): raise KeyError('{} is not a valid config key'.format(k)) # the types must match, too old_type = type(b[k]) if old_type is not type(v): if isinstance(b[k], np.ndarray): v = np.array(v, dtype=b[k].dtype) else: raise ValueError(('Type mismatch ({} vs. {}) ' 'for config key: {}').format(type(b[k]), type(v), k)) # recursively merge dicts if type(v) is edict: try: _merge_a_into_b(a[k], b[k]) except: print('Error under config key: {}'.format(k)) raise else: b[k] = v def cfg_from_file(filename): """Load a config file and merge it into the default options.""" import yaml with open(filename, 'r') as f: yaml_cfg = edict(yaml.load(f)) _merge_a_into_b(yaml_cfg, __C) def cfg_from_list(cfg_list): """Set config keys via list (e.g., from command line).""" from ast import literal_eval assert len(cfg_list) % 2 == 0 for k, v in zip(cfg_list[0::2], cfg_list[1::2]): key_list = k.split('.') d = __C for subkey in key_list[:-1]: assert d.has_key(subkey) d = d[subkey] subkey = key_list[-1] assert d.has_key(subkey) try: value = literal_eval(v) except: # handle the case when v is a string literal value = v assert type(value) == type(d[subkey]), \ 'type {} does not match original type {}'.format( type(value), type(d[subkey])) d[subkey] = value ================================================ FILE: lib/fast_rcnn/nms_wrapper.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- from fast_rcnn.config import cfg from nms.gpu_nms import gpu_nms from nms.cpu_nms import cpu_nms def nms(dets, thresh, force_cpu=False): """Dispatch to either CPU or GPU NMS implementations.""" if dets.shape[0] == 0: return [] if cfg.USE_GPU_NMS and not force_cpu: return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) else: return cpu_nms(dets, thresh) ================================================ FILE: lib/fast_rcnn/test.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Test a Fast R-CNN network on an imdb (image database).""" from fast_rcnn.config import cfg, get_output_dir from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv import argparse from utils.timer import Timer import numpy as np import cv2 import caffe from fast_rcnn.nms_wrapper import nms import cPickle from utils.blob import im_list_to_blob import os def _get_image_blob(im): """Converts an image into a network input. Arguments: im (ndarray): a color image in BGR order Returns: blob (ndarray): a data blob holding an image pyramid im_scale_factors (list): list of image scales (relative to im) used in the image pyramid """ im_orig = im.astype(np.float32, copy=True) im_orig -= cfg.PIXEL_MEANS im_shape = im_orig.shape im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) processed_ims = [] im_scale_factors = [] for target_size in cfg.TEST.SCALES: im_scale = float(target_size) / float(im_size_min) # Prevent the biggest axis from being more than MAX_SIZE if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) im_scale_factors.append(im_scale) processed_ims.append(im) # Create a blob to hold the input images blob = im_list_to_blob(processed_ims) return blob, np.array(im_scale_factors) def _get_rois_blob(im_rois, im_scale_factors): """Converts RoIs into network inputs. Arguments: im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates im_scale_factors (list): scale factors as returned by _get_image_blob Returns: blob (ndarray): R x 5 matrix of RoIs in the image pyramid """ rois, levels = _project_im_rois(im_rois, im_scale_factors) rois_blob = np.hstack((levels, rois)) return rois_blob.astype(np.float32, copy=False) def _project_im_rois(im_rois, scales): """Project image RoIs into the image pyramid built by _get_image_blob. Arguments: im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates scales (list): scale factors as returned by _get_image_blob Returns: rois (ndarray): R x 4 matrix of projected RoI coordinates levels (list): image pyramid levels used by each projected RoI """ im_rois = im_rois.astype(np.float, copy=False) if len(scales) > 1: widths = im_rois[:, 2] - im_rois[:, 0] + 1 heights = im_rois[:, 3] - im_rois[:, 1] + 1 areas = widths * heights scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2) diff_areas = np.abs(scaled_areas - 224 * 224) levels = diff_areas.argmin(axis=1)[:, np.newaxis] else: levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) rois = im_rois * scales[levels] return rois, levels def _get_blobs(im, rois): """Convert an image and RoIs within that image into network inputs.""" blobs = {'data' : None, 'rois' : None} blobs['data'], im_scale_factors = _get_image_blob(im) if not cfg.TEST.HAS_RPN: blobs['rois'] = _get_rois_blob(rois, im_scale_factors) return blobs, im_scale_factors def im_detect(net, im, boxes=None): """Detect object classes in an image given object proposals. Arguments: net (caffe.Net): Fast R-CNN network to use im (ndarray): color image to test (in BGR order) boxes (ndarray): R x 4 array of object proposals or None (for RPN) Returns: scores (ndarray): R x K array of object class scores (K includes background as object category 0) boxes (ndarray): R x (4*K) array of predicted bounding boxes """ blobs, im_scales = _get_blobs(im, boxes) # When mapping from image ROIs to feature map ROIs, there's some aliasing # (some distinct image ROIs get mapped to the same feature ROI). # Here, we identify duplicate feature ROIs, so we only compute features # on the unique subset. if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: v = np.array([1, 1e3, 1e6, 1e9, 1e12]) hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v) _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True) blobs['rois'] = blobs['rois'][index, :] boxes = boxes[index, :] if cfg.TEST.HAS_RPN: im_blob = blobs['data'] blobs['im_info'] = np.array( [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32) # reshape network inputs net.blobs['data'].reshape(*(blobs['data'].shape)) if cfg.TEST.HAS_RPN: net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) else: net.blobs['rois'].reshape(*(blobs['rois'].shape)) # do forward forward_kwargs = {'data': blobs['data'].astype(np.float32, copy=False)} if cfg.TEST.HAS_RPN: forward_kwargs['im_info'] = blobs['im_info'].astype(np.float32, copy=False) else: forward_kwargs['rois'] = blobs['rois'].astype(np.float32, copy=False) blobs_out = net.forward(**forward_kwargs) if cfg.TEST.HAS_RPN: assert len(im_scales) == 1, "Only single-image batch implemented" rois = net.blobs['rois'].data.copy() # unscale back to raw image space boxes = rois[:, 1:5] / im_scales[0] if cfg.TEST.SVM: # use the raw scores before softmax under the assumption they # were trained as linear SVMs scores = net.blobs['cls_score'].data else: # use softmax estimated probabilities scores = blobs_out['cls_prob'] if cfg.TEST.BBOX_REG: # Apply bounding-box regression deltas box_deltas = blobs_out['bbox_pred'] pred_boxes = bbox_transform_inv(boxes, box_deltas) pred_boxes = clip_boxes(pred_boxes, im.shape) else: # Simply repeat the boxes, once for each class pred_boxes = np.tile(boxes, (1, scores.shape[1])) if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: # Map scores and predictions back to the original set of boxes scores = scores[inv_index, :] pred_boxes = pred_boxes[inv_index, :] return scores, pred_boxes def vis_detections(im, class_name, dets, thresh=0.3): """Visual debugging of detections.""" import matplotlib.pyplot as plt im = im[:, :, (2, 1, 0)] for i in xrange(np.minimum(10, dets.shape[0])): bbox = dets[i, :4] score = dets[i, -1] if score > thresh: plt.cla() plt.imshow(im) plt.gca().add_patch( plt.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], fill=False, edgecolor='g', linewidth=3) ) plt.title('{} {:.3f}'.format(class_name, score)) plt.show() def apply_nms(all_boxes, thresh): """Apply non-maximum suppression to all predicted boxes output by the test_net method. """ num_classes = len(all_boxes) num_images = len(all_boxes[0]) nms_boxes = [[[] for _ in xrange(num_images)] for _ in xrange(num_classes)] for cls_ind in xrange(num_classes): for im_ind in xrange(num_images): dets = all_boxes[cls_ind][im_ind] if dets == []: continue # CPU NMS is much faster than GPU NMS when the number of boxes # is relative small (e.g., < 10k) # TODO(rbg): autotune NMS dispatch keep = nms(dets, thresh, force_cpu=True) if len(keep) == 0: continue nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() return nms_boxes def test_net(net, imdb, max_per_image=100, thresh=0.05, vis=False): """Test a Fast R-CNN network on an image database.""" num_images = len(imdb.image_index) # all detections are collected into: # all_boxes[cls][image] = N x 5 array of detections in # (x1, y1, x2, y2, score) all_boxes = [[[] for _ in xrange(num_images)] for _ in xrange(imdb.num_classes)] output_dir = get_output_dir(imdb, net) # timers _t = {'im_detect' : Timer(), 'misc' : Timer()} if not cfg.TEST.HAS_RPN: roidb = imdb.roidb for i in xrange(num_images): # filter out any ground truth boxes if cfg.TEST.HAS_RPN: box_proposals = None else: # The roidb may contain ground-truth rois (for example, if the roidb # comes from the training or val split). We only want to evaluate # detection on the *non*-ground-truth rois. We select those the rois # that have the gt_classes field set to 0, which means there's no # ground truth. box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0] im = cv2.imread(imdb.image_path_at(i)) _t['im_detect'].tic() scores, boxes = im_detect(net, im, box_proposals) _t['im_detect'].toc() _t['misc'].tic() # skip j = 0, because it's the background class for j in xrange(1, imdb.num_classes): inds = np.where(scores[:, j] > thresh)[0] cls_scores = scores[inds, j] cls_boxes = boxes[inds, j*4:(j+1)*4] cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ .astype(np.float32, copy=False) keep = nms(cls_dets, cfg.TEST.NMS) cls_dets = cls_dets[keep, :] if vis: vis_detections(im, imdb.classes[j], cls_dets) all_boxes[j][i] = cls_dets # Limit to max_per_image detections *over all classes* if max_per_image > 0: image_scores = np.hstack([all_boxes[j][i][:, -1] for j in xrange(1, imdb.num_classes)]) if len(image_scores) > max_per_image: image_thresh = np.sort(image_scores)[-max_per_image] for j in xrange(1, imdb.num_classes): keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] all_boxes[j][i] = all_boxes[j][i][keep, :] _t['misc'].toc() print 'im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \ .format(i + 1, num_images, _t['im_detect'].average_time, _t['misc'].average_time) det_file = os.path.join(output_dir, 'detections.pkl') with open(det_file, 'wb') as f: cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL) print 'Evaluating detections' imdb.evaluate_detections(all_boxes, output_dir) ================================================ FILE: lib/fast_rcnn/train.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Train a Fast R-CNN network.""" import caffe from fast_rcnn.config import cfg import roi_data_layer.roidb as rdl_roidb from utils.timer import Timer import numpy as np import os from caffe.proto import caffe_pb2 import google.protobuf as pb2 class SolverWrapper(object): """A simple wrapper around Caffe's solver. This wrapper gives us control over he snapshotting process, which we use to unnormalize the learned bounding-box regression weights. """ def __init__(self, solver_prototxt, roidb, output_dir, pretrained_model=None): """Initialize the SolverWrapper.""" self.output_dir = output_dir if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and cfg.TRAIN.BBOX_NORMALIZE_TARGETS): # RPN can only use precomputed normalization because there are no # fixed statistics to compute a priori assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED if cfg.TRAIN.BBOX_REG: print 'Computing bounding-box regression targets...' self.bbox_means, self.bbox_stds = \ rdl_roidb.add_bbox_regression_targets(roidb) print 'done' self.solver = caffe.SGDSolver(solver_prototxt) if pretrained_model is not None: print ('Loading pretrained model ' 'weights from {:s}').format(pretrained_model) self.solver.net.copy_from(pretrained_model) self.solver_param = caffe_pb2.SolverParameter() with open(solver_prototxt, 'rt') as f: pb2.text_format.Merge(f.read(), self.solver_param) self.solver.net.layers[0].set_roidb(roidb) def snapshot(self): """Take a snapshot of the network after unnormalizing the learned bounding-box regression weights. This enables easy use at test-time. """ net = self.solver.net scale_bbox_params = (cfg.TRAIN.BBOX_REG and cfg.TRAIN.BBOX_NORMALIZE_TARGETS and net.params.has_key('bbox_pred')) if scale_bbox_params: # save original values orig_0 = net.params['bbox_pred'][0].data.copy() orig_1 = net.params['bbox_pred'][1].data.copy() # scale and shift with bbox reg unnormalization; then save snapshot net.params['bbox_pred'][0].data[...] = \ (net.params['bbox_pred'][0].data * self.bbox_stds[:, np.newaxis]) net.params['bbox_pred'][1].data[...] = \ (net.params['bbox_pred'][1].data * self.bbox_stds + self.bbox_means) infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') filename = (self.solver_param.snapshot_prefix + infix + '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') filename = os.path.join(self.output_dir, filename) net.save(str(filename)) print 'Wrote snapshot to: {:s}'.format(filename) if scale_bbox_params: # restore net to original state net.params['bbox_pred'][0].data[...] = orig_0 net.params['bbox_pred'][1].data[...] = orig_1 return filename def train_model(self, max_iters): """Network training loop.""" last_snapshot_iter = -1 timer = Timer() model_paths = [] while self.solver.iter < max_iters: # Make one SGD update timer.tic() self.solver.step(1) timer.toc() if self.solver.iter % (10 * self.solver_param.display) == 0: print 'speed: {:.3f}s / iter'.format(timer.average_time) if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: last_snapshot_iter = self.solver.iter model_paths.append(self.snapshot()) if last_snapshot_iter != self.solver.iter: model_paths.append(self.snapshot()) return model_paths def get_training_roidb(imdb): """Returns a roidb (Region of Interest database) for use in training.""" if cfg.TRAIN.USE_FLIPPED: print 'Appending horizontally-flipped training examples...' imdb.append_flipped_images() print 'done' print 'Preparing training data...' rdl_roidb.prepare_roidb(imdb) print 'done' return imdb.roidb def filter_roidb(roidb): """Remove roidb entries that have no usable RoIs.""" def is_valid(entry): # Valid images have: # (1) At least one foreground RoI OR # (2) At least one background RoI overlaps = entry['max_overlaps'] # find boxes with sufficient overlap fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] # image is only valid if such boxes exist valid = len(fg_inds) > 0 or len(bg_inds) > 0 return valid num = len(roidb) filtered_roidb = [entry for entry in roidb if is_valid(entry)] num_after = len(filtered_roidb) print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after, num, num_after) return filtered_roidb def train_net(solver_prototxt, roidb, output_dir, pretrained_model=None, max_iters=40000): """Train a Fast R-CNN network.""" roidb = filter_roidb(roidb) sw = SolverWrapper(solver_prototxt, roidb, output_dir, pretrained_model=pretrained_model) print 'Solving...' model_paths = sw.train_model(max_iters) print 'done solving' return model_paths ================================================ FILE: lib/nms/.gitignore ================================================ *.c *.cpp *.so ================================================ FILE: lib/nms/__init__.py ================================================ ================================================ FILE: lib/nms/cpu_nms.pyx ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import numpy as np cimport numpy as np cdef inline np.float32_t max(np.float32_t a, np.float32_t b): return a if a >= b else b cdef inline np.float32_t min(np.float32_t a, np.float32_t b): return a if a <= b else b def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] cdef int ndets = dets.shape[0] cdef np.ndarray[np.int_t, ndim=1] suppressed = \ np.zeros((ndets), dtype=np.int) # nominal indices cdef int _i, _j # sorted indices cdef int i, j # temp variables for box i's (the box currently under consideration) cdef np.float32_t ix1, iy1, ix2, iy2, iarea # variables for computing overlap with box j (lower scoring box) cdef np.float32_t xx1, yy1, xx2, yy2 cdef np.float32_t w, h cdef np.float32_t inter, ovr keep = [] for _i in range(ndets): i = order[_i] if suppressed[i] == 1: continue keep.append(i) ix1 = x1[i] iy1 = y1[i] ix2 = x2[i] iy2 = y2[i] iarea = areas[i] for _j in range(_i + 1, ndets): j = order[_j] if suppressed[j] == 1: continue xx1 = max(ix1, x1[j]) yy1 = max(iy1, y1[j]) xx2 = min(ix2, x2[j]) yy2 = min(iy2, y2[j]) w = max(0.0, xx2 - xx1 + 1) h = max(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (iarea + areas[j] - inter) if ovr >= thresh: suppressed[j] = 1 return keep ================================================ FILE: lib/nms/gpu_nms.hpp ================================================ void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, int boxes_dim, float nms_overlap_thresh, int device_id); ================================================ FILE: lib/nms/gpu_nms.pyx ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import numpy as np cimport numpy as np assert sizeof(int) == sizeof(np.int32_t) cdef extern from "gpu_nms.hpp": void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, np.int32_t device_id=0): cdef int boxes_num = dets.shape[0] cdef int boxes_dim = dets.shape[1] cdef int num_out cdef np.ndarray[np.int32_t, ndim=1] \ keep = np.zeros(boxes_num, dtype=np.int32) cdef np.ndarray[np.float32_t, ndim=1] \ scores = dets[:, 4] cdef np.ndarray[np.int_t, ndim=1] \ order = scores.argsort()[::-1] cdef np.ndarray[np.float32_t, ndim=2] \ sorted_dets = dets[order, :] _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) keep = keep[:num_out] return list(order[keep]) ================================================ FILE: lib/nms/nms_kernel.cu ================================================ // ------------------------------------------------------------------ // Faster R-CNN // Copyright (c) 2015 Microsoft // Licensed under The MIT License [see fast-rcnn/LICENSE for details] // Written by Shaoqing Ren // ------------------------------------------------------------------ #include "gpu_nms.hpp" #include #include #define CUDA_CHECK(condition) \ /* Code block avoids redefinition of cudaError_t error */ \ do { \ cudaError_t error = condition; \ if (error != cudaSuccess) { \ std::cout << cudaGetErrorString(error) << std::endl; \ } \ } while (0) #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) int const threadsPerBlock = sizeof(unsigned long long) * 8; __device__ inline float devIoU(float const * const a, float const * const b) { float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); return interS / (Sa + Sb - interS); } __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, const float *dev_boxes, unsigned long long *dev_mask) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const float *cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = DIVUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } void _set_device(int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); } void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, int boxes_dim, float nms_overlap_thresh, int device_id) { _set_device(device_id); float* boxes_dev = NULL; unsigned long long* mask_dev = NULL; const int col_blocks = DIVUP(boxes_num, threadsPerBlock); CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float))); CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host, boxes_num * boxes_dim * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc(&mask_dev, boxes_num * col_blocks * sizeof(unsigned long long))); dim3 blocks(DIVUP(boxes_num, threadsPerBlock), DIVUP(boxes_num, threadsPerBlock)); dim3 threads(threadsPerBlock); nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev); std::vector mask_host(boxes_num * col_blocks); CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, sizeof(unsigned long long) * boxes_num * col_blocks, cudaMemcpyDeviceToHost)); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long *p = &mask_host[0] + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } *num_out = num_to_keep; CUDA_CHECK(cudaFree(boxes_dev)); CUDA_CHECK(cudaFree(mask_dev)); } ================================================ FILE: lib/nms/py_cpu_nms.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import numpy as np def py_cpu_nms(dets, thresh): """Pure Python NMS baseline.""" x1 = dets[:, 0] y1 = dets[:, 1] x2 = dets[:, 2] y2 = dets[:, 3] scores = dets[:, 4] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(0.0, xx2 - xx1 + 1) h = np.maximum(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (areas[i] + areas[order[1:]] - inter) inds = np.where(ovr <= thresh)[0] order = order[inds + 1] return keep ================================================ FILE: lib/pycocotools/UPSTREAM_REV ================================================ https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574 ================================================ FILE: lib/pycocotools/__init__.py ================================================ __author__ = 'tylin' ================================================ FILE: lib/pycocotools/_mask.pyx ================================================ # distutils: language = c # distutils: sources = ../MatlabAPI/private/maskApi.c #************************************************************************** # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] #************************************************************************** __author__ = 'tsungyi' # import both Python-level and C-level symbols of Numpy # the API uses Numpy to interface C and Python import numpy as np cimport numpy as np from libc.stdlib cimport malloc, free # intialized Numpy. must do. np.import_array() # import numpy C function # we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) # Declare the prototype of the C functions in MaskApi.h cdef extern from "maskApi.h": ctypedef unsigned int uint ctypedef unsigned long siz ctypedef unsigned char byte ctypedef double* BB ctypedef struct RLE: siz h, siz w, siz m, uint* cnts, void rlesInit( RLE **R, siz n ) void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) void rleDecode( const RLE *R, byte *mask, siz n ) void rleMerge( const RLE *R, RLE *M, siz n, bint intersect ) void rleArea( const RLE *R, siz n, uint *a ) void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) void rleToBbox( const RLE *R, BB bb, siz n ) void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) char* rleToString( const RLE *R ) void rleFrString( RLE *R, char *s, siz h, siz w ) # python class to wrap RLE array in C # the class handles the memory allocation and deallocation cdef class RLEs: cdef RLE *_R cdef siz _n def __cinit__(self, siz n =0): rlesInit(&self._R, n) self._n = n # free the RLE array here def __dealloc__(self): if self._R is not NULL: for i in range(self._n): free(self._R[i].cnts) free(self._R) def __getattr__(self, key): if key == 'n': return self._n raise AttributeError(key) # python class to wrap Mask array in C # the class handles the memory allocation and deallocation cdef class Masks: cdef byte *_mask cdef siz _h cdef siz _w cdef siz _n def __cinit__(self, h, w, n): self._mask = malloc(h*w*n* sizeof(byte)) self._h = h self._w = w self._n = n # def __dealloc__(self): # the memory management of _mask has been passed to np.ndarray # it doesn't need to be freed here # called when passing into np.array() and return an np.ndarray in column-major order def __array__(self): cdef np.npy_intp shape[1] shape[0] = self._h*self._w*self._n # Create a 1D array, and reshape it to fortran/Matlab column-major array ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') # The _mask allocated by Masks is now handled by ndarray PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) return ndarray # internal conversion from Python RLEs object to compressed RLE format def _toString(RLEs Rs): cdef siz n = Rs.n cdef bytes py_string cdef char* c_string objs = [] for i in range(n): c_string = rleToString( &Rs._R[i] ) py_string = c_string objs.append({ 'size': [Rs._R[i].h, Rs._R[i].w], 'counts': py_string }) free(c_string) return objs # internal conversion from compressed RLE format to Python RLEs object def _frString(rleObjs): cdef siz n = len(rleObjs) Rs = RLEs(n) cdef bytes py_string cdef char* c_string for i, obj in enumerate(rleObjs): py_string = str(obj['counts']) c_string = py_string rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) return Rs # encode mask to RLEs objects # list of RLE string can be generated by RLEs member function def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] cdef RLEs Rs = RLEs(n) rleEncode(Rs._R,mask.data,h,w,n) objs = _toString(Rs) return objs # decode mask from compressed list of RLE string or RLEs object def decode(rleObjs): cdef RLEs Rs = _frString(rleObjs) h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n masks = Masks(h, w, n) rleDecode( Rs._R, masks._mask, n ); return np.array(masks) def merge(rleObjs, bint intersect=0): cdef RLEs Rs = _frString(rleObjs) cdef RLEs R = RLEs(1) rleMerge(Rs._R, R._R, Rs._n, intersect) obj = _toString(R)[0] return obj def area(rleObjs): cdef RLEs Rs = _frString(rleObjs) cdef uint* _a = malloc(Rs._n* sizeof(uint)) rleArea(Rs._R, Rs._n, _a) cdef np.npy_intp shape[1] shape[0] = Rs._n a = np.array((Rs._n, ), dtype=np.uint8) a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) return a # iou computation. support function overload (RLEs-RLEs and bbox-bbox). def iou( dt, gt, pyiscrowd ): def _preproc(objs): if len(objs) == 0: return objs if type(objs) == np.ndarray: if len(objs.shape) == 1: objs = objs.reshape((objs[0], 1)) # check if it's Nx4 bbox if not len(objs.shape) == 2 or not objs.shape[1] == 4: raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') objs = objs.astype(np.double) elif type(objs) == list: # check if list is in box format and convert it to np.ndarray isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) isrle = np.all(np.array([type(obj) == dict for obj in objs])) if isbox: objs = np.array(objs, dtype=np.double) if len(objs.shape) == 1: objs = objs.reshape((1,objs.shape[0])) elif isrle: objs = _frString(objs) else: raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') else: raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') return objs def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) def _len(obj): cdef siz N = 0 if type(obj) == RLEs: N = obj.n elif len(obj)==0: pass elif type(obj) == np.ndarray: N = obj.shape[0] return N # convert iscrowd to numpy array cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) # simple type checking cdef siz m, n dt = _preproc(dt) gt = _preproc(gt) m = _len(dt) n = _len(gt) if m == 0 or n == 0: return [] if not type(dt) == type(gt): raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') # define local variables cdef double* _iou = 0 cdef np.npy_intp shape[1] # check type and assign iou function if type(dt) == RLEs: _iouFun = _rleIou elif type(dt) == np.ndarray: _iouFun = _bbIou else: raise Exception('input data type not allowed.') _iou = malloc(m*n* sizeof(double)) iou = np.zeros((m*n, ), dtype=np.double) shape[0] = m*n iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) _iouFun(dt, gt, iscrowd, m, n, iou) return iou.reshape((m,n), order='F') def toBbox( rleObjs ): cdef RLEs Rs = _frString(rleObjs) cdef siz n = Rs.n cdef BB _bb = malloc(4*n* sizeof(double)) rleToBbox( Rs._R, _bb, n ) cdef np.npy_intp shape[1] shape[0] = 4*n bb = np.array((1,4*n), dtype=np.double) bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) return bb def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): cdef siz n = bb.shape[0] Rs = RLEs(n) rleFrBbox( Rs._R, bb.data, h, w, n ) objs = _toString(Rs) return objs def frPoly( poly, siz h, siz w ): cdef np.ndarray[np.double_t, ndim=1] np_poly n = len(poly) Rs = RLEs(n) for i, p in enumerate(poly): np_poly = np.array(p, dtype=np.double, order='F') rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) objs = _toString(Rs) return objs def frUncompressedRLE(ucRles, siz h, siz w): cdef np.ndarray[np.uint32_t, ndim=1] cnts cdef RLE R cdef uint *data n = len(ucRles) objs = [] for i in range(n): Rs = RLEs(1) cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) # time for malloc can be saved here but it's fine data = malloc(len(cnts)* sizeof(uint)) for j in range(len(cnts)): data[j] = cnts[j] R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) Rs._R[0] = R objs.append(_toString(Rs)[0]) return objs def frPyObjects(pyobj, siz h, w): if type(pyobj) == np.ndarray: objs = frBbox(pyobj, h, w ) elif type(pyobj) == list and len(pyobj[0]) == 4: objs = frBbox(pyobj, h, w ) elif type(pyobj) == list and len(pyobj[0]) > 4: objs = frPoly(pyobj, h, w ) elif type(pyobj) == list and type(pyobj[0]) == dict: objs = frUncompressedRLE(pyobj, h, w) else: raise Exception('input type is not supported.') return objs ================================================ FILE: lib/pycocotools/coco.py ================================================ __author__ = 'tylin' __version__ = '1.0.1' # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, # segmentation, and caption generation. pycocotools is a Python API that # assists in loading, parsing and visualizing the annotations in COCO. # Please visit http://mscoco.org/ for more information on COCO, including # for the data, paper, and tutorials. The exact format of the annotations # is also described on the COCO website. For example usage of the pycocotools # please see pycocotools_demo.ipynb. In addition to this API, please download both # the COCO images and annotations in order to run the demo. # An alternative to using the API is to load the annotations directly # into Python dictionary # Using the API provides additional utility functions. Note that this API # supports both *instance* and *caption* annotations. In the case of # captions not all functions are defined (e.g. categories are undefined). # The following API functions are defined: # COCO - COCO api class that loads COCO annotation file and prepare data structures. # decodeMask - Decode binary mask M encoded via run-length encoding. # encodeMask - Encode binary mask M using run-length encoding. # getAnnIds - Get ann ids that satisfy given filter conditions. # getCatIds - Get cat ids that satisfy given filter conditions. # getImgIds - Get img ids that satisfy given filter conditions. # loadAnns - Load anns with the specified ids. # loadCats - Load cats with the specified ids. # loadImgs - Load imgs with the specified ids. # segToMask - Convert polygon segmentation to binary mask. # showAnns - Display the specified annotations. # loadRes - Load algorithm results and create API for accessing them. # download - Download COCO images from mscoco.org server. # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. # Help on each functions can be accessed by: "help COCO>function". # See also COCO>decodeMask, # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, # COCO>loadImgs, COCO>segToMask, COCO>showAnns # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. # Licensed under the Simplified BSD License [see bsd.txt] import json import datetime import time import matplotlib.pyplot as plt from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon import numpy as np from skimage.draw import polygon import urllib import copy import itertools import mask import os class COCO: def __init__(self, annotation_file=None): """ Constructor of Microsoft COCO helper class for reading and visualizing annotations. :param annotation_file (str): location of annotation file :param image_folder (str): location to the folder that hosts images. :return: """ # load dataset self.dataset = {} self.anns = [] self.imgToAnns = {} self.catToImgs = {} self.imgs = {} self.cats = {} if not annotation_file == None: print 'loading annotations into memory...' tic = time.time() dataset = json.load(open(annotation_file, 'r')) print 'Done (t=%0.2fs)'%(time.time()- tic) self.dataset = dataset self.createIndex() def createIndex(self): # create index print 'creating index...' anns = {} imgToAnns = {} catToImgs = {} cats = {} imgs = {} if 'annotations' in self.dataset: imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} anns = {ann['id']: [] for ann in self.dataset['annotations']} for ann in self.dataset['annotations']: imgToAnns[ann['image_id']] += [ann] anns[ann['id']] = ann if 'images' in self.dataset: imgs = {im['id']: {} for im in self.dataset['images']} for img in self.dataset['images']: imgs[img['id']] = img if 'categories' in self.dataset: cats = {cat['id']: [] for cat in self.dataset['categories']} for cat in self.dataset['categories']: cats[cat['id']] = cat catToImgs = {cat['id']: [] for cat in self.dataset['categories']} if 'annotations' in self.dataset: for ann in self.dataset['annotations']: catToImgs[ann['category_id']] += [ann['image_id']] print 'index created!' # create class members self.anns = anns self.imgToAnns = imgToAnns self.catToImgs = catToImgs self.imgs = imgs self.cats = cats def info(self): """ Print information about the annotation file. :return: """ for key, value in self.dataset['info'].items(): print '%s: %s'%(key, value) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ Get ann ids that satisfy given filter conditions. default skips that filter :param imgIds (int array) : get anns for given imgs catIds (int array) : get anns for given cats areaRng (float array) : get anns for given area range (e.g. [0 inf]) iscrowd (boolean) : get anns for given crowd label (False or True) :return: ids (int array) : integer array of ann ids """ imgIds = imgIds if type(imgIds) == list else [imgIds] catIds = catIds if type(catIds) == list else [catIds] if len(imgIds) == len(catIds) == len(areaRng) == 0: anns = self.dataset['annotations'] else: if not len(imgIds) == 0: # this can be changed by defaultdict lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] anns = list(itertools.chain.from_iterable(lists)) else: anns = self.dataset['annotations'] anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] if not iscrowd == None: ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] else: ids = [ann['id'] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): """ filtering parameters. default skips that filter. :param catNms (str array) : get cats for given cat names :param supNms (str array) : get cats for given supercategory names :param catIds (int array) : get cats for given cat ids :return: ids (int array) : integer array of cat ids """ catNms = catNms if type(catNms) == list else [catNms] supNms = supNms if type(supNms) == list else [supNms] catIds = catIds if type(catIds) == list else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: cats = self.dataset['categories'] else: cats = self.dataset['categories'] cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] ids = [cat['id'] for cat in cats] return ids def getImgIds(self, imgIds=[], catIds=[]): ''' Get img ids that satisfy given filter conditions. :param imgIds (int array) : get imgs for given ids :param catIds (int array) : get imgs with all given cats :return: ids (int array) : integer array of img ids ''' imgIds = imgIds if type(imgIds) == list else [imgIds] catIds = catIds if type(catIds) == list else [catIds] if len(imgIds) == len(catIds) == 0: ids = self.imgs.keys() else: ids = set(imgIds) for i, catId in enumerate(catIds): if i == 0 and len(ids) == 0: ids = set(self.catToImgs[catId]) else: ids &= set(self.catToImgs[catId]) return list(ids) def loadAnns(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying anns :return: anns (object array) : loaded ann objects """ if type(ids) == list: return [self.anns[id] for id in ids] elif type(ids) == int: return [self.anns[ids]] def loadCats(self, ids=[]): """ Load cats with the specified ids. :param ids (int array) : integer ids specifying cats :return: cats (object array) : loaded cat objects """ if type(ids) == list: return [self.cats[id] for id in ids] elif type(ids) == int: return [self.cats[ids]] def loadImgs(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying img :return: imgs (object array) : loaded img objects """ if type(ids) == list: return [self.imgs[id] for id in ids] elif type(ids) == int: return [self.imgs[ids]] def showAnns(self, anns): """ Display the specified annotations. :param anns (array of object): annotations to display :return: None """ if len(anns) == 0: return 0 if 'segmentation' in anns[0]: datasetType = 'instances' elif 'caption' in anns[0]: datasetType = 'captions' if datasetType == 'instances': ax = plt.gca() polygons = [] color = [] for ann in anns: c = np.random.random((1, 3)).tolist()[0] if type(ann['segmentation']) == list: # polygon for seg in ann['segmentation']: poly = np.array(seg).reshape((len(seg)/2, 2)) polygons.append(Polygon(poly, True,alpha=0.4)) color.append(c) else: # mask t = self.imgs[ann['image_id']] if type(ann['segmentation']['counts']) == list: rle = mask.frPyObjects([ann['segmentation']], t['height'], t['width']) else: rle = [ann['segmentation']] m = mask.decode(rle) img = np.ones( (m.shape[0], m.shape[1], 3) ) if ann['iscrowd'] == 1: color_mask = np.array([2.0,166.0,101.0])/255 if ann['iscrowd'] == 0: color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): img[:,:,i] = color_mask[i] ax.imshow(np.dstack( (img, m*0.5) )) p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) ax.add_collection(p) elif datasetType == 'captions': for ann in anns: print ann['caption'] def loadRes(self, resFile): """ Load result file and return a result api object. :param resFile (str) : file name of result file :return: res (obj) : result api object """ res = COCO() res.dataset['images'] = [img for img in self.dataset['images']] # res.dataset['info'] = copy.deepcopy(self.dataset['info']) # res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) print 'Loading and preparing results... ' tic = time.time() anns = json.load(open(resFile)) assert type(anns) == list, 'results in not an array of objects' annsImgIds = [ann['image_id'] for ann in anns] assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 'Results do not correspond to current coco set' if 'caption' in anns[0]: imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] for id, ann in enumerate(anns): ann['id'] = id+1 elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): bb = ann['bbox'] x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] if not 'segmentation' in ann: ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] ann['area'] = bb[2]*bb[3] ann['id'] = id+1 ann['iscrowd'] = 0 elif 'segmentation' in anns[0]: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): # now only support compressed RLE format as segmentation results ann['area'] = mask.area([ann['segmentation']])[0] if not 'bbox' in ann: ann['bbox'] = mask.toBbox([ann['segmentation']])[0] ann['id'] = id+1 ann['iscrowd'] = 0 print 'DONE (t=%0.2fs)'%(time.time()- tic) res.dataset['annotations'] = anns res.createIndex() return res def download( self, tarDir = None, imgIds = [] ): ''' Download COCO images from mscoco.org server. :param tarDir (str): COCO results directory name imgIds (list): images to be downloaded :return: ''' if tarDir is None: print 'Please specify target directory' return -1 if len(imgIds) == 0: imgs = self.imgs.values() else: imgs = self.loadImgs(imgIds) N = len(imgs) if not os.path.exists(tarDir): os.makedirs(tarDir) for i, img in enumerate(imgs): tic = time.time() fname = os.path.join(tarDir, img['file_name']) if not os.path.exists(fname): urllib.urlretrieve(img['coco_url'], fname) print 'downloaded %d/%d images (t=%.1fs)'%(i, N, time.time()- tic) ================================================ FILE: lib/pycocotools/cocoeval.py ================================================ __author__ = 'tsungyi' import numpy as np import datetime import time from collections import defaultdict import mask import copy class COCOeval: # Interface for evaluating detection on the Microsoft COCO dataset. # # The usage for CocoEval is as follows: # cocoGt=..., cocoDt=... # load dataset and results # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object # E.params.recThrs = ...; # set parameters as desired # E.evaluate(); # run per image evaluation # E.accumulate(); # accumulate per image results # E.summarize(); # display summary metrics of results # For example usage see evalDemo.m and http://mscoco.org/. # # The evaluation parameters are as follows (defaults in brackets): # imgIds - [all] N img ids to use for evaluation # catIds - [all] K cat ids to use for evaluation # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation # recThrs - [0:.01:1] R=101 recall thresholds for evaluation # areaRng - [...] A=4 object area ranges for evaluation # maxDets - [1 10 100] M=3 thresholds on max detections per image # useSegm - [1] if true evaluate against ground-truth segments # useCats - [1] if true use category labels for evaluation # Note: if useSegm=0 the evaluation is run on bounding boxes. # Note: if useCats=0 category labels are ignored as in proposal scoring. # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. # # evaluate(): evaluates detections on every image and every category and # concats the results into the "evalImgs" with fields: # dtIds - [1xD] id for each of the D detections (dt) # gtIds - [1xG] id for each of the G ground truths (gt) # dtMatches - [TxD] matching gt id at each IoU or 0 # gtMatches - [TxG] matching dt id at each IoU or 0 # dtScores - [1xD] confidence of each dt # gtIgnore - [1xG] ignore flag for each gt # dtIgnore - [TxD] ignore flag for each dt at each IoU # # accumulate(): accumulates the per-image, per-category evaluation # results in "evalImgs" into the dictionary "eval" with fields: # params - parameters used for evaluation # date - date evaluation was performed # counts - [T,R,K,A,M] parameter dimensions (see above) # precision - [TxRxKxAxM] precision for every evaluation setting # recall - [TxKxAxM] max recall for every evaluation setting # Note: precision and recall==-1 for settings with no gt objects. # # See also coco, mask, pycocoDemo, pycocoEvalDemo # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] def __init__(self, cocoGt=None, cocoDt=None): ''' Initialize CocoEval using coco APIs for gt and dt :param cocoGt: coco object with ground truth annotations :param cocoDt: coco object with detection results :return: None ''' self.cocoGt = cocoGt # ground truth COCO API self.cocoDt = cocoDt # detections COCO API self.params = {} # evaluation parameters self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements self.eval = {} # accumulated evaluation results self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation self.params = Params() # parameters self._paramsEval = {} # parameters for evaluation self.stats = [] # result summarization self.ious = {} # ious between all gts and dts if not cocoGt is None: self.params.imgIds = sorted(cocoGt.getImgIds()) self.params.catIds = sorted(cocoGt.getCatIds()) def _prepare(self): ''' Prepare ._gts and ._dts for evaluation based on params :return: None ''' # def _toMask(objs, coco): # modify segmentation by reference for obj in objs: t = coco.imgs[obj['image_id']] if type(obj['segmentation']) == list: if type(obj['segmentation'][0]) == dict: print 'debug' obj['segmentation'] = mask.frPyObjects(obj['segmentation'],t['height'],t['width']) if len(obj['segmentation']) == 1: obj['segmentation'] = obj['segmentation'][0] else: # an object can have multiple polygon regions # merge them into one RLE mask obj['segmentation'] = mask.merge(obj['segmentation']) elif type(obj['segmentation']) == dict and type(obj['segmentation']['counts']) == list: obj['segmentation'] = mask.frPyObjects([obj['segmentation']],t['height'],t['width'])[0] elif type(obj['segmentation']) == dict and \ type(obj['segmentation']['counts'] == unicode or type(obj['segmentation']['counts']) == str): pass else: raise Exception('segmentation format not supported.') p = self.params if p.useCats: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) else: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) if p.useSegm: _toMask(gts, self.cocoGt) _toMask(dts, self.cocoDt) self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation for gt in gts: self._gts[gt['image_id'], gt['category_id']].append(gt) for dt in dts: self._dts[dt['image_id'], dt['category_id']].append(dt) self.evalImgs = defaultdict(list) # per-image per-category evaluation results self.eval = {} # accumulated evaluation results def evaluate(self): ''' Run per image evaluation on given images and store results (a list of dict) in self.evalImgs :return: None ''' tic = time.time() print 'Running per image evaluation... ' p = self.params p.imgIds = list(np.unique(p.imgIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) p.maxDets = sorted(p.maxDets) self.params=p self._prepare() # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] computeIoU = self.computeIoU self.ious = {(imgId, catId): computeIoU(imgId, catId) \ for imgId in p.imgIds for catId in catIds} evaluateImg = self.evaluateImg maxDet = p.maxDets[-1] self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet) for catId in catIds for areaRng in p.areaRng for imgId in p.imgIds ] self._paramsEval = copy.deepcopy(self.params) toc = time.time() print 'DONE (t=%0.2fs).'%(toc-tic) def computeIoU(self, imgId, catId): p = self.params if p.useCats: gt = self._gts[imgId,catId] dt = self._dts[imgId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] if len(gt) == 0 and len(dt) ==0: return [] dt = sorted(dt, key=lambda x: -x['score']) if len(dt) > p.maxDets[-1]: dt=dt[0:p.maxDets[-1]] if p.useSegm: g = [g['segmentation'] for g in gt] d = [d['segmentation'] for d in dt] else: g = [g['bbox'] for g in gt] d = [d['bbox'] for d in dt] # compute iou between each dt and gt region iscrowd = [int(o['iscrowd']) for o in gt] ious = mask.iou(d,g,iscrowd) return ious def evaluateImg(self, imgId, catId, aRng, maxDet): ''' perform evaluation for single category and image :return: dict (single image results) ''' # p = self.params if p.useCats: gt = self._gts[imgId,catId] dt = self._dts[imgId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] if len(gt) == 0 and len(dt) ==0: return None for g in gt: if 'ignore' not in g: g['ignore'] = 0 if g['iscrowd'] == 1 or g['ignore'] or (g['area']aRng[1]): g['_ignore'] = 1 else: g['_ignore'] = 0 # sort dt highest score first, sort gt ignore last # gt = sorted(gt, key=lambda x: x['_ignore']) gtind = [ind for (ind, g) in sorted(enumerate(gt), key=lambda (ind, g): g['_ignore']) ] gt = [gt[ind] for ind in gtind] dt = sorted(dt, key=lambda x: -x['score'])[0:maxDet] iscrowd = [int(o['iscrowd']) for o in gt] # load computed ious N_iou = len(self.ious[imgId, catId]) ious = self.ious[imgId, catId][0:maxDet, np.array(gtind)] if N_iou >0 else self.ious[imgId, catId] T = len(p.iouThrs) G = len(gt) D = len(dt) gtm = np.zeros((T,G)) dtm = np.zeros((T,D)) gtIg = np.array([g['_ignore'] for g in gt]) dtIg = np.zeros((T,D)) if not len(ious)==0: for tind, t in enumerate(p.iouThrs): for dind, d in enumerate(dt): # information about best match so far (m=-1 -> unmatched) iou = min([t,1-1e-10]) m = -1 for gind, g in enumerate(gt): # if this gt already matched, and not a crowd, continue if gtm[tind,gind]>0 and not iscrowd[gind]: continue # if dt matched to reg gt, and on ignore gt, stop if m>-1 and gtIg[m]==0 and gtIg[gind]==1: break # continue to next gt unless better match made if ious[dind,gind] < iou: continue # match successful and best so far, store appropriately iou=ious[dind,gind] m=gind # if match made store id of match for both dt and gt if m ==-1: continue dtIg[tind,dind] = gtIg[m] dtm[tind,dind] = gt[m]['id'] gtm[tind,m] = d['id'] # set unmatched detections outside of area range to ignore a = np.array([d['area']aRng[1] for d in dt]).reshape((1, len(dt))) dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) # store results for given image and category return { 'image_id': imgId, 'category_id': catId, 'aRng': aRng, 'maxDet': maxDet, 'dtIds': [d['id'] for d in dt], 'gtIds': [g['id'] for g in gt], 'dtMatches': dtm, 'gtMatches': gtm, 'dtScores': [d['score'] for d in dt], 'gtIgnore': gtIg, 'dtIgnore': dtIg, } def accumulate(self, p = None): ''' Accumulate per image evaluation results and store the result in self.eval :param p: input params for evaluation :return: None ''' print 'Accumulating evaluation results... ' tic = time.time() if not self.evalImgs: print 'Please run evaluate() first' # allows input customized parameters if p is None: p = self.params p.catIds = p.catIds if p.useCats == 1 else [-1] T = len(p.iouThrs) R = len(p.recThrs) K = len(p.catIds) if p.useCats else 1 A = len(p.areaRng) M = len(p.maxDets) precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories recall = -np.ones((T,K,A,M)) # create dictionary for future indexing _pe = self._paramsEval catIds = _pe.catIds if _pe.useCats else [-1] setK = set(catIds) setA = set(map(tuple, _pe.areaRng)) setM = set(_pe.maxDets) setI = set(_pe.imgIds) # get inds to evaluate k_list = [n for n, k in enumerate(p.catIds) if k in setK] m_list = [m for n, m in enumerate(p.maxDets) if m in setM] a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] i_list = [n for n, i in enumerate(p.imgIds) if i in setI] # K0 = len(_pe.catIds) I0 = len(_pe.imgIds) A0 = len(_pe.areaRng) # retrieve E at each category, area range, and max number of detections for k, k0 in enumerate(k_list): Nk = k0*A0*I0 for a, a0 in enumerate(a_list): Na = a0*I0 for m, maxDet in enumerate(m_list): E = [self.evalImgs[Nk+Na+i] for i in i_list] E = filter(None, E) if len(E) == 0: continue dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) # different sorting method generates slightly different results. # mergesort is used to be consistent as Matlab implementation. inds = np.argsort(-dtScores, kind='mergesort') dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] gtIg = np.concatenate([e['gtIgnore'] for e in E]) npig = len([ig for ig in gtIg if ig == 0]) if npig == 0: continue tps = np.logical_and( dtm, np.logical_not(dtIg) ) fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): tp = np.array(tp) fp = np.array(fp) nd = len(tp) rc = tp / npig pr = tp / (fp+tp+np.spacing(1)) q = np.zeros((R,)) if nd: recall[t,k,a,m] = rc[-1] else: recall[t,k,a,m] = 0 # numpy is slow without cython optimization for accessing elements # use python array gets significant speed improvement pr = pr.tolist(); q = q.tolist() for i in range(nd-1, 0, -1): if pr[i] > pr[i-1]: pr[i-1] = pr[i] inds = np.searchsorted(rc, p.recThrs) try: for ri, pi in enumerate(inds): q[ri] = pr[pi] except: pass precision[t,:,k,a,m] = np.array(q) self.eval = { 'params': p, 'counts': [T, R, K, A, M], 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'precision': precision, 'recall': recall, } toc = time.time() print 'DONE (t=%0.2fs).'%( toc-tic ) def summarize(self): ''' Compute and display summary metrics for evaluation results. Note this functin can *only* be applied on the default parameter setting ''' def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): p = self.params iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6} | maxDets={:>3} ] = {}' titleStr = 'Average Precision' if ap == 1 else 'Average Recall' typeStr = '(AP)' if ap==1 else '(AR)' iouStr = '%0.2f:%0.2f'%(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else '%0.2f'%(iouThr) areaStr = areaRng maxDetsStr = '%d'%(maxDets) aind = [i for i, aRng in enumerate(['all', 'small', 'medium', 'large']) if aRng == areaRng] mind = [i for i, mDet in enumerate([1, 10, 100]) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval['precision'] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] # areaRng s = s[:,:,:,aind,mind] else: # dimension of recall: [TxKxAxM] s = self.eval['recall'] s = s[:,:,aind,mind] if len(s[s>-1])==0: mean_s = -1 else: mean_s = np.mean(s[s>-1]) print iStr.format(titleStr, typeStr, iouStr, areaStr, maxDetsStr, '%.3f'%(float(mean_s))) return mean_s if not self.eval: raise Exception('Please run accumulate() first') self.stats = np.zeros((12,)) self.stats[0] = _summarize(1) self.stats[1] = _summarize(1,iouThr=.5) self.stats[2] = _summarize(1,iouThr=.75) self.stats[3] = _summarize(1,areaRng='small') self.stats[4] = _summarize(1,areaRng='medium') self.stats[5] = _summarize(1,areaRng='large') self.stats[6] = _summarize(0,maxDets=1) self.stats[7] = _summarize(0,maxDets=10) self.stats[8] = _summarize(0,maxDets=100) self.stats[9] = _summarize(0,areaRng='small') self.stats[10] = _summarize(0,areaRng='medium') self.stats[11] = _summarize(0,areaRng='large') def __str__(self): self.summarize() class Params: ''' Params for coco evaluation api ''' def __init__(self): self.imgIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value self.iouThrs = np.linspace(.5, 0.95, np.round((0.95-.5)/.05)+1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, np.round((1.00-.0)/.01)+1, endpoint=True) self.maxDets = [1,10,100] self.areaRng = [ [0**2,1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2] ] self.useSegm = 0 self.useCats = 1 ================================================ FILE: lib/pycocotools/license.txt ================================================ Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. ================================================ FILE: lib/pycocotools/mask.py ================================================ __author__ = 'tsungyi' import pycocotools._mask as _mask # Interface for manipulating masks stored in RLE format. # # RLE is a simple yet efficient format for storing binary masks. RLE # first divides a vector (or vectorized image) into a series of piecewise # constant regions and then for each piece simply stores the length of # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] # (note that the odd counts are always the numbers of zeros). Instead of # storing the counts directly, additional compression is achieved with a # variable bitrate representation based on a common scheme called LEB128. # # Compression is greatest given large piecewise constant regions. # Specifically, the size of the RLE is proportional to the number of # *boundaries* in M (or for an image the number of boundaries in the y # direction). Assuming fairly simple shapes, the RLE representation is # O(sqrt(n)) where n is number of pixels in the object. Hence space usage # is substantially lower, especially for large simple objects (large n). # # Many common operations on masks can be computed directly using the RLE # (without need for decoding). This includes computations such as area, # union, intersection, etc. All of these operations are linear in the # size of the RLE, in other words they are O(sqrt(n)) where n is the area # of the object. Computing these operations on the original mask is O(n). # Thus, using the RLE can result in substantial computational savings. # # The following API functions are defined: # encode - Encode binary masks using RLE. # decode - Decode binary masks encoded via RLE. # merge - Compute union or intersection of encoded masks. # iou - Compute intersection over union between masks. # area - Compute area of encoded masks. # toBbox - Get bounding boxes surrounding encoded masks. # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. # # Usage: # Rs = encode( masks ) # masks = decode( Rs ) # R = merge( Rs, intersect=false ) # o = iou( dt, gt, iscrowd ) # a = area( Rs ) # bbs = toBbox( Rs ) # Rs = frPyObjects( [pyObjects], h, w ) # # In the API the following formats are used: # Rs - [dict] Run-length encoding of binary masks # R - dict Run-length encoding of binary mask # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore # bbs - [nx4] Bounding box(es) stored as [x y w h] # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) # dt,gt - May be either bounding boxes or encoded masks # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). # # Finally, a note about the intersection over union (iou) computation. # The standard iou of a ground truth (gt) and detected (dt) object is # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) # For "crowd" regions, we use a modified criteria. If a gt object is # marked as "iscrowd", we allow a dt to match any subregion of the gt. # Choosing gt' in the crowd gt that best matches the dt can be done using # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) # For crowd gt regions we use this modified criteria above for the iou. # # To compile run "python setup.py build_ext --inplace" # Please do not contact us for help with compiling. # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] encode = _mask.encode decode = _mask.decode iou = _mask.iou merge = _mask.merge area = _mask.area toBbox = _mask.toBbox frPyObjects = _mask.frPyObjects ================================================ FILE: lib/pycocotools/maskApi.c ================================================ /************************************************************************** * Microsoft COCO Toolbox. version 2.0 * Data, paper, and tutorials available at: http://mscoco.org/ * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. * Licensed under the Simplified BSD License [see coco/license.txt] **************************************************************************/ #include "maskApi.h" #include #include uint umin( uint a, uint b ) { return (ab) ? a : b; } void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); if(cnts) for(siz j=0; jcnts[j]=cnts[j]; } void rleFree( RLE *R ) { free(R->cnts); R->cnts=0; } void rlesInit( RLE **R, siz n ) { *R = (RLE*) malloc(sizeof(RLE)*n); for(siz i=0; i0 ) { c=umin(ca,cb); cc+=c; ct=0; ca-=c; if(!ca && a0) { crowd=iscrowd!=NULL && iscrowd[g]; if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb; ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; while( ct>0 ) { c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; ca-=c; if(!ca && ad?1:c=dy && xs>xe) || (dxye); if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; if(dx>=dy) for( int d=0; d<=dx; d++ ) { t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; } else for( int d=0; d<=dy; d++ ) { t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; } } // get points along y-boundary and downsample free(x); free(y); k=m; m=0; double xd, yd; x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); for( j=1; jw-1 ) continue; yd=(double)(v[j]h) yd=h; yd=ceil(yd); x[m]=(int) xd; y[m]=(int) yd; m++; } // compute rle encoding given y-boundary points k=m; a=malloc(sizeof(uint)*(k+1)); for( j=0; j0) b[m++]=a[j++]; else { j++; if(jm, p=0; long x; bool more; char *s=malloc(sizeof(char)*m*6); for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; while( more ) { char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0; if(more) c |= 0x20; c+=48; s[p++]=c; } } s[p]=0; return s; } void rleFrString( RLE *R, char *s, siz h, siz w ) { siz m=0, p=0, k; long x; bool more; uint *cnts; while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; while( s[p] ) { x=0; k=0; more=1; while( more ) { char c=s[p]-48; x |= (c & 0x1f) << 5*k; more = c & 0x20; p++; k++; if(!more && (c & 0x10)) x |= -1 << 5*k; } if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; } rleInit(R,h,w,m,cnts); free(cnts); } ================================================ FILE: lib/pycocotools/maskApi.h ================================================ /************************************************************************** * Microsoft COCO Toolbox. version 2.0 * Data, paper, and tutorials available at: http://mscoco.org/ * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. * Licensed under the Simplified BSD License [see coco/license.txt] **************************************************************************/ #pragma once #include typedef unsigned int uint; typedef unsigned long siz; typedef unsigned char byte; typedef double* BB; typedef struct { siz h, w, m; uint *cnts; } RLE; // Initialize/destroy RLE. void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); void rleFree( RLE *R ); // Initialize/destroy RLE array. void rlesInit( RLE **R, siz n ); void rlesFree( RLE **R, siz n ); // Encode binary masks using RLE. void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); // Decode binary masks encoded via RLE. void rleDecode( const RLE *R, byte *mask, siz n ); // Compute union or intersection of encoded masks. void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); // Compute area of encoded masks. void rleArea( const RLE *R, siz n, uint *a ); // Compute intersection over union between masks. void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); // Compute intersection over union between bounding boxes. void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); // Get bounding boxes surrounding encoded masks. void rleToBbox( const RLE *R, BB bb, siz n ); // Convert bounding boxes to encoded masks. void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); // Convert polygon to encoded mask. void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); // Get compressed string representation of encoded mask. char* rleToString( const RLE *R ); // Convert from compressed string representation of encoded mask. void rleFrString( RLE *R, char *s, siz h, siz w ); ================================================ FILE: lib/roi_data_layer/__init__.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- ================================================ FILE: lib/roi_data_layer/layer.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """The data layer used during training to train a Fast R-CNN network. RoIDataLayer implements a Caffe Python layer. """ import caffe from fast_rcnn.config import cfg from roi_data_layer.minibatch import get_minibatch import numpy as np import yaml from multiprocessing import Process, Queue class RoIDataLayer(caffe.Layer): """Fast R-CNN data layer used for training.""" def _shuffle_roidb_inds(self): """Randomly permute the training roidb.""" if cfg.TRAIN.ASPECT_GROUPING: widths = np.array([r['width'] for r in self._roidb]) heights = np.array([r['height'] for r in self._roidb]) horz = (widths >= heights) vert = np.logical_not(horz) horz_inds = np.where(horz)[0] vert_inds = np.where(vert)[0] inds = np.hstack(( np.random.permutation(horz_inds), np.random.permutation(vert_inds))) inds = np.reshape(inds, (-1, 2)) row_perm = np.random.permutation(np.arange(inds.shape[0])) inds = np.reshape(inds[row_perm, :], (-1,)) self._perm = inds else: self._perm = np.random.permutation(np.arange(len(self._roidb))) self._cur = 0 def _get_next_minibatch_inds(self): """Return the roidb indices for the next minibatch.""" if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): self._shuffle_roidb_inds() db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] self._cur += cfg.TRAIN.IMS_PER_BATCH return db_inds def _get_next_minibatch(self): """Return the blobs to be used for the next minibatch. If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a separate process and made available through self._blob_queue. """ if cfg.TRAIN.USE_PREFETCH: return self._blob_queue.get() else: db_inds = self._get_next_minibatch_inds() minibatch_db = [self._roidb[i] for i in db_inds] return get_minibatch(minibatch_db, self._num_classes) def set_roidb(self, roidb): """Set the roidb to be used by this layer during training.""" self._roidb = roidb self._shuffle_roidb_inds() if cfg.TRAIN.USE_PREFETCH: self._blob_queue = Queue(10) self._prefetch_process = BlobFetcher(self._blob_queue, self._roidb, self._num_classes) self._prefetch_process.start() # Terminate the child process when the parent exists def cleanup(): print 'Terminating BlobFetcher' self._prefetch_process.terminate() self._prefetch_process.join() import atexit atexit.register(cleanup) def setup(self, bottom, top): """Setup the RoIDataLayer.""" # parse the layer parameter string, which must be valid YAML layer_params = yaml.load(self.param_str_) self._num_classes = layer_params['num_classes'] self._name_to_top_map = {} # data blob: holds a batch of N images, each with 3 channels idx = 0 top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3, max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE) self._name_to_top_map['data'] = idx idx += 1 if cfg.TRAIN.HAS_RPN: top[idx].reshape(1, 3) self._name_to_top_map['im_info'] = idx idx += 1 top[idx].reshape(1, 4) self._name_to_top_map['gt_boxes'] = idx idx += 1 else: # not using RPN # rois blob: holds R regions of interest, each is a 5-tuple # (n, x1, y1, x2, y2) specifying an image batch index n and a # rectangle (x1, y1, x2, y2) top[idx].reshape(1, 5) self._name_to_top_map['rois'] = idx idx += 1 # labels blob: R categorical labels in [0, ..., K] for K foreground # classes plus background top[idx].reshape(1) self._name_to_top_map['labels'] = idx idx += 1 if cfg.TRAIN.BBOX_REG: # bbox_targets blob: R bounding-box regression targets with 4 # targets per class top[idx].reshape(1, self._num_classes * 4) self._name_to_top_map['bbox_targets'] = idx idx += 1 # bbox_inside_weights blob: At most 4 targets per roi are active; # thisbinary vector sepcifies the subset of active targets top[idx].reshape(1, self._num_classes * 4) self._name_to_top_map['bbox_inside_weights'] = idx idx += 1 top[idx].reshape(1, self._num_classes * 4) self._name_to_top_map['bbox_outside_weights'] = idx idx += 1 print 'RoiDataLayer: name_to_top:', self._name_to_top_map assert len(top) == len(self._name_to_top_map) def forward(self, bottom, top): """Get blobs and copy them into this layer's top blob vector.""" blobs = self._get_next_minibatch() for blob_name, blob in blobs.iteritems(): top_ind = self._name_to_top_map[blob_name] # Reshape net's input blobs top[top_ind].reshape(*(blob.shape)) # Copy data into net's input blobs top[top_ind].data[...] = blob.astype(np.float32, copy=False) def backward(self, top, propagate_down, bottom): """This layer does not propagate gradients.""" pass def reshape(self, bottom, top): """Reshaping happens during the call to forward.""" pass class BlobFetcher(Process): """Experimental class for prefetching blobs in a separate process.""" def __init__(self, queue, roidb, num_classes): super(BlobFetcher, self).__init__() self._queue = queue self._roidb = roidb self._num_classes = num_classes self._perm = None self._cur = 0 self._shuffle_roidb_inds() # fix the random seed for reproducibility np.random.seed(cfg.RNG_SEED) def _shuffle_roidb_inds(self): """Randomly permute the training roidb.""" # TODO(rbg): remove duplicated code self._perm = np.random.permutation(np.arange(len(self._roidb))) self._cur = 0 def _get_next_minibatch_inds(self): """Return the roidb indices for the next minibatch.""" # TODO(rbg): remove duplicated code if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): self._shuffle_roidb_inds() db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] self._cur += cfg.TRAIN.IMS_PER_BATCH return db_inds def run(self): print 'BlobFetcher started' while True: db_inds = self._get_next_minibatch_inds() minibatch_db = [self._roidb[i] for i in db_inds] blobs = get_minibatch(minibatch_db, self._num_classes) self._queue.put(blobs) ================================================ FILE: lib/roi_data_layer/minibatch.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Compute minibatch blobs for training a Fast R-CNN network.""" import numpy as np import numpy.random as npr import cv2 from fast_rcnn.config import cfg from utils.blob import prep_im_for_blob, im_list_to_blob def get_minibatch(roidb, num_classes): """Given a roidb, construct a minibatch sampled from it.""" num_images = len(roidb) # Sample random scales to use for each image in this batch random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), size=num_images) assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \ 'num_images ({}) must divide BATCH_SIZE ({})'. \ format(num_images, cfg.TRAIN.BATCH_SIZE) rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) # Get the input image blob, formatted for caffe im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) blobs = {'data': im_blob} if cfg.TRAIN.HAS_RPN: assert len(im_scales) == 1, "Single batch only" assert len(roidb) == 1, "Single batch only" # gt boxes: (x1, y1, x2, y2, cls) gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] blobs['gt_boxes'] = gt_boxes blobs['im_info'] = np.array( [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32) else: # not using RPN # Now, build the region of interest and label blobs rois_blob = np.zeros((0, 5), dtype=np.float32) labels_blob = np.zeros((0), dtype=np.float32) bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32) bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32) # all_overlaps = [] for im_i in xrange(num_images): labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \ = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes) # Add to RoIs blob rois = _project_im_rois(im_rois, im_scales[im_i]) batch_ind = im_i * np.ones((rois.shape[0], 1)) rois_blob_this_image = np.hstack((batch_ind, rois)) rois_blob = np.vstack((rois_blob, rois_blob_this_image)) # Add to labels, bbox targets, and bbox loss blobs labels_blob = np.hstack((labels_blob, labels)) bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets)) bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights)) # all_overlaps = np.hstack((all_overlaps, overlaps)) # For debug visualizations # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps) blobs['rois'] = rois_blob blobs['labels'] = labels_blob if cfg.TRAIN.BBOX_REG: blobs['bbox_targets'] = bbox_targets_blob blobs['bbox_inside_weights'] = bbox_inside_blob blobs['bbox_outside_weights'] = \ np.array(bbox_inside_blob > 0).astype(np.float32) return blobs def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): """Generate a random sample of RoIs comprising foreground and background examples. """ # label = class RoI has max overlap with labels = roidb['max_classes'] overlaps = roidb['max_overlaps'] rois = roidb['boxes'] # Select foreground RoIs as those with >= FG_THRESH overlap fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] # Guard against the case when an image has fewer than fg_rois_per_image # foreground RoIs fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) # Sample foreground regions without replacement if fg_inds.size > 0: fg_inds = npr.choice( fg_inds, size=fg_rois_per_this_image, replace=False) # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] # Compute number of background RoIs to take from this image (guarding # against there being fewer than desired) bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size) # Sample foreground regions without replacement if bg_inds.size > 0: bg_inds = npr.choice( bg_inds, size=bg_rois_per_this_image, replace=False) # The indices that we're selecting (both fg and bg) keep_inds = np.append(fg_inds, bg_inds) # Select sampled values from various arrays: labels = labels[keep_inds] # Clamp labels for the background RoIs to 0 labels[fg_rois_per_this_image:] = 0 overlaps = overlaps[keep_inds] rois = rois[keep_inds] bbox_targets, bbox_inside_weights = _get_bbox_regression_labels( roidb['bbox_targets'][keep_inds, :], num_classes) return labels, overlaps, rois, bbox_targets, bbox_inside_weights def _get_image_blob(roidb, scale_inds): """Builds an input blob from the images in the roidb at the specified scales. """ num_images = len(roidb) processed_ims = [] im_scales = [] for i in xrange(num_images): im = cv2.imread(roidb[i]['image']) if roidb[i]['flipped']: im = im[:, ::-1, :] target_size = cfg.TRAIN.SCALES[scale_inds[i]] im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE) im_scales.append(im_scale) processed_ims.append(im) # Create a blob to hold the input images blob = im_list_to_blob(processed_ims) return blob, im_scales def _project_im_rois(im_rois, im_scale_factor): """Project image RoIs into the rescaled training image.""" rois = im_rois * im_scale_factor return rois def _get_bbox_regression_labels(bbox_target_data, num_classes): """Bounding-box regression targets are stored in a compact form in the roidb. This function expands those targets into the 4-of-4*K representation used by the network (i.e. only one class has non-zero targets). The loss weights are similarly expanded. Returns: bbox_target_data (ndarray): N x 4K blob of regression targets bbox_inside_weights (ndarray): N x 4K blob of loss weights """ clss = bbox_target_data[:, 0] bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) inds = np.where(clss > 0)[0] for ind in inds: cls = clss[ind] start = 4 * cls end = start + 4 bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS return bbox_targets, bbox_inside_weights def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): """Visualize a mini-batch for debugging.""" import matplotlib.pyplot as plt for i in xrange(rois_blob.shape[0]): rois = rois_blob[i, :] im_ind = rois[0] roi = rois[1:] im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() im += cfg.PIXEL_MEANS im = im[:, :, (2, 1, 0)] im = im.astype(np.uint8) cls = labels_blob[i] plt.imshow(im) print 'class: ', cls, ' overlap: ', overlaps[i] plt.gca().add_patch( plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], roi[3] - roi[1], fill=False, edgecolor='r', linewidth=3) ) plt.show() ================================================ FILE: lib/roi_data_layer/roidb.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" import numpy as np from fast_rcnn.config import cfg from fast_rcnn.bbox_transform import bbox_transform from utils.cython_bbox import bbox_overlaps import PIL def prepare_roidb(imdb): """Enrich the imdb's roidb by adding some derived quantities that are useful for training. This function precomputes the maximum overlap, taken over ground-truth boxes, between each ROI and each ground-truth box. The class with maximum overlap is also recorded. """ sizes = [PIL.Image.open(imdb.image_path_at(i)).size for i in xrange(imdb.num_images)] roidb = imdb.roidb for i in xrange(len(imdb.image_index)): roidb[i]['image'] = imdb.image_path_at(i) roidb[i]['width'] = sizes[i][0] roidb[i]['height'] = sizes[i][1] # need gt_overlaps as a dense array for argmax gt_overlaps = roidb[i]['gt_overlaps'].toarray() # max overlap with gt over classes (columns) max_overlaps = gt_overlaps.max(axis=1) # gt class that had the max overlap max_classes = gt_overlaps.argmax(axis=1) roidb[i]['max_classes'] = max_classes roidb[i]['max_overlaps'] = max_overlaps # sanity checks # max overlap of 0 => class should be zero (background) zero_inds = np.where(max_overlaps == 0)[0] assert all(max_classes[zero_inds] == 0) # max overlap > 0 => class should not be zero (must be a fg class) nonzero_inds = np.where(max_overlaps > 0)[0] assert all(max_classes[nonzero_inds] != 0) def add_bbox_regression_targets(roidb): """Add information needed to train bounding-box regressors.""" assert len(roidb) > 0 assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' num_images = len(roidb) # Infer number of classes from the number of columns in gt_overlaps num_classes = roidb[0]['gt_overlaps'].shape[1] for im_i in xrange(num_images): rois = roidb[im_i]['boxes'] max_overlaps = roidb[im_i]['max_overlaps'] max_classes = roidb[im_i]['max_classes'] roidb[im_i]['bbox_targets'] = \ _compute_targets(rois, max_overlaps, max_classes) if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: # Use fixed / precomputed "means" and "stds" instead of empirical values means = np.tile( np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) stds = np.tile( np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) else: # Compute values needed for means and stds # var(x) = E(x^2) - E(x)^2 class_counts = np.zeros((num_classes, 1)) + cfg.EPS sums = np.zeros((num_classes, 4)) squared_sums = np.zeros((num_classes, 4)) for im_i in xrange(num_images): targets = roidb[im_i]['bbox_targets'] for cls in xrange(1, num_classes): cls_inds = np.where(targets[:, 0] == cls)[0] if cls_inds.size > 0: class_counts[cls] += cls_inds.size sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) squared_sums[cls, :] += \ (targets[cls_inds, 1:] ** 2).sum(axis=0) means = sums / class_counts stds = np.sqrt(squared_sums / class_counts - means ** 2) print 'bbox target means:' print means print means[1:, :].mean(axis=0) # ignore bg class print 'bbox target stdevs:' print stds print stds[1:, :].mean(axis=0) # ignore bg class # Normalize targets if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: print "Normalizing targets" for im_i in xrange(num_images): targets = roidb[im_i]['bbox_targets'] for cls in xrange(1, num_classes): cls_inds = np.where(targets[:, 0] == cls)[0] roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] else: print "NOT normalizing targets" # These values will be needed for making predictions # (the predicts will need to be unnormalized and uncentered) return means.ravel(), stds.ravel() def _compute_targets(rois, overlaps, labels): """Compute bounding-box regression targets for an image.""" # Indices of ground-truth ROIs gt_inds = np.where(overlaps == 1)[0] if len(gt_inds) == 0: # Bail if the image has no ground-truth ROIs return np.zeros((rois.shape[0], 5), dtype=np.float32) # Indices of examples for which we try to make predictions ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] # Get IoU overlap between each ex ROI and gt ROI ex_gt_overlaps = bbox_overlaps( np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) # Find which gt ROI each ex ROI has max overlap with: # this will be the ex ROI's gt target gt_assignment = ex_gt_overlaps.argmax(axis=1) gt_rois = rois[gt_inds[gt_assignment], :] ex_rois = rois[ex_inds, :] targets = np.zeros((rois.shape[0], 5), dtype=np.float32) targets[ex_inds, 0] = labels[ex_inds] targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) return targets ================================================ FILE: lib/rpn/README.md ================================================ ### `rpn` module overview ##### `generate_anchors.py` Generates a regular grid of multi-scale, multi-aspect anchor boxes. ##### `proposal_layer.py` Converts RPN outputs (per-anchor scores and bbox regression estimates) into object proposals. ##### `anchor_target_layer.py` Generates training targets/labels for each anchor. Classification labels are 1 (object), 0 (not object) or -1 (ignore). Bbox regression targets are specified when the classification label is > 0. ##### `proposal_target_layer.py` Generates training targets/labels for each object proposal: classification labels 0 - K (bg or object class 1, ... , K) and bbox regression targets in that case that the label is > 0. ##### `generate.py` Generate object detection proposals from an imdb using an RPN. ================================================ FILE: lib/rpn/__init__.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- ================================================ FILE: lib/rpn/anchor_target_layer.py ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- import os import caffe import yaml from fast_rcnn.config import cfg import numpy as np import numpy.random as npr from generate_anchors import generate_anchors from utils.cython_bbox import bbox_overlaps from fast_rcnn.bbox_transform import bbox_transform DEBUG = False class AnchorTargetLayer(caffe.Layer): """ Assign anchors to ground-truth targets. Produces anchor classification labels and bounding-box regression targets. """ def setup(self, bottom, top): layer_params = yaml.load(self.param_str_) anchor_scales = layer_params.get('scales', (8, 16, 32)) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] self._feat_stride = layer_params['feat_stride'] if DEBUG: print 'anchors:' print self._anchors print 'anchor shapes:' print np.hstack(( self._anchors[:, 2::4] - self._anchors[:, 0::4], self._anchors[:, 3::4] - self._anchors[:, 1::4], )) self._counts = cfg.EPS self._sums = np.zeros((1, 4)) self._squared_sums = np.zeros((1, 4)) self._fg_sum = 0 self._bg_sum = 0 self._count = 0 # allow boxes to sit over the edge by a small amount self._allowed_border = layer_params.get('allowed_border', 0) height, width = bottom[0].data.shape[-2:] if DEBUG: print 'AnchorTargetLayer: height', height, 'width', width A = self._num_anchors # labels top[0].reshape(1, 1, A * height, width) # bbox_targets top[1].reshape(1, A * 4, height, width) # bbox_inside_weights top[2].reshape(1, A * 4, height, width) # bbox_outside_weights top[3].reshape(1, A * 4, height, width) def forward(self, bottom, top): # Algorithm: # # for each (H, W) location i # generate 9 anchor boxes centered on cell i # apply predicted bbox deltas at cell i to each of the 9 anchors # filter out-of-image anchors # measure GT overlap assert bottom[0].data.shape[0] == 1, \ 'Only single item batches are supported' # map of shape (..., H, W) height, width = bottom[0].data.shape[-2:] # GT boxes (x1, y1, x2, y2, label) gt_boxes = bottom[1].data # im_info im_info = bottom[2].data[0, :] if DEBUG: print '' print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) print 'scale: {}'.format(im_info[2]) print 'height, width: ({}, {})'.format(height, width) print 'rpn: gt_boxes.shape', gt_boxes.shape print 'rpn: gt_boxes', gt_boxes # 1. Generate proposals from bbox deltas and shifted anchors shift_x = np.arange(0, width) * self._feat_stride shift_y = np.arange(0, height) * self._feat_stride shift_x, shift_y = np.meshgrid(shift_x, shift_y) shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() # add A anchors (1, A, 4) to # cell K shifts (K, 1, 4) to get # shift anchors (K, A, 4) # reshape to (K*A, 4) shifted anchors A = self._num_anchors K = shifts.shape[0] all_anchors = (self._anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) all_anchors = all_anchors.reshape((K * A, 4)) total_anchors = int(K * A) # only keep anchors inside the image inds_inside = np.where( (all_anchors[:, 0] >= -self._allowed_border) & (all_anchors[:, 1] >= -self._allowed_border) & (all_anchors[:, 2] < im_info[1] + self._allowed_border) & # width (all_anchors[:, 3] < im_info[0] + self._allowed_border) # height )[0] if DEBUG: print 'total_anchors', total_anchors print 'inds_inside', len(inds_inside) # keep only inside anchors anchors = all_anchors[inds_inside, :] if DEBUG: print 'anchors.shape', anchors.shape # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside), ), dtype=np.float32) labels.fill(-1) # overlaps between the anchors and the gt boxes # overlaps (ex, gt) overlaps = bbox_overlaps( np.ascontiguousarray(anchors, dtype=np.float), np.ascontiguousarray(gt_boxes, dtype=np.float)) argmax_overlaps = overlaps.argmax(axis=1) max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] gt_argmax_overlaps = overlaps.argmax(axis=0) gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])] gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: # assign bg labels first so that positive labels can clobber them labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 # fg label: for each gt, anchor with highest overlap labels[gt_argmax_overlaps] = 1 # fg label: above threshold IOU labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 if cfg.TRAIN.RPN_CLOBBER_POSITIVES: # assign bg labels last so that negative labels can clobber positives labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 # subsample positive labels if we have too many num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) fg_inds = np.where(labels == 1)[0] if len(fg_inds) > num_fg: disable_inds = npr.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) labels[disable_inds] = -1 # subsample negative labels if we have too many num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) bg_inds = np.where(labels == 0)[0] if len(bg_inds) > num_bg: disable_inds = npr.choice( bg_inds, size=(len(bg_inds) - num_bg), replace=False) labels[disable_inds] = -1 #print "was %s inds, disabling %s, now %s inds" % ( #len(bg_inds), len(disable_inds), np.sum(labels == 0)) bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: # uniform weighting of examples (given non-uniform sampling) num_examples = np.sum(labels >= 0) positive_weights = np.ones((1, 4)) * 1.0 / num_examples negative_weights = np.ones((1, 4)) * 1.0 / num_examples else: assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1)) negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0)) bbox_outside_weights[labels == 1, :] = positive_weights bbox_outside_weights[labels == 0, :] = negative_weights if DEBUG: self._sums += bbox_targets[labels == 1, :].sum(axis=0) self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0) self._counts += np.sum(labels == 1) means = self._sums / self._counts stds = np.sqrt(self._squared_sums / self._counts - means ** 2) print 'means:' print means print 'stdevs:' print stds # map up to original set of anchors labels = _unmap(labels, total_anchors, inds_inside, fill=-1) bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) if DEBUG: print 'rpn: max max_overlap', np.max(max_overlaps) print 'rpn: num_positive', np.sum(labels == 1) print 'rpn: num_negative', np.sum(labels == 0) self._fg_sum += np.sum(labels == 1) self._bg_sum += np.sum(labels == 0) self._count += 1 print 'rpn: num_positive avg', self._fg_sum / self._count print 'rpn: num_negative avg', self._bg_sum / self._count # labels labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) labels = labels.reshape((1, 1, A * height, width)) top[0].reshape(*labels.shape) top[0].data[...] = labels # bbox_targets bbox_targets = bbox_targets \ .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) top[1].reshape(*bbox_targets.shape) top[1].data[...] = bbox_targets # bbox_inside_weights bbox_inside_weights = bbox_inside_weights \ .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) assert bbox_inside_weights.shape[2] == height assert bbox_inside_weights.shape[3] == width top[2].reshape(*bbox_inside_weights.shape) top[2].data[...] = bbox_inside_weights # bbox_outside_weights bbox_outside_weights = bbox_outside_weights \ .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) assert bbox_outside_weights.shape[2] == height assert bbox_outside_weights.shape[3] == width top[3].reshape(*bbox_outside_weights.shape) top[3].data[...] = bbox_outside_weights def backward(self, top, propagate_down, bottom): """This layer does not propagate gradients.""" pass def reshape(self, bottom, top): """Reshaping happens during the call to forward.""" pass def _unmap(data, count, inds, fill=0): """ Unmap a subset of item (data) back to the original set of items (of size count) """ if len(data.shape) == 1: ret = np.empty((count, ), dtype=np.float32) ret.fill(fill) ret[inds] = data else: ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) ret.fill(fill) ret[inds, :] = data return ret def _compute_targets(ex_rois, gt_rois): """Compute bounding-box regression targets for an image.""" assert ex_rois.shape[0] == gt_rois.shape[0] assert ex_rois.shape[1] == 4 assert gt_rois.shape[1] == 5 return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) ================================================ FILE: lib/rpn/generate.py ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- from fast_rcnn.config import cfg from utils.blob import im_list_to_blob from utils.timer import Timer import numpy as np import cv2 def _vis_proposals(im, dets, thresh=0.5): """Draw detected bounding boxes.""" inds = np.where(dets[:, -1] >= thresh)[0] if len(inds) == 0: return class_name = 'obj' im = im[:, :, (2, 1, 0)] fig, ax = plt.subplots(figsize=(12, 12)) ax.imshow(im, aspect='equal') for i in inds: bbox = dets[i, :4] score = dets[i, -1] ax.add_patch( plt.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], fill=False, edgecolor='red', linewidth=3.5) ) ax.text(bbox[0], bbox[1] - 2, '{:s} {:.3f}'.format(class_name, score), bbox=dict(facecolor='blue', alpha=0.5), fontsize=14, color='white') ax.set_title(('{} detections with ' 'p({} | box) >= {:.1f}').format(class_name, class_name, thresh), fontsize=14) plt.axis('off') plt.tight_layout() plt.draw() def _get_image_blob(im): """Converts an image into a network input. Arguments: im (ndarray): a color image in BGR order Returns: blob (ndarray): a data blob holding an image pyramid im_scale_factors (list): list of image scales (relative to im) used in the image pyramid """ im_orig = im.astype(np.float32, copy=True) im_orig -= cfg.PIXEL_MEANS im_shape = im_orig.shape im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) processed_ims = [] assert len(cfg.TEST.SCALES) == 1 target_size = cfg.TEST.SCALES[0] im_scale = float(target_size) / float(im_size_min) # Prevent the biggest axis from being more than MAX_SIZE if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] processed_ims.append(im) # Create a blob to hold the input images blob = im_list_to_blob(processed_ims) return blob, im_info def im_proposals(net, im): """Generate RPN proposals on a single image.""" blobs = {} blobs['data'], blobs['im_info'] = _get_image_blob(im) net.blobs['data'].reshape(*(blobs['data'].shape)) net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) blobs_out = net.forward( data=blobs['data'].astype(np.float32, copy=False), im_info=blobs['im_info'].astype(np.float32, copy=False)) scale = blobs['im_info'][0, 2] boxes = blobs_out['rois'][:, 1:].copy() / scale scores = blobs_out['scores'].copy() return boxes, scores def imdb_proposals(net, imdb): """Generate RPN proposals on all images in an imdb.""" _t = Timer() imdb_boxes = [[] for _ in xrange(imdb.num_images)] for i in xrange(imdb.num_images): im = cv2.imread(imdb.image_path_at(i)) _t.tic() imdb_boxes[i], scores = im_proposals(net, im) _t.toc() print 'im_proposals: {:d}/{:d} {:.3f}s' \ .format(i + 1, imdb.num_images, _t.average_time) if 0: dets = np.hstack((imdb_boxes[i], scores)) # from IPython import embed; embed() _vis_proposals(im, dets[:3, :], thresh=0.9) plt.show() return imdb_boxes ================================================ FILE: lib/rpn/generate_anchors.py ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- import numpy as np # Verify that we compute the same anchors as Shaoqing's matlab implementation: # # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat # >> anchors # # anchors = # # -83 -39 100 56 # -175 -87 192 104 # -359 -183 376 200 # -55 -55 72 72 # -119 -119 136 136 # -247 -247 264 264 # -35 -79 52 96 # -79 -167 96 184 # -167 -343 184 360 #array([[ -83., -39., 100., 56.], # [-175., -87., 192., 104.], # [-359., -183., 376., 200.], # [ -55., -55., 72., 72.], # [-119., -119., 136., 136.], # [-247., -247., 264., 264.], # [ -35., -79., 52., 96.], # [ -79., -167., 96., 184.], # [-167., -343., 184., 360.]]) def generate_anchors(base_size=16, ratios=[0.5, 1, 2], scales=2**np.arange(3, 6)): """ Generate anchor (reference) windows by enumerating aspect ratios X scales wrt a reference (0, 0, 15, 15) window. """ base_anchor = np.array([1, 1, base_size, base_size]) - 1 ratio_anchors = _ratio_enum(base_anchor, ratios) anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) for i in xrange(ratio_anchors.shape[0])]) return anchors def _whctrs(anchor): """ Return width, height, x center, and y center for an anchor (window). """ w = anchor[2] - anchor[0] + 1 h = anchor[3] - anchor[1] + 1 x_ctr = anchor[0] + 0.5 * (w - 1) y_ctr = anchor[1] + 0.5 * (h - 1) return w, h, x_ctr, y_ctr def _mkanchors(ws, hs, x_ctr, y_ctr): """ Given a vector of widths (ws) and heights (hs) around a center (x_ctr, y_ctr), output a set of anchors (windows). """ ws = ws[:, np.newaxis] hs = hs[:, np.newaxis] anchors = np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1))) return anchors def _ratio_enum(anchor, ratios): """ Enumerate a set of anchors for each aspect ratio wrt an anchor. """ w, h, x_ctr, y_ctr = _whctrs(anchor) size = w * h size_ratios = size / ratios ws = np.round(np.sqrt(size_ratios)) hs = np.round(ws * ratios) anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors def _scale_enum(anchor, scales): """ Enumerate a set of anchors for each scale wrt an anchor. """ w, h, x_ctr, y_ctr = _whctrs(anchor) ws = w * scales hs = h * scales anchors = _mkanchors(ws, hs, x_ctr, y_ctr) return anchors if __name__ == '__main__': import time t = time.time() a = generate_anchors() print time.time() - t print a from IPython import embed; embed() ================================================ FILE: lib/rpn/proposal_layer.py ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- import caffe import numpy as np import yaml from fast_rcnn.config import cfg from generate_anchors import generate_anchors from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes from fast_rcnn.nms_wrapper import nms DEBUG = False class ProposalLayer(caffe.Layer): """ Outputs object detection proposals by applying estimated bounding-box transformations to a set of regular boxes (called "anchors"). """ def setup(self, bottom, top): # parse the layer parameter string, which must be valid YAML layer_params = yaml.load(self.param_str_) self._feat_stride = layer_params['feat_stride'] anchor_scales = layer_params.get('scales', (8, 16, 32)) self._anchors = generate_anchors(scales=np.array(anchor_scales)) self._num_anchors = self._anchors.shape[0] if DEBUG: print 'feat_stride: {}'.format(self._feat_stride) print 'anchors:' print self._anchors # rois blob: holds R regions of interest, each is a 5-tuple # (n, x1, y1, x2, y2) specifying an image batch index n and a # rectangle (x1, y1, x2, y2) top[0].reshape(1, 5) # scores blob: holds scores for R regions of interest if len(top) > 1: top[1].reshape(1, 1, 1, 1) def forward(self, bottom, top): # Algorithm: # # for each (H, W) location i # generate A anchor boxes centered on cell i # apply predicted bbox deltas at cell i to each of the A anchors # clip predicted boxes to image # remove predicted boxes with either height or width < threshold # sort all (proposal, score) pairs by score from highest to lowest # take top pre_nms_topN proposals before NMS # apply NMS with threshold 0.7 to remaining proposals # take after_nms_topN proposals after NMS # return the top proposals (-> RoIs top, scores top) assert bottom[0].data.shape[0] == 1, \ 'Only single item batches are supported' cfg_key = str(self.phase) # either 'TRAIN' or 'TEST' pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N nms_thresh = cfg[cfg_key].RPN_NMS_THRESH min_size = cfg[cfg_key].RPN_MIN_SIZE # the first set of _num_anchors channels are bg probs # the second set are the fg probs, which we want scores = bottom[0].data[:, self._num_anchors:, :, :] bbox_deltas = bottom[1].data im_info = bottom[2].data[0, :] if DEBUG: print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) print 'scale: {}'.format(im_info[2]) # 1. Generate proposals from bbox deltas and shifted anchors height, width = scores.shape[-2:] if DEBUG: print 'score map size: {}'.format(scores.shape) # Enumerate all shifts shift_x = np.arange(0, width) * self._feat_stride shift_y = np.arange(0, height) * self._feat_stride shift_x, shift_y = np.meshgrid(shift_x, shift_y) shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() # Enumerate all shifted anchors: # # add A anchors (1, A, 4) to # cell K shifts (K, 1, 4) to get # shift anchors (K, A, 4) # reshape to (K*A, 4) shifted anchors A = self._num_anchors K = shifts.shape[0] anchors = self._anchors.reshape((1, A, 4)) + \ shifts.reshape((1, K, 4)).transpose((1, 0, 2)) anchors = anchors.reshape((K * A, 4)) # Transpose and reshape predicted bbox transformations to get them # into the same order as the anchors: # # bbox deltas will be (1, 4 * A, H, W) format # transpose to (1, H, W, 4 * A) # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) # in slowest to fastest order bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) # Same story for the scores: # # scores are (1, A, H, W) format # transpose to (1, H, W, A) # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) # Convert anchors into proposals via bbox transformations proposals = bbox_transform_inv(anchors, bbox_deltas) # 2. clip predicted boxes to image proposals = clip_boxes(proposals, im_info[:2]) # 3. remove predicted boxes with either height or width < threshold # (NOTE: convert min_size to input image scale stored in im_info[2]) keep = _filter_boxes(proposals, min_size * im_info[2]) proposals = proposals[keep, :] scores = scores[keep] # 4. sort all (proposal, score) pairs by score from highest to lowest # 5. take top pre_nms_topN (e.g. 6000) order = scores.ravel().argsort()[::-1] if pre_nms_topN > 0: order = order[:pre_nms_topN] proposals = proposals[order, :] scores = scores[order] # 6. apply nms (e.g. threshold = 0.7) # 7. take after_nms_topN (e.g. 300) # 8. return the top proposals (-> RoIs top) keep = nms(np.hstack((proposals, scores)), nms_thresh) if post_nms_topN > 0: keep = keep[:post_nms_topN] proposals = proposals[keep, :] scores = scores[keep] # Output rois blob # Our RPN implementation only supports a single input image, so all # batch inds are 0 batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) top[0].reshape(*(blob.shape)) top[0].data[...] = blob # [Optional] output scores blob if len(top) > 1: top[1].reshape(*(scores.shape)) top[1].data[...] = scores def backward(self, top, propagate_down, bottom): """This layer does not propagate gradients.""" pass def reshape(self, bottom, top): """Reshaping happens during the call to forward.""" pass def _filter_boxes(boxes, min_size): """Remove all boxes with any side smaller than min_size.""" ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep ================================================ FILE: lib/rpn/proposal_target_layer.py ================================================ # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick and Sean Bell # -------------------------------------------------------- import caffe import yaml import numpy as np import numpy.random as npr from fast_rcnn.config import cfg from fast_rcnn.bbox_transform import bbox_transform from utils.cython_bbox import bbox_overlaps DEBUG = False class ProposalTargetLayer(caffe.Layer): """ Assign object detection proposals to ground-truth targets. Produces proposal classification labels and bounding-box regression targets. """ def setup(self, bottom, top): layer_params = yaml.load(self.param_str_) self._num_classes = layer_params['num_classes'] # sampled rois (0, x1, y1, x2, y2) top[0].reshape(1, 5) # labels top[1].reshape(1, 1) # bbox_targets top[2].reshape(1, self._num_classes * 4) # bbox_inside_weights top[3].reshape(1, self._num_classes * 4) # bbox_outside_weights top[4].reshape(1, self._num_classes * 4) def forward(self, bottom, top): # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN # (i.e., rpn.proposal_layer.ProposalLayer), or any other source all_rois = bottom[0].data # GT boxes (x1, y1, x2, y2, label) # TODO(rbg): it's annoying that sometimes I have extra info before # and other times after box coordinates -- normalize to one format gt_boxes = bottom[1].data # Include ground-truth boxes in the set of candidate rois zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) all_rois = np.vstack( (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) ) # Sanity check: single batch only assert np.all(all_rois[:, 0] == 0), \ 'Only single item batches are supported' num_images = 1 rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) # Sample rois with classification labels and bounding box regression # targets labels, rois, bbox_targets, bbox_inside_weights = _sample_rois( all_rois, gt_boxes, fg_rois_per_image, rois_per_image, self._num_classes) if DEBUG: print 'num fg: {}'.format((labels > 0).sum()) print 'num bg: {}'.format((labels == 0).sum()) self._count += 1 self._fg_num += (labels > 0).sum() self._bg_num += (labels == 0).sum() print 'num fg avg: {}'.format(self._fg_num / self._count) print 'num bg avg: {}'.format(self._bg_num / self._count) print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) # sampled rois top[0].reshape(*rois.shape) top[0].data[...] = rois # classification labels top[1].reshape(*labels.shape) top[1].data[...] = labels # bbox_targets top[2].reshape(*bbox_targets.shape) top[2].data[...] = bbox_targets # bbox_inside_weights top[3].reshape(*bbox_inside_weights.shape) top[3].data[...] = bbox_inside_weights # bbox_outside_weights top[4].reshape(*bbox_inside_weights.shape) top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32) def backward(self, top, propagate_down, bottom): """This layer does not propagate gradients.""" pass def reshape(self, bottom, top): """Reshaping happens during the call to forward.""" pass def _get_bbox_regression_labels(bbox_target_data, num_classes): """Bounding-box regression targets (bbox_target_data) are stored in a compact form N x (class, tx, ty, tw, th) This function expands those targets into the 4-of-4*K representation used by the network (i.e. only one class has non-zero targets). Returns: bbox_target (ndarray): N x 4K blob of regression targets bbox_inside_weights (ndarray): N x 4K blob of loss weights """ clss = bbox_target_data[:, 0] bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) inds = np.where(clss > 0)[0] for ind in inds: cls = clss[ind] start = 4 * cls end = start + 4 bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS return bbox_targets, bbox_inside_weights def _compute_targets(ex_rois, gt_rois, labels): """Compute bounding-box regression targets for an image.""" assert ex_rois.shape[0] == gt_rois.shape[0] assert ex_rois.shape[1] == 4 assert gt_rois.shape[1] == 4 targets = bbox_transform(ex_rois, gt_rois) if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: # Optionally normalize targets by a precomputed mean and stdev targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) return np.hstack( (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): """Generate a random sample of RoIs comprising foreground and background examples. """ # overlaps: (rois x gt_boxes) overlaps = bbox_overlaps( np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) gt_assignment = overlaps.argmax(axis=1) max_overlaps = overlaps.max(axis=1) labels = gt_boxes[gt_assignment, 4] # Select foreground RoIs as those with >= FG_THRESH overlap fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] # Guard against the case when an image has fewer than fg_rois_per_image # foreground RoIs fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) # Sample foreground regions without replacement if fg_inds.size > 0: fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] # Compute number of background RoIs to take from this image (guarding # against there being fewer than desired) bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) # Sample background regions without replacement if bg_inds.size > 0: bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) # The indices that we're selecting (both fg and bg) keep_inds = np.append(fg_inds, bg_inds) # Select sampled values from various arrays: labels = labels[keep_inds] # Clamp labels for the background RoIs to 0 labels[fg_rois_per_this_image:] = 0 rois = all_rois[keep_inds] bbox_target_data = _compute_targets( rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) bbox_targets, bbox_inside_weights = \ _get_bbox_regression_labels(bbox_target_data, num_classes) return labels, rois, bbox_targets, bbox_inside_weights ================================================ FILE: lib/setup.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import os from os.path import join as pjoin from setuptools import setup from distutils.extension import Extension from Cython.Distutils import build_ext import subprocess import numpy as np def find_in_path(name, path): "Find a file in a search path" # Adapted fom # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ for dir in path.split(os.pathsep): binpath = pjoin(dir, name) if os.path.exists(binpath): return os.path.abspath(binpath) return None def locate_cuda(): """Locate the CUDA environment on the system Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' and values giving the absolute path to each directory. Starts by looking for the CUDAHOME env variable. If not found, everything is based on finding 'nvcc' in the PATH. """ # first check if the CUDAHOME env variable is in use if 'CUDAHOME' in os.environ: home = os.environ['CUDAHOME'] nvcc = pjoin(home, 'bin', 'nvcc') else: # otherwise, search the PATH for NVCC default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) if nvcc is None: raise EnvironmentError('The nvcc binary could not be ' 'located in your $PATH. Either add it to your path, or set $CUDAHOME') home = os.path.dirname(os.path.dirname(nvcc)) cudaconfig = {'home':home, 'nvcc':nvcc, 'include': pjoin(home, 'include'), 'lib64': pjoin(home, 'lib64')} for k, v in cudaconfig.iteritems(): if not os.path.exists(v): raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) return cudaconfig CUDA = locate_cuda() # Obtain the numpy include directory. This logic works across numpy versions. try: numpy_include = np.get_include() except AttributeError: numpy_include = np.get_numpy_include() def customize_compiler_for_nvcc(self): """inject deep into distutils to customize how the dispatch to gcc/nvcc works. If you subclass UnixCCompiler, it's not trivial to get your subclass injected in, and still have the right customizations (i.e. distutils.sysconfig.customize_compiler) run on it. So instead of going the OO route, I have this. Note, it's kindof like a wierd functional subclassing going on.""" # tell the compiler it can processes .cu self.src_extensions.append('.cu') # save references to the default compiler_so and _comple methods default_compiler_so = self.compiler_so super = self._compile # now redefine the _compile method. This gets executed for each # object but distutils doesn't have the ability to change compilers # based on source extension: we add it. def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): if os.path.splitext(src)[1] == '.cu': # use the cuda for .cu files self.set_executable('compiler_so', CUDA['nvcc']) # use only a subset of the extra_postargs, which are 1-1 translated # from the extra_compile_args in the Extension class postargs = extra_postargs['nvcc'] else: postargs = extra_postargs['gcc'] super(obj, src, ext, cc_args, postargs, pp_opts) # reset the default compiler_so, which we might have changed for cuda self.compiler_so = default_compiler_so # inject our redefined _compile method into the class self._compile = _compile # run the customize_compiler class custom_build_ext(build_ext): def build_extensions(self): customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) ext_modules = [ Extension( "utils.cython_bbox", ["utils/bbox.pyx"], extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, include_dirs = [numpy_include] ), Extension( "nms.cpu_nms", ["nms/cpu_nms.pyx"], extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, include_dirs = [numpy_include] ), Extension('nms.gpu_nms', ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], library_dirs=[CUDA['lib64']], libraries=['cudart'], language='c++', runtime_library_dirs=[CUDA['lib64']], # this syntax is specific to this build system # we're only going to use certain compiler args with nvcc and not with # gcc the implementation of this trick is in customize_compiler() below extra_compile_args={'gcc': ["-Wno-unused-function"], 'nvcc': ['-arch=sm_35', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, include_dirs = [numpy_include, CUDA['include']] ), Extension( 'pycocotools._mask', sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], include_dirs = [numpy_include, 'pycocotools'], extra_compile_args={ 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, ), ] setup( name='fast_rcnn', ext_modules=ext_modules, # inject our custom trigger cmdclass={'build_ext': custom_build_ext}, ) ================================================ FILE: lib/transform/__init__.py ================================================ ================================================ FILE: lib/transform/torch_image_transform_layer.py ================================================ # -------------------------------------------------------- # Fast/er R-CNN # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- """ Transform images for compatibility with models trained with https://github.com/facebook/fb.resnet.torch. Usage in model prototxt: layer { name: 'data_xform' type: 'Python' bottom: 'data_caffe' top: 'data' python_param { module: 'transform.torch_image_transform_layer' layer: 'TorchImageTransformLayer' } } """ import caffe from fast_rcnn.config import cfg import numpy as np class TorchImageTransformLayer(caffe.Layer): def setup(self, bottom, top): # (1, 3, 1, 1) shaped arrays self.PIXEL_MEANS = \ np.array([[[[0.48462227599918]], [[0.45624044862054]], [[0.40588363755159]]]]) self.PIXEL_STDS = \ np.array([[[[0.22889466674951]], [[0.22446679341259]], [[0.22495548344775]]]]) # The default ("old") pixel means that were already subtracted channel_swap = (0, 3, 1, 2) self.OLD_PIXEL_MEANS = \ cfg.PIXEL_MEANS[np.newaxis, :, :, :].transpose(channel_swap) top[0].reshape(*(bottom[0].shape)) def forward(self, bottom, top): ims = bottom[0].data # Invert the channel means that were already subtracted ims += self.OLD_PIXEL_MEANS # 1. Permute BGR to RGB and normalize to [0, 1] ims = ims[:, [2, 1, 0], :, :] / 255.0 # 2. Remove channel means ims -= self.PIXEL_MEANS # 3. Standardize channels ims /= self.PIXEL_STDS top[0].reshape(*(ims.shape)) top[0].data[...] = ims def backward(self, top, propagate_down, bottom): """This layer does not propagate gradients.""" pass def reshape(self, bottom, top): """Reshaping happens during the call to forward.""" pass ================================================ FILE: lib/utils/.gitignore ================================================ *.c *.so ================================================ FILE: lib/utils/__init__.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- ================================================ FILE: lib/utils/bbox.pyx ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Sergey Karayev # -------------------------------------------------------- cimport cython import numpy as np cimport numpy as np DTYPE = np.float ctypedef np.float_t DTYPE_t def bbox_overlaps( np.ndarray[DTYPE_t, ndim=2] boxes, np.ndarray[DTYPE_t, ndim=2] query_boxes): """ Parameters ---------- boxes: (N, 4) ndarray of float query_boxes: (K, 4) ndarray of float Returns ------- overlaps: (N, K) ndarray of overlap between boxes and query_boxes """ cdef unsigned int N = boxes.shape[0] cdef unsigned int K = query_boxes.shape[0] cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) cdef DTYPE_t iw, ih, box_area cdef DTYPE_t ua cdef unsigned int k, n for k in range(K): box_area = ( (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1) ) for n in range(N): iw = ( min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1 ) if iw > 0: ih = ( min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1 ) if ih > 0: ua = float( (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1) + box_area - iw * ih ) overlaps[n, k] = iw * ih / ua return overlaps ================================================ FILE: lib/utils/blob.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Blob helper functions.""" import numpy as np import cv2 def im_list_to_blob(ims): """Convert a list of images into a network input. Assumes images are already prepared (means subtracted, BGR order, ...). """ max_shape = np.array([im.shape for im in ims]).max(axis=0) num_images = len(ims) blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), dtype=np.float32) for i in xrange(num_images): im = ims[i] blob[i, 0:im.shape[0], 0:im.shape[1], :] = im # Move channels (axis 3) to axis 1 # Axis order will become: (batch elem, channel, height, width) channel_swap = (0, 3, 1, 2) blob = blob.transpose(channel_swap) return blob def prep_im_for_blob(im, pixel_means, target_size, max_size): """Mean subtract and scale an image for use in a blob.""" im = im.astype(np.float32, copy=False) im -= pixel_means im_shape = im.shape im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) im_scale = float(target_size) / float(im_size_min) # Prevent the biggest axis from being more than MAX_SIZE if np.round(im_scale * im_size_max) > max_size: im_scale = float(max_size) / float(im_size_max) im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) return im, im_scale ================================================ FILE: lib/utils/timer.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- import time class Timer(object): """A simple timer.""" def __init__(self): self.total_time = 0. self.calls = 0 self.start_time = 0. self.diff = 0. self.average_time = 0. def tic(self): # using time.time instead of time.clock because time time.clock # does not normalize for multithreading self.start_time = time.time() def toc(self, average=True): self.diff = time.time() - self.start_time self.total_time += self.diff self.calls += 1 self.average_time = self.total_time / self.calls if average: return self.average_time else: return self.diff ================================================ FILE: models/README.md ================================================ ## Model Zoo ### COCO Faster R-CNN VGG-16 trained using end-to-end Model URL: https://dl.dropboxusercontent.com/s/cotx0y81zvbbhnt/coco_vgg16_faster_rcnn_final.caffemodel?dl=0 Training command: ``` tools/train_net.py \ --gpu 0 \ --solver ./models/coco/VGG16/faster_rcnn_end2end/solver.prototxt \ --weights data/imagenet_models/VGG16.v2.caffemodel \ --imdb coco_2014_train+coco_2014_valminusminival \ --iters 490000 \ --cfg ./experiments/cfgs/faster_rcnn_end2end.yml ``` `py-faster-rcnn` commit: 68eec95 test-dev2015 results ``` Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.242 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.453 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.235 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.077 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.371 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.340 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.346 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.120 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.385 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.544 ``` test-standard2015 results ``` Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.242 Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.453 Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.234 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.072 Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264 Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.369 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.238 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.341 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.347 Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.115 Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.389 Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.544 ``` ================================================ FILE: models/coco/VGG16/fast_rcnn/solver.prototxt ================================================ train_net: "models/coco/VGG16/fast_rcnn/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 200000 display: 20 average_loss: 100 # iter_size: 1 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_fast_rcnn" #debug_info: true ================================================ FILE: models/coco/VGG16/fast_rcnn/test.prototxt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "rois" input_shape { dim: 1 # to be changed on-the-fly to num ROIs dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/coco/VGG16/fast_rcnn/train.prototxt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 81" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/coco/VGG16/faster_rcnn_end2end/solver.prototxt ================================================ train_net: "models/coco/VGG16/faster_rcnn_end2end/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 350000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_faster_rcnn" iter_size: 2 ================================================ FILE: models/coco/VGG16/faster_rcnn_end2end/test.prototxt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 24 # 2(bg/fg) * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 48 # 4 * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 24 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/coco/VGG16/faster_rcnn_end2end/train.prototxt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 81" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 24 # 2(bg/fg) * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 48 # 4 * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 24 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rpn_rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } layer { name: 'roi-data' type: 'Python' bottom: 'rpn_rois' bottom: 'gt_boxes' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'rpn.proposal_target_layer' layer: 'ProposalTargetLayer' param_str: "'num_classes': 81" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/coco/VGG_CNN_M_1024/fast_rcnn/solver.prototxt ================================================ train_net: "models/coco/VGG_CNN_M_1024/fast_rcnn/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 200000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" #debug_info: true ================================================ FILE: models/coco/VGG_CNN_M_1024/fast_rcnn/test.prototxt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "rois" input_shape { dim: 1 # to be changed on-the-fly to num ROIs dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/coco/VGG_CNN_M_1024/fast_rcnn/train.prototxt ================================================ name: "VGG_CNN_M_1024" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 81" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/solver.prototxt ================================================ train_net: "models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 350000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_faster_rcnn" ================================================ FILE: models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/test.prototxt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } #layer { # name: "rpn_conv/3x3" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/3x3" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 192 # kernel_size: 3 pad: 1 stride: 1 # weight_filler { type: "gaussian" std: 0.01 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn_conv/5x5" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/5x5" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 64 # kernel_size: 5 pad: 2 stride: 1 # weight_filler { type: "gaussian" std: 0.0036 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn/output" # type: "Concat" # bottom: "rpn_conv/3x3" # bottom: "rpn_conv/5x5" # top: "rpn/output" #} #layer { # name: "rpn_relu/output" # type: "ReLU" # bottom: "rpn/output" # top: "rpn/output" #} layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 24 # 2(bg/fg) * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 48 # 4 * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 24 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt ================================================ name: "VGG_CNN_M_1024" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 81" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 24 # 2(bg/fg) * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 48 # 4 * 12(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 24 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rpn_rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16 \n'scales': !!python/tuple [4, 8, 16, 32]" } } layer { name: 'roi-data' type: 'Python' bottom: 'rpn_rois' bottom: 'gt_boxes' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'rpn.proposal_target_layer' layer: 'ProposalTargetLayer' param_str: "'num_classes': 81" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 81 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 324 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/pascal_voc/VGG16/fast_rcnn/solver.prototxt ================================================ train_net: "models/pascal_voc/VGG16/fast_rcnn/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 # iter_size: 1 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_fast_rcnn" #debug_info: true ================================================ FILE: models/pascal_voc/VGG16/fast_rcnn/test.prototxt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "rois" input_shape { dim: 1 # to be changed on-the-fly to num ROIs dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG16/fast_rcnn/train.prototxt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/faster_rcnn_test.pt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" inner_product_param { num_output: 21 } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" inner_product_param { num_output: 84 } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/rpn_test.pt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' top: 'scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_fast_rcnn" ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_rpn" ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_rpn_train.pt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 25088 } data_filler { type: "constant" value: 0 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_fast_rcnn" ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_rpn" ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_rpn_train.pt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 25088 } data_filler { type: "constant" value: 0 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_end2end/solver.prototxt ================================================ train_net: "models/pascal_voc/VGG16/faster_rcnn_end2end/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 50000 display: 20 average_loss: 100 # iter_size: 1 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg16_faster_rcnn" iter_size: 2 ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_end2end/test.prototxt ================================================ name: "VGG_ILSVRC_16_layers" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG16/faster_rcnn_end2end/train.prototxt ================================================ name: "VGG_ILSVRC_16_layers" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1_1" type: "Convolution" bottom: "data" top: "conv1_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_1" type: "ReLU" bottom: "conv1_1" top: "conv1_1" } layer { name: "conv1_2" type: "Convolution" bottom: "conv1_1" top: "conv1_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 64 pad: 1 kernel_size: 3 } } layer { name: "relu1_2" type: "ReLU" bottom: "conv1_2" top: "conv1_2" } layer { name: "pool1" type: "Pooling" bottom: "conv1_2" top: "pool1" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv2_1" type: "Convolution" bottom: "pool1" top: "conv2_1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_1" type: "ReLU" bottom: "conv2_1" top: "conv2_1" } layer { name: "conv2_2" type: "Convolution" bottom: "conv2_1" top: "conv2_2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 128 pad: 1 kernel_size: 3 } } layer { name: "relu2_2" type: "ReLU" bottom: "conv2_2" top: "conv2_2" } layer { name: "pool2" type: "Pooling" bottom: "conv2_2" top: "pool2" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv3_1" type: "Convolution" bottom: "pool2" top: "conv3_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_1" type: "ReLU" bottom: "conv3_1" top: "conv3_1" } layer { name: "conv3_2" type: "Convolution" bottom: "conv3_1" top: "conv3_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_2" type: "ReLU" bottom: "conv3_2" top: "conv3_2" } layer { name: "conv3_3" type: "Convolution" bottom: "conv3_2" top: "conv3_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 3 } } layer { name: "relu3_3" type: "ReLU" bottom: "conv3_3" top: "conv3_3" } layer { name: "pool3" type: "Pooling" bottom: "conv3_3" top: "pool3" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv4_1" type: "Convolution" bottom: "pool3" top: "conv4_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_1" type: "ReLU" bottom: "conv4_1" top: "conv4_1" } layer { name: "conv4_2" type: "Convolution" bottom: "conv4_1" top: "conv4_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_2" type: "ReLU" bottom: "conv4_2" top: "conv4_2" } layer { name: "conv4_3" type: "Convolution" bottom: "conv4_2" top: "conv4_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4_3" type: "ReLU" bottom: "conv4_3" top: "conv4_3" } layer { name: "pool4" type: "Pooling" bottom: "conv4_3" top: "pool4" pooling_param { pool: MAX kernel_size: 2 stride: 2 } } layer { name: "conv5_1" type: "Convolution" bottom: "pool4" top: "conv5_1" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_1" type: "ReLU" bottom: "conv5_1" top: "conv5_1" } layer { name: "conv5_2" type: "Convolution" bottom: "conv5_1" top: "conv5_2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_2" type: "ReLU" bottom: "conv5_2" top: "conv5_2" } layer { name: "conv5_3" type: "Convolution" bottom: "conv5_2" top: "conv5_3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5_3" type: "ReLU" bottom: "conv5_3" top: "conv5_3" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5_3" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rpn_rois' # top: 'rpn_scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #layer { # name: 'debug-data' # type: 'Python' # bottom: 'data' # bottom: 'rpn_rois' # bottom: 'rpn_scores' # python_param { # module: 'rpn.debug_layer' # layer: 'RPNDebugLayer' # } #} layer { name: 'roi-data' type: 'Python' bottom: 'rpn_rois' bottom: 'gt_boxes' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'rpn.proposal_target_layer' layer: 'ProposalTargetLayer' param_str: "'num_classes': 21" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5_3" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 7 pooled_h: 7 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/solver.prototxt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" #debug_info: true ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/test.prototxt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "rois" input_shape { dim: 1 # to be changed on-the-fly to num ROIs dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt ================================================ name: "VGG_CNN_M_1024" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/faster_rcnn_test.pt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" inner_product_param { num_output: 21 } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" inner_product_param { num_output: 84 } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/rpn_test.pt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' top: 'scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt ================================================ name: "VGG_CNN_M_1024" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_rpn" ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_train.pt ================================================ name: "VGG_CNN_M_1024" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 18432 } data_filler { type: "gaussian" std: 0.01 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt ================================================ name: "VGG_CNN_M_1024" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_rpn" ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_train.pt ================================================ name: "VGG_CNN_M_1024" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 18432 } data_filler { type: "gaussian" std: 0.01 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/solver.prototxt ================================================ train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 50000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "vgg_cnn_m_1024_faster_rcnn" ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/test.prototxt ================================================ name: "VGG_CNN_M_1024" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } #layer { # name: "rpn_conv/3x3" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/3x3" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 192 # kernel_size: 3 pad: 1 stride: 1 # weight_filler { type: "gaussian" std: 0.01 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn_conv/5x5" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/5x5" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 64 # kernel_size: 5 pad: 2 stride: 1 # weight_filler { type: "gaussian" std: 0.0036 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn/output" # type: "Concat" # bottom: "rpn_conv/3x3" # bottom: "rpn_conv/5x5" # top: "rpn/output" #} #layer { # name: "rpn_relu/output" # type: "ReLU" # bottom: "rpn/output" # top: "rpn/output" #} layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 decay_mult: 1.0 } param { lr_mult: 2.0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" } ================================================ FILE: models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt ================================================ name: "VGG_CNN_M_1024" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 256 pad: 1 kernel_size: 5 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 5 alpha: 0.0005 beta: 0.75 k: 2 } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { pool: MAX kernel_size: 3 stride: 2 } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1 } param { lr_mult: 2 } convolution_param { num_output: 512 pad: 1 kernel_size: 3 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rpn_rois' # top: 'rpn_scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #layer { # name: 'debug-data' # type: 'Python' # bottom: 'data' # bottom: 'rpn_rois' # bottom: 'rpn_scores' # python_param { # module: 'rpn.debug_layer' # layer: 'RPNDebugLayer' # } #} layer { name: 'roi-data' type: 'Python' bottom: 'rpn_rois' bottom: 'gt_boxes' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'rpn.proposal_target_layer' layer: 'ProposalTargetLayer' param_str: "'num_classes': 21" } } #========= RCNN ============ layer { name: "roi_pool5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "pool5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "pool5" top: "fc6" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 1024 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1 } param { lr_mult: 2 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "loss_cls" loss_weight: 1 } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "loss_bbox" loss_weight: 1 } ================================================ FILE: models/pascal_voc/ZF/fast_rcnn/solver.prototxt ================================================ train_net: "models/pascal_voc/ZF/fast_rcnn/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_fast_rcnn" #debug_info: true #iter_size: 2 ================================================ FILE: models/pascal_voc/ZF/fast_rcnn/test.prototxt ================================================ name: "ZF" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "rois" input_shape { dim: 1 # to be changed on-the-fly to num ROIs dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" inner_product_param { num_output: 21 } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" inner_product_param { num_output: 84 } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" loss_param { ignore_label: -1 normalize: true } } ================================================ FILE: models/pascal_voc/ZF/fast_rcnn/train.prototxt ================================================ name: "ZF" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "bbox_loss" loss_weight: 1 } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt ================================================ name: "ZF" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" inner_product_param { num_output: 21 } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" inner_product_param { num_output: 84 } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" loss_param { ignore_label: -1 normalize: true } } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/rpn_test.pt ================================================ name: "ZF" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } # ------------------------ layer 1 ----------------------------- layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #-----------------------layer +------------------------- layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #-----------------------output------------------------ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' top: 'scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_fast_rcnn" ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt ================================================ name: "ZF" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "bbox_loss" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_rpn" ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt ================================================ name: "ZF" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: "rpn_bbox_inside_weights" bottom: "rpn_bbox_outside_weights" top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 9216 } data_filler { type: "gaussian" std: 0.01 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt ================================================ train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 30000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_fast_rcnn" ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt ================================================ name: "ZF" layer { name: 'data' type: 'Python' top: 'data' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: "bbox_inside_weights" bottom: "bbox_outside_weights" top: "bbox_loss" loss_weight: 1 } #========= RPN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "silence_rpn_cls_score" type: "Silence" bottom: "rpn_cls_score" } layer { name: "silence_rpn_bbox_pred" type: "Silence" bottom: "rpn_bbox_pred" } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt ================================================ train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_train.pt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 60000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_rpn" ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_train.pt ================================================ name: "ZF" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv1" type: "Convolution" bottom: "conv5" top: "rpn_conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" } layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn_conv1" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn_conv1" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: "rpn_bbox_inside_weights" bottom: "rpn_bbox_outside_weights" top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RCNN ============ # Dummy layers so that initial parameters are saved into the output net layer { name: "dummy_roi_pool_conv5" type: "DummyData" top: "dummy_roi_pool_conv5" dummy_data_param { shape { dim: 1 dim: 9216 } data_filler { type: "gaussian" std: 0.01 } } } layer { name: "fc6" type: "InnerProduct" bottom: "dummy_roi_pool_conv5" top: "fc6" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 0 decay_mult: 0 } param { lr_mult: 0 decay_mult: 0 } inner_product_param { num_output: 4096 } } layer { name: "silence_fc7" type: "Silence" bottom: "fc7" } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_end2end/solver.prototxt ================================================ train_net: "models/pascal_voc/ZF/faster_rcnn_end2end/train.prototxt" base_lr: 0.001 lr_policy: "step" gamma: 0.1 stepsize: 50000 display: 20 average_loss: 100 momentum: 0.9 weight_decay: 0.0005 #base_lr: 0.001 #lr_policy: "exp" #gamma: 0.999539589 # (0.00001/0.001)^(1/10000) #display: 1 #average_loss: 100 #momentum: 0.9 #weight_decay: 0.0005 # We disable standard caffe solver snapshotting and implement our own snapshot # function snapshot: 0 # We still use the snapshot prefix, though snapshot_prefix: "zf_faster_rcnn" iter_size: 2 ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_end2end/test.prototxt ================================================ name: "ZF" input: "data" input_shape { dim: 1 dim: 3 dim: 224 dim: 224 } input: "im_info" input_shape { dim: 1 dim: 3 } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } #layer { # name: "rpn_conv/3x3" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/3x3" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 192 # kernel_size: 3 pad: 1 stride: 1 # weight_filler { type: "gaussian" std: 0.01 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn_conv/5x5" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/5x5" # param { lr_mult: 1.0 decay_mult: 1.0 } # param { lr_mult: 2.0 decay_mult: 0 } # convolution_param { # num_output: 64 # kernel_size: 5 pad: 2 stride: 1 # weight_filler { type: "gaussian" std: 0.0036 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn/output" # type: "Concat" # bottom: "rpn_conv/3x3" # bottom: "rpn_conv/5x5" # top: "rpn/output" #} #layer { # name: "rpn_relu/output" # type: "ReLU" # bottom: "rpn/output" # top: "rpn/output" #} layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rois' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" inner_product_param { num_output: 21 } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" inner_product_param { num_output: 84 } } layer { name: "cls_prob" type: "Softmax" bottom: "cls_score" top: "cls_prob" loss_param { ignore_label: -1 normalize: true } } ================================================ FILE: models/pascal_voc/ZF/faster_rcnn_end2end/train.prototxt ================================================ name: "ZF" layer { name: 'input-data' type: 'Python' top: 'data' top: 'im_info' top: 'gt_boxes' python_param { module: 'roi_data_layer.layer' layer: 'RoIDataLayer' param_str: "'num_classes': 21" } } #========= conv1-conv5 ============ layer { name: "conv1" type: "Convolution" bottom: "data" top: "conv1" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 } } layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" } layer { name: "norm1" type: "LRN" bottom: "conv1" top: "norm1" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool1" type: "Pooling" bottom: "norm1" top: "pool1" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv2" type: "Convolution" bottom: "pool1" top: "conv2" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 } } layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" } layer { name: "norm2" type: "LRN" bottom: "conv2" top: "norm2" lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE } } layer { name: "pool2" type: "Pooling" bottom: "norm2" top: "pool2" pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX } } layer { name: "conv3" type: "Convolution" bottom: "pool2" top: "conv3" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" } layer { name: "conv4" type: "Convolution" bottom: "conv3" top: "conv4" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" } layer { name: "conv5" type: "Convolution" bottom: "conv4" top: "conv5" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 } } layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" } #========= RPN ============ layer { name: "rpn_conv/3x3" type: "Convolution" bottom: "conv5" top: "rpn/output" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_relu/3x3" type: "ReLU" bottom: "rpn/output" top: "rpn/output" } #layer { # name: "rpn_conv/3x3" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/3x3" # param { lr_mult: 1.0 } # param { lr_mult: 2.0 } # convolution_param { # num_output: 192 # kernel_size: 3 pad: 1 stride: 1 # weight_filler { type: "gaussian" std: 0.01 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn_conv/5x5" # type: "Convolution" # bottom: "conv5" # top: "rpn_conv/5x5" # param { lr_mult: 1.0 } # param { lr_mult: 2.0 } # convolution_param { # num_output: 64 # kernel_size: 5 pad: 2 stride: 1 # weight_filler { type: "gaussian" std: 0.0036 } # bias_filler { type: "constant" value: 0 } # } #} #layer { # name: "rpn/output" # type: "Concat" # bottom: "rpn_conv/3x3" # bottom: "rpn_conv/5x5" # top: "rpn/output" #} #layer { # name: "rpn_relu/output" # type: "ReLU" # bottom: "rpn/output" # top: "rpn/output" #} layer { name: "rpn_cls_score" type: "Convolution" bottom: "rpn/output" top: "rpn_cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 18 # 2(bg/fg) * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "rpn_bbox_pred" type: "Convolution" bottom: "rpn/output" top: "rpn_bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } convolution_param { num_output: 36 # 4 * 9(anchors) kernel_size: 1 pad: 0 stride: 1 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { bottom: "rpn_cls_score" top: "rpn_cls_score_reshape" name: "rpn_cls_score_reshape" type: "Reshape" reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } } layer { name: 'rpn-data' type: 'Python' bottom: 'rpn_cls_score' bottom: 'gt_boxes' bottom: 'im_info' bottom: 'data' top: 'rpn_labels' top: 'rpn_bbox_targets' top: 'rpn_bbox_inside_weights' top: 'rpn_bbox_outside_weights' python_param { module: 'rpn.anchor_target_layer' layer: 'AnchorTargetLayer' param_str: "'feat_stride': 16" } } layer { name: "rpn_loss_cls" type: "SoftmaxWithLoss" bottom: "rpn_cls_score_reshape" bottom: "rpn_labels" propagate_down: 1 propagate_down: 0 top: "rpn_cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "rpn_loss_bbox" type: "SmoothL1Loss" bottom: "rpn_bbox_pred" bottom: "rpn_bbox_targets" bottom: 'rpn_bbox_inside_weights' bottom: 'rpn_bbox_outside_weights' top: "rpn_loss_bbox" loss_weight: 1 smooth_l1_loss_param { sigma: 3.0 } } #========= RoI Proposal ============ layer { name: "rpn_cls_prob" type: "Softmax" bottom: "rpn_cls_score_reshape" top: "rpn_cls_prob" } layer { name: 'rpn_cls_prob_reshape' type: 'Reshape' bottom: 'rpn_cls_prob' top: 'rpn_cls_prob_reshape' reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } } layer { name: 'proposal' type: 'Python' bottom: 'rpn_cls_prob_reshape' bottom: 'rpn_bbox_pred' bottom: 'im_info' top: 'rpn_rois' # top: 'rpn_scores' python_param { module: 'rpn.proposal_layer' layer: 'ProposalLayer' param_str: "'feat_stride': 16" } } #layer { # name: 'debug-data' # type: 'Python' # bottom: 'data' # bottom: 'rpn_rois' # bottom: 'rpn_scores' # python_param { # module: 'rpn.debug_layer' # layer: 'RPNDebugLayer' # } #} layer { name: 'roi-data' type: 'Python' bottom: 'rpn_rois' bottom: 'gt_boxes' top: 'rois' top: 'labels' top: 'bbox_targets' top: 'bbox_inside_weights' top: 'bbox_outside_weights' python_param { module: 'rpn.proposal_target_layer' layer: 'ProposalTargetLayer' param_str: "'num_classes': 21" } } #========= RCNN ============ layer { name: "roi_pool_conv5" type: "ROIPooling" bottom: "conv5" bottom: "rois" top: "roi_pool_conv5" roi_pooling_param { pooled_w: 6 pooled_h: 6 spatial_scale: 0.0625 # 1/16 } } layer { name: "fc6" type: "InnerProduct" bottom: "roi_pool_conv5" top: "fc6" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" } layer { name: "drop6" type: "Dropout" bottom: "fc6" top: "fc6" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "fc7" type: "InnerProduct" bottom: "fc6" top: "fc7" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 4096 } } layer { name: "relu7" type: "ReLU" bottom: "fc7" top: "fc7" } layer { name: "drop7" type: "Dropout" bottom: "fc7" top: "fc7" dropout_param { dropout_ratio: 0.5 scale_train: false } } layer { name: "cls_score" type: "InnerProduct" bottom: "fc7" top: "cls_score" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 21 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } layer { name: "bbox_pred" type: "InnerProduct" bottom: "fc7" top: "bbox_pred" param { lr_mult: 1.0 } param { lr_mult: 2.0 } inner_product_param { num_output: 84 weight_filler { type: "gaussian" std: 0.001 } bias_filler { type: "constant" value: 0 } } } layer { name: "loss_cls" type: "SoftmaxWithLoss" bottom: "cls_score" bottom: "labels" propagate_down: 1 propagate_down: 0 top: "cls_loss" loss_weight: 1 loss_param { ignore_label: -1 normalize: true } } layer { name: "loss_bbox" type: "SmoothL1Loss" bottom: "bbox_pred" bottom: "bbox_targets" bottom: 'bbox_inside_weights' bottom: 'bbox_outside_weights' top: "bbox_loss" loss_weight: 1 } ================================================ FILE: tools/README.md ================================================ Tools for training, testing, and compressing Fast R-CNN networks. ================================================ FILE: tools/_init_paths.py ================================================ # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Set up paths for Fast R-CNN.""" import os.path as osp import sys def add_path(path): if path not in sys.path: sys.path.insert(0, path) this_dir = osp.dirname(__file__) # Add caffe to PYTHONPATH caffe_path = osp.join(this_dir, '..', 'caffe-fast-rcnn', 'python') add_path(caffe_path) # Add lib to PYTHONPATH lib_path = osp.join(this_dir, '..', 'lib') add_path(lib_path) ================================================ FILE: tools/compress_net.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Compress a Fast R-CNN network using truncated SVD.""" import _init_paths import caffe import argparse import numpy as np import os, sys def parse_args(): """Parse input arguments.""" parser = argparse.ArgumentParser(description='Compress a Fast R-CNN network') parser.add_argument('--def', dest='prototxt', help='prototxt file defining the uncompressed network', default=None, type=str) parser.add_argument('--def-svd', dest='prototxt_svd', help='prototxt file defining the SVD compressed network', default=None, type=str) parser.add_argument('--net', dest='caffemodel', help='model to compress', default=None, type=str) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args def compress_weights(W, l): """Compress the weight matrix W of an inner product (fully connected) layer using truncated SVD. Parameters: W: N x M weights matrix l: number of singular values to retain Returns: Ul, L: matrices such that W \approx Ul*L """ # numpy doesn't seem to have a fast truncated SVD algorithm... # this could be faster U, s, V = np.linalg.svd(W, full_matrices=False) Ul = U[:, :l] sl = s[:l] Vl = V[:l, :] L = np.dot(np.diag(sl), Vl) return Ul, L def main(): args = parse_args() # prototxt = 'models/VGG16/test.prototxt' # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) # prototxt_svd = 'models/VGG16/svd/test_fc6_fc7.prototxt' # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' net_svd = caffe.Net(args.prototxt_svd, args.caffemodel, caffe.TEST) print('Uncompressed network {} : {}'.format(args.prototxt, args.caffemodel)) print('Compressed network prototxt {}'.format(args.prototxt_svd)) out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svd' out_dir = os.path.dirname(args.caffemodel) # Compress fc6 if net_svd.params.has_key('fc6_L'): l_fc6 = net_svd.params['fc6_L'][0].data.shape[0] print(' fc6_L bottleneck size: {}'.format(l_fc6)) # uncompressed weights and biases W_fc6 = net.params['fc6'][0].data B_fc6 = net.params['fc6'][1].data print(' compressing fc6...') Ul_fc6, L_fc6 = compress_weights(W_fc6, l_fc6) assert(len(net_svd.params['fc6_L']) == 1) # install compressed matrix factors (and original biases) net_svd.params['fc6_L'][0].data[...] = L_fc6 net_svd.params['fc6_U'][0].data[...] = Ul_fc6 net_svd.params['fc6_U'][1].data[...] = B_fc6 out += '_fc6_{}'.format(l_fc6) # Compress fc7 if net_svd.params.has_key('fc7_L'): l_fc7 = net_svd.params['fc7_L'][0].data.shape[0] print ' fc7_L bottleneck size: {}'.format(l_fc7) W_fc7 = net.params['fc7'][0].data B_fc7 = net.params['fc7'][1].data print(' compressing fc7...') Ul_fc7, L_fc7 = compress_weights(W_fc7, l_fc7) assert(len(net_svd.params['fc7_L']) == 1) net_svd.params['fc7_L'][0].data[...] = L_fc7 net_svd.params['fc7_U'][0].data[...] = Ul_fc7 net_svd.params['fc7_U'][1].data[...] = B_fc7 out += '_fc7_{}'.format(l_fc7) filename = '{}/{}.caffemodel'.format(out_dir, out) net_svd.save(filename) print 'Wrote svd model to: {:s}'.format(filename) if __name__ == '__main__': main() ================================================ FILE: tools/demo.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """ Demo script showing detections in sample images. See README.md for installation instructions before running. """ import _init_paths from fast_rcnn.config import cfg from fast_rcnn.test import im_detect from fast_rcnn.nms_wrapper import nms from utils.timer import Timer import matplotlib.pyplot as plt import numpy as np import scipy.io as sio import caffe, os, sys, cv2 import argparse CLASSES = ('__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') NETS = {'vgg16': ('VGG16', 'VGG16_faster_rcnn_final.caffemodel'), 'zf': ('ZF', 'ZF_faster_rcnn_final.caffemodel')} def vis_detections(im, class_name, dets, thresh=0.5): """Draw detected bounding boxes.""" inds = np.where(dets[:, -1] >= thresh)[0] if len(inds) == 0: return im = im[:, :, (2, 1, 0)] fig, ax = plt.subplots(figsize=(12, 12)) ax.imshow(im, aspect='equal') for i in inds: bbox = dets[i, :4] score = dets[i, -1] ax.add_patch( plt.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], fill=False, edgecolor='red', linewidth=3.5) ) ax.text(bbox[0], bbox[1] - 2, '{:s} {:.3f}'.format(class_name, score), bbox=dict(facecolor='blue', alpha=0.5), fontsize=14, color='white') ax.set_title(('{} detections with ' 'p({} | box) >= {:.1f}').format(class_name, class_name, thresh), fontsize=14) plt.axis('off') plt.tight_layout() plt.draw() def demo(net, image_name): """Detect object classes in an image using pre-computed object proposals.""" # Load the demo image im_file = os.path.join(cfg.DATA_DIR, 'demo', image_name) im = cv2.imread(im_file) # Detect all object classes and regress object bounds timer = Timer() timer.tic() scores, boxes = im_detect(net, im) timer.toc() print ('Detection took {:.3f}s for ' '{:d} object proposals').format(timer.total_time, boxes.shape[0]) # Visualize detections for each class CONF_THRESH = 0.8 NMS_THRESH = 0.3 for cls_ind, cls in enumerate(CLASSES[1:]): cls_ind += 1 # because we skipped background cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)] cls_scores = scores[:, cls_ind] dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32) keep = nms(dets, NMS_THRESH) dets = dets[keep, :] vis_detections(im, cls, dets, thresh=CONF_THRESH) def parse_args(): """Parse input arguments.""" parser = argparse.ArgumentParser(description='Faster R-CNN demo') parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', default=0, type=int) parser.add_argument('--cpu', dest='cpu_mode', help='Use CPU mode (overrides --gpu)', action='store_true') parser.add_argument('--net', dest='demo_net', help='Network to use [vgg16]', choices=NETS.keys(), default='vgg16') args = parser.parse_args() return args if __name__ == '__main__': cfg.TEST.HAS_RPN = True # Use RPN for proposals args = parse_args() prototxt = os.path.join(cfg.MODELS_DIR, NETS[args.demo_net][0], 'faster_rcnn_alt_opt', 'faster_rcnn_test.pt') caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models', NETS[args.demo_net][1]) if not os.path.isfile(caffemodel): raise IOError(('{:s} not found.\nDid you run ./data/script/' 'fetch_faster_rcnn_models.sh?').format(caffemodel)) if args.cpu_mode: caffe.set_mode_cpu() else: caffe.set_mode_gpu() caffe.set_device(args.gpu_id) cfg.GPU_ID = args.gpu_id net = caffe.Net(prototxt, caffemodel, caffe.TEST) print '\n\nLoaded network {:s}'.format(caffemodel) # Warmup on a dummy image im = 128 * np.ones((300, 500, 3), dtype=np.uint8) for i in xrange(2): _, _= im_detect(net, im) im_names = ['000456.jpg', '000542.jpg', '001150.jpg', '001763.jpg', '004545.jpg'] for im_name in im_names: print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Demo for data/demo/{}'.format(im_name) demo(net, im_name) plt.show() ================================================ FILE: tools/eval_recall.py ================================================ #!/usr/bin/env python import _init_paths from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list from datasets.factory import get_imdb import argparse import time, os, sys import numpy as np def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') parser.add_argument('--imdb', dest='imdb_name', help='dataset to test', default='voc_2007_test', type=str) parser.add_argument('--method', dest='method', help='proposal method', default='selective_search', type=str) parser.add_argument('--rpn-file', dest='rpn_file', default=None, type=str) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() print('Called with args:') print(args) imdb = get_imdb(args.imdb_name) imdb.set_proposal_method(args.method) if args.rpn_file is not None: imdb.config['rpn_file'] = args.rpn_file candidate_boxes = None if 0: import scipy.io as sio filename = 'debug/stage1_rpn_voc_2007_test.mat' raw_data = sio.loadmat(filename)['aboxes'].ravel() candidate_boxes = raw_data ar, gt_overlaps, recalls, thresholds = \ imdb.evaluate_recall(candidate_boxes=candidate_boxes) print 'Method: {}'.format(args.method) print 'AverageRec: {:.3f}'.format(ar) def recall_at(t): ind = np.where(thresholds > t - 1e-5)[0][0] assert np.isclose(thresholds[ind], t) return recalls[ind] print 'Recall@0.5: {:.3f}'.format(recall_at(0.5)) print 'Recall@0.6: {:.3f}'.format(recall_at(0.6)) print 'Recall@0.7: {:.3f}'.format(recall_at(0.7)) print 'Recall@0.8: {:.3f}'.format(recall_at(0.8)) print 'Recall@0.9: {:.3f}'.format(recall_at(0.9)) # print again for easy spreadsheet copying print '{:.3f}'.format(ar) print '{:.3f}'.format(recall_at(0.5)) print '{:.3f}'.format(recall_at(0.6)) print '{:.3f}'.format(recall_at(0.7)) print '{:.3f}'.format(recall_at(0.8)) print '{:.3f}'.format(recall_at(0.9)) ================================================ FILE: tools/reval.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Reval = re-eval. Re-evaluate saved detections.""" import _init_paths from fast_rcnn.test import apply_nms from fast_rcnn.config import cfg from datasets.factory import get_imdb import cPickle import os, sys, argparse import numpy as np def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Re-evaluate results') parser.add_argument('output_dir', nargs=1, help='results directory', type=str) parser.add_argument('--imdb', dest='imdb_name', help='dataset to re-evaluate', default='voc_2007_test', type=str) parser.add_argument('--matlab', dest='matlab_eval', help='use matlab for evaluation', action='store_true') parser.add_argument('--comp', dest='comp_mode', help='competition mode', action='store_true') parser.add_argument('--nms', dest='apply_nms', help='apply nms', action='store_true') if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args def from_dets(imdb_name, output_dir, args): imdb = get_imdb(imdb_name) imdb.competition_mode(args.comp_mode) imdb.config['matlab_eval'] = args.matlab_eval with open(os.path.join(output_dir, 'detections.pkl'), 'rb') as f: dets = cPickle.load(f) if args.apply_nms: print 'Applying NMS to all detections' nms_dets = apply_nms(dets, cfg.TEST.NMS) else: nms_dets = dets print 'Evaluating detections' imdb.evaluate_detections(nms_dets, output_dir) if __name__ == '__main__': args = parse_args() output_dir = os.path.abspath(args.output_dir[0]) imdb_name = args.imdb_name from_dets(imdb_name, output_dir, args) ================================================ FILE: tools/rpn_generate.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast/er/ R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Generate RPN proposals.""" import _init_paths import numpy as np from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir from datasets.factory import get_imdb from rpn.generate import imdb_proposals import cPickle import caffe import argparse import pprint import time, os, sys def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', default=0, type=int) parser.add_argument('--def', dest='prototxt', help='prototxt file defining the network', default=None, type=str) parser.add_argument('--net', dest='caffemodel', help='model to test', default=None, type=str) parser.add_argument('--cfg', dest='cfg_file', help='optional config file', default=None, type=str) parser.add_argument('--wait', dest='wait', help='wait until net file exists', default=True, type=bool) parser.add_argument('--imdb', dest='imdb_name', help='dataset to test', default='voc_2007_test', type=str) parser.add_argument('--set', dest='set_cfgs', help='set config keys', default=None, nargs=argparse.REMAINDER) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() print('Called with args:') print(args) if args.cfg_file is not None: cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.GPU_ID = args.gpu_id # RPN test settings cfg.TEST.RPN_PRE_NMS_TOP_N = -1 cfg.TEST.RPN_POST_NMS_TOP_N = 2000 print('Using config:') pprint.pprint(cfg) while not os.path.exists(args.caffemodel) and args.wait: print('Waiting for {} to exist...'.format(args.caffemodel)) time.sleep(10) caffe.set_mode_gpu() caffe.set_device(args.gpu_id) net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] imdb = get_imdb(args.imdb_name) imdb_boxes = imdb_proposals(net, imdb) output_dir = get_output_dir(imdb, net) rpn_file = os.path.join(output_dir, net.name + '_rpn_proposals.pkl') with open(rpn_file, 'wb') as f: cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL) print 'Wrote RPN proposals to {}'.format(rpn_file) ================================================ FILE: tools/test_net.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Test a Fast R-CNN network on an image database.""" import _init_paths from fast_rcnn.test import test_net from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list from datasets.factory import get_imdb import caffe import argparse import pprint import time, os, sys def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', default=0, type=int) parser.add_argument('--def', dest='prototxt', help='prototxt file defining the network', default=None, type=str) parser.add_argument('--net', dest='caffemodel', help='model to test', default=None, type=str) parser.add_argument('--cfg', dest='cfg_file', help='optional config file', default=None, type=str) parser.add_argument('--wait', dest='wait', help='wait until net file exists', default=True, type=bool) parser.add_argument('--imdb', dest='imdb_name', help='dataset to test', default='voc_2007_test', type=str) parser.add_argument('--comp', dest='comp_mode', help='competition mode', action='store_true') parser.add_argument('--set', dest='set_cfgs', help='set config keys', default=None, nargs=argparse.REMAINDER) parser.add_argument('--vis', dest='vis', help='visualize detections', action='store_true') parser.add_argument('--num_dets', dest='max_per_image', help='max number of detections per image', default=100, type=int) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() print('Called with args:') print(args) if args.cfg_file is not None: cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.GPU_ID = args.gpu_id print('Using config:') pprint.pprint(cfg) while not os.path.exists(args.caffemodel) and args.wait: print('Waiting for {} to exist...'.format(args.caffemodel)) time.sleep(10) caffe.set_mode_gpu() caffe.set_device(args.gpu_id) net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] imdb = get_imdb(args.imdb_name) imdb.competition_mode(args.comp_mode) if not cfg.TEST.HAS_RPN: imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD) test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis) ================================================ FILE: tools/train_faster_rcnn_alt_opt.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Faster R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Train a Faster R-CNN network using alternating optimization. This tool implements the alternating optimization algorithm described in our NIPS 2015 paper ("Faster R-CNN: Towards Real-time Object Detection with Region Proposal Networks." Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.) """ import _init_paths from fast_rcnn.train import get_training_roidb, train_net from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir from datasets.factory import get_imdb from rpn.generate import imdb_proposals import argparse import pprint import numpy as np import sys, os import multiprocessing as mp import cPickle import shutil def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Train a Faster R-CNN network') parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', default=0, type=int) parser.add_argument('--net_name', dest='net_name', help='network name (e.g., "ZF")', default=None, type=str) parser.add_argument('--weights', dest='pretrained_model', help='initialize with pretrained model weights', default=None, type=str) parser.add_argument('--cfg', dest='cfg_file', help='optional config file', default=None, type=str) parser.add_argument('--imdb', dest='imdb_name', help='dataset to train on', default='voc_2007_trainval', type=str) parser.add_argument('--set', dest='set_cfgs', help='set config keys', default=None, nargs=argparse.REMAINDER) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args def get_roidb(imdb_name, rpn_file=None): imdb = get_imdb(imdb_name) print 'Loaded dataset `{:s}` for training'.format(imdb.name) imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) if rpn_file is not None: imdb.config['rpn_file'] = rpn_file roidb = get_training_roidb(imdb) return roidb, imdb def get_solvers(net_name): # Faster R-CNN Alternating Optimization n = 'faster_rcnn_alt_opt' # Solver for each training stage solvers = [[net_name, n, 'stage1_rpn_solver60k80k.pt'], [net_name, n, 'stage1_fast_rcnn_solver30k40k.pt'], [net_name, n, 'stage2_rpn_solver60k80k.pt'], [net_name, n, 'stage2_fast_rcnn_solver30k40k.pt']] solvers = [os.path.join(cfg.MODELS_DIR, *s) for s in solvers] # Iterations for each training stage max_iters = [80000, 40000, 80000, 40000] # max_iters = [100, 100, 100, 100] # Test prototxt for the RPN rpn_test_prototxt = os.path.join( cfg.MODELS_DIR, net_name, n, 'rpn_test.pt') return solvers, max_iters, rpn_test_prototxt # ------------------------------------------------------------------------------ # Pycaffe doesn't reliably free GPU memory when instantiated nets are discarded # (e.g. "del net" in Python code). To work around this issue, each training # stage is executed in a separate process using multiprocessing.Process. # ------------------------------------------------------------------------------ def _init_caffe(cfg): """Initialize pycaffe in a training process. """ import caffe # fix the random seeds (numpy and caffe) for reproducibility np.random.seed(cfg.RNG_SEED) caffe.set_random_seed(cfg.RNG_SEED) # set up caffe caffe.set_mode_gpu() caffe.set_device(cfg.GPU_ID) def train_rpn(queue=None, imdb_name=None, init_model=None, solver=None, max_iters=None, cfg=None): """Train a Region Proposal Network in a separate training process. """ # Not using any proposals, just ground-truth boxes cfg.TRAIN.HAS_RPN = True cfg.TRAIN.BBOX_REG = False # applies only to Fast R-CNN bbox regression cfg.TRAIN.PROPOSAL_METHOD = 'gt' cfg.TRAIN.IMS_PER_BATCH = 1 print 'Init model: {}'.format(init_model) print('Using config:') pprint.pprint(cfg) import caffe _init_caffe(cfg) roidb, imdb = get_roidb(imdb_name) print 'roidb len: {}'.format(len(roidb)) output_dir = get_output_dir(imdb) print 'Output will be saved to `{:s}`'.format(output_dir) model_paths = train_net(solver, roidb, output_dir, pretrained_model=init_model, max_iters=max_iters) # Cleanup all but the final model for i in model_paths[:-1]: os.remove(i) rpn_model_path = model_paths[-1] # Send final model path through the multiprocessing queue queue.put({'model_path': rpn_model_path}) def rpn_generate(queue=None, imdb_name=None, rpn_model_path=None, cfg=None, rpn_test_prototxt=None): """Use a trained RPN to generate proposals. """ cfg.TEST.RPN_PRE_NMS_TOP_N = -1 # no pre NMS filtering cfg.TEST.RPN_POST_NMS_TOP_N = 2000 # limit top boxes after NMS print 'RPN model: {}'.format(rpn_model_path) print('Using config:') pprint.pprint(cfg) import caffe _init_caffe(cfg) # NOTE: the matlab implementation computes proposals on flipped images, too. # We compute them on the image once and then flip the already computed # proposals. This might cause a minor loss in mAP (less proposal jittering). imdb = get_imdb(imdb_name) print 'Loaded dataset `{:s}` for proposal generation'.format(imdb.name) # Load RPN and configure output directory rpn_net = caffe.Net(rpn_test_prototxt, rpn_model_path, caffe.TEST) output_dir = get_output_dir(imdb) print 'Output will be saved to `{:s}`'.format(output_dir) # Generate proposals on the imdb rpn_proposals = imdb_proposals(rpn_net, imdb) # Write proposals to disk and send the proposal file path through the # multiprocessing queue rpn_net_name = os.path.splitext(os.path.basename(rpn_model_path))[0] rpn_proposals_path = os.path.join( output_dir, rpn_net_name + '_proposals.pkl') with open(rpn_proposals_path, 'wb') as f: cPickle.dump(rpn_proposals, f, cPickle.HIGHEST_PROTOCOL) print 'Wrote RPN proposals to {}'.format(rpn_proposals_path) queue.put({'proposal_path': rpn_proposals_path}) def train_fast_rcnn(queue=None, imdb_name=None, init_model=None, solver=None, max_iters=None, cfg=None, rpn_file=None): """Train a Fast R-CNN using proposals generated by an RPN. """ cfg.TRAIN.HAS_RPN = False # not generating prosals on-the-fly cfg.TRAIN.PROPOSAL_METHOD = 'rpn' # use pre-computed RPN proposals instead cfg.TRAIN.IMS_PER_BATCH = 2 print 'Init model: {}'.format(init_model) print 'RPN proposals: {}'.format(rpn_file) print('Using config:') pprint.pprint(cfg) import caffe _init_caffe(cfg) roidb, imdb = get_roidb(imdb_name, rpn_file=rpn_file) output_dir = get_output_dir(imdb) print 'Output will be saved to `{:s}`'.format(output_dir) # Train Fast R-CNN model_paths = train_net(solver, roidb, output_dir, pretrained_model=init_model, max_iters=max_iters) # Cleanup all but the final model for i in model_paths[:-1]: os.remove(i) fast_rcnn_model_path = model_paths[-1] # Send Fast R-CNN model path over the multiprocessing queue queue.put({'model_path': fast_rcnn_model_path}) if __name__ == '__main__': args = parse_args() print('Called with args:') print(args) if args.cfg_file is not None: cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.GPU_ID = args.gpu_id # -------------------------------------------------------------------------- # Pycaffe doesn't reliably free GPU memory when instantiated nets are # discarded (e.g. "del net" in Python code). To work around this issue, each # training stage is executed in a separate process using # multiprocessing.Process. # -------------------------------------------------------------------------- # queue for communicated results between processes mp_queue = mp.Queue() # solves, iters, etc. for each training stage solvers, max_iters, rpn_test_prototxt = get_solvers(args.net_name) print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 1 RPN, init from ImageNet model' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' cfg.TRAIN.SNAPSHOT_INFIX = 'stage1' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, init_model=args.pretrained_model, solver=solvers[0], max_iters=max_iters[0], cfg=cfg) p = mp.Process(target=train_rpn, kwargs=mp_kwargs) p.start() rpn_stage1_out = mp_queue.get() p.join() print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 1 RPN, generate proposals' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, rpn_model_path=str(rpn_stage1_out['model_path']), cfg=cfg, rpn_test_prototxt=rpn_test_prototxt) p = mp.Process(target=rpn_generate, kwargs=mp_kwargs) p.start() rpn_stage1_out['proposal_path'] = mp_queue.get()['proposal_path'] p.join() print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 1 Fast R-CNN using RPN proposals, init from ImageNet model' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' cfg.TRAIN.SNAPSHOT_INFIX = 'stage1' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, init_model=args.pretrained_model, solver=solvers[1], max_iters=max_iters[1], cfg=cfg, rpn_file=rpn_stage1_out['proposal_path']) p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs) p.start() fast_rcnn_stage1_out = mp_queue.get() p.join() print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 2 RPN, init from stage 1 Fast R-CNN model' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' cfg.TRAIN.SNAPSHOT_INFIX = 'stage2' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, init_model=str(fast_rcnn_stage1_out['model_path']), solver=solvers[2], max_iters=max_iters[2], cfg=cfg) p = mp.Process(target=train_rpn, kwargs=mp_kwargs) p.start() rpn_stage2_out = mp_queue.get() p.join() print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 2 RPN, generate proposals' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, rpn_model_path=str(rpn_stage2_out['model_path']), cfg=cfg, rpn_test_prototxt=rpn_test_prototxt) p = mp.Process(target=rpn_generate, kwargs=mp_kwargs) p.start() rpn_stage2_out['proposal_path'] = mp_queue.get()['proposal_path'] p.join() print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' print 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model' print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' cfg.TRAIN.SNAPSHOT_INFIX = 'stage2' mp_kwargs = dict( queue=mp_queue, imdb_name=args.imdb_name, init_model=str(rpn_stage2_out['model_path']), solver=solvers[3], max_iters=max_iters[3], cfg=cfg, rpn_file=rpn_stage2_out['proposal_path']) p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs) p.start() fast_rcnn_stage2_out = mp_queue.get() p.join() # Create final model (just a copy of the last stage) final_path = os.path.join( os.path.dirname(fast_rcnn_stage2_out['model_path']), args.net_name + '_faster_rcnn_final.caffemodel') print 'cp {} -> {}'.format( fast_rcnn_stage2_out['model_path'], final_path) shutil.copy(fast_rcnn_stage2_out['model_path'], final_path) print 'Final model: {}'.format(final_path) ================================================ FILE: tools/train_net.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """Train a Fast R-CNN network on a region of interest database.""" import _init_paths from fast_rcnn.train import get_training_roidb, train_net from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir from datasets.factory import get_imdb import datasets.imdb import caffe import argparse import pprint import numpy as np import sys def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', default=0, type=int) parser.add_argument('--solver', dest='solver', help='solver prototxt', default=None, type=str) parser.add_argument('--iters', dest='max_iters', help='number of iterations to train', default=40000, type=int) parser.add_argument('--weights', dest='pretrained_model', help='initialize with pretrained model weights', default=None, type=str) parser.add_argument('--cfg', dest='cfg_file', help='optional config file', default=None, type=str) parser.add_argument('--imdb', dest='imdb_name', help='dataset to train on', default='voc_2007_trainval', type=str) parser.add_argument('--rand', dest='randomize', help='randomize (do not use a fixed seed)', action='store_true') parser.add_argument('--set', dest='set_cfgs', help='set config keys', default=None, nargs=argparse.REMAINDER) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args def combined_roidb(imdb_names): def get_roidb(imdb_name): imdb = get_imdb(imdb_name) print 'Loaded dataset `{:s}` for training'.format(imdb.name) imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) roidb = get_training_roidb(imdb) return roidb roidbs = [get_roidb(s) for s in imdb_names.split('+')] roidb = roidbs[0] if len(roidbs) > 1: for r in roidbs[1:]: roidb.extend(r) imdb = datasets.imdb.imdb(imdb_names) else: imdb = get_imdb(imdb_names) return imdb, roidb if __name__ == '__main__': args = parse_args() print('Called with args:') print(args) if args.cfg_file is not None: cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.GPU_ID = args.gpu_id print('Using config:') pprint.pprint(cfg) if not args.randomize: # fix the random seeds (numpy and caffe) for reproducibility np.random.seed(cfg.RNG_SEED) caffe.set_random_seed(cfg.RNG_SEED) # set up caffe caffe.set_mode_gpu() caffe.set_device(args.gpu_id) imdb, roidb = combined_roidb(args.imdb_name) print '{:d} roidb entries'.format(len(roidb)) output_dir = get_output_dir(imdb) print 'Output will be saved to `{:s}`'.format(output_dir) train_net(args.solver, roidb, output_dir, pretrained_model=args.pretrained_model, max_iters=args.max_iters) ================================================ FILE: tools/train_svms.py ================================================ #!/usr/bin/env python # -------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ross Girshick # -------------------------------------------------------- """ Train post-hoc SVMs using the algorithm and hyper-parameters from traditional R-CNN. """ import _init_paths from fast_rcnn.config import cfg, cfg_from_file from datasets.factory import get_imdb from fast_rcnn.test import im_detect from utils.timer import Timer import caffe import argparse import pprint import numpy as np import numpy.random as npr import cv2 from sklearn import svm import os, sys class SVMTrainer(object): """ Trains post-hoc detection SVMs for all classes using the algorithm and hyper-parameters of traditional R-CNN. """ def __init__(self, net, imdb): self.imdb = imdb self.net = net self.layer = 'fc7' self.hard_thresh = -1.0001 self.neg_iou_thresh = 0.3 dim = net.params['cls_score'][0].data.shape[1] scale = self._get_feature_scale() print('Feature dim: {}'.format(dim)) print('Feature scale: {:.3f}'.format(scale)) self.trainers = [SVMClassTrainer(cls, dim, feature_scale=scale) for cls in imdb.classes] def _get_feature_scale(self, num_images=100): TARGET_NORM = 20.0 # Magic value from traditional R-CNN _t = Timer() roidb = self.imdb.roidb total_norm = 0.0 count = 0.0 inds = npr.choice(xrange(self.imdb.num_images), size=num_images, replace=False) for i_, i in enumerate(inds): im = cv2.imread(self.imdb.image_path_at(i)) if roidb[i]['flipped']: im = im[:, ::-1, :] _t.tic() scores, boxes = im_detect(self.net, im, roidb[i]['boxes']) _t.toc() feat = self.net.blobs[self.layer].data total_norm += np.sqrt((feat ** 2).sum(axis=1)).sum() count += feat.shape[0] print('{}/{}: avg feature norm: {:.3f}'.format(i_ + 1, num_images, total_norm / count)) return TARGET_NORM * 1.0 / (total_norm / count) def _get_pos_counts(self): counts = np.zeros((len(self.imdb.classes)), dtype=np.int) roidb = self.imdb.roidb for i in xrange(len(roidb)): for j in xrange(1, self.imdb.num_classes): I = np.where(roidb[i]['gt_classes'] == j)[0] counts[j] += len(I) for j in xrange(1, self.imdb.num_classes): print('class {:s} has {:d} positives'. format(self.imdb.classes[j], counts[j])) return counts def get_pos_examples(self): counts = self._get_pos_counts() for i in xrange(len(counts)): self.trainers[i].alloc_pos(counts[i]) _t = Timer() roidb = self.imdb.roidb num_images = len(roidb) # num_images = 100 for i in xrange(num_images): im = cv2.imread(self.imdb.image_path_at(i)) if roidb[i]['flipped']: im = im[:, ::-1, :] gt_inds = np.where(roidb[i]['gt_classes'] > 0)[0] gt_boxes = roidb[i]['boxes'][gt_inds] _t.tic() scores, boxes = im_detect(self.net, im, gt_boxes) _t.toc() feat = self.net.blobs[self.layer].data for j in xrange(1, self.imdb.num_classes): cls_inds = np.where(roidb[i]['gt_classes'][gt_inds] == j)[0] if len(cls_inds) > 0: cls_feat = feat[cls_inds, :] self.trainers[j].append_pos(cls_feat) print 'get_pos_examples: {:d}/{:d} {:.3f}s' \ .format(i + 1, len(roidb), _t.average_time) def initialize_net(self): # Start all SVM parameters at zero self.net.params['cls_score'][0].data[...] = 0 self.net.params['cls_score'][1].data[...] = 0 # Initialize SVMs in a smart way. Not doing this because its such # a good initialization that we might not learn something close to # the SVM solution. # # subtract background weights and biases for the foreground classes # w_bg = self.net.params['cls_score'][0].data[0, :] # b_bg = self.net.params['cls_score'][1].data[0] # self.net.params['cls_score'][0].data[1:, :] -= w_bg # self.net.params['cls_score'][1].data[1:] -= b_bg # # set the background weights and biases to 0 (where they shall remain) # self.net.params['cls_score'][0].data[0, :] = 0 # self.net.params['cls_score'][1].data[0] = 0 def update_net(self, cls_ind, w, b): self.net.params['cls_score'][0].data[cls_ind, :] = w self.net.params['cls_score'][1].data[cls_ind] = b def train_with_hard_negatives(self): _t = Timer() roidb = self.imdb.roidb num_images = len(roidb) # num_images = 100 for i in xrange(num_images): im = cv2.imread(self.imdb.image_path_at(i)) if roidb[i]['flipped']: im = im[:, ::-1, :] _t.tic() scores, boxes = im_detect(self.net, im, roidb[i]['boxes']) _t.toc() feat = self.net.blobs[self.layer].data for j in xrange(1, self.imdb.num_classes): hard_inds = \ np.where((scores[:, j] > self.hard_thresh) & (roidb[i]['gt_overlaps'][:, j].toarray().ravel() < self.neg_iou_thresh))[0] if len(hard_inds) > 0: hard_feat = feat[hard_inds, :].copy() new_w_b = \ self.trainers[j].append_neg_and_retrain(feat=hard_feat) if new_w_b is not None: self.update_net(j, new_w_b[0], new_w_b[1]) print(('train_with_hard_negatives: ' '{:d}/{:d} {:.3f}s').format(i + 1, len(roidb), _t.average_time)) def train(self): # Initialize SVMs using # a. w_i = fc8_w_i - fc8_w_0 # b. b_i = fc8_b_i - fc8_b_0 # c. Install SVMs into net self.initialize_net() # Pass over roidb to count num positives for each class # a. Pre-allocate arrays for positive feature vectors # Pass over roidb, computing features for positives only self.get_pos_examples() # Pass over roidb # a. Compute cls_score with forward pass # b. For each class # i. Select hard negatives # ii. Add them to cache # c. For each class # i. If SVM retrain criteria met, update SVM # ii. Install new SVM into net self.train_with_hard_negatives() # One final SVM retraining for each class # Install SVMs into net for j in xrange(1, self.imdb.num_classes): new_w_b = self.trainers[j].append_neg_and_retrain(force=True) self.update_net(j, new_w_b[0], new_w_b[1]) class SVMClassTrainer(object): """Manages post-hoc SVM training for a single object class.""" def __init__(self, cls, dim, feature_scale=1.0, C=0.001, B=10.0, pos_weight=2.0): self.pos = np.zeros((0, dim), dtype=np.float32) self.neg = np.zeros((0, dim), dtype=np.float32) self.B = B self.C = C self.cls = cls self.pos_weight = pos_weight self.dim = dim self.feature_scale = feature_scale self.svm = svm.LinearSVC(C=C, class_weight={1: 2, -1: 1}, intercept_scaling=B, verbose=1, penalty='l2', loss='l1', random_state=cfg.RNG_SEED, dual=True) self.pos_cur = 0 self.num_neg_added = 0 self.retrain_limit = 2000 self.evict_thresh = -1.1 self.loss_history = [] def alloc_pos(self, count): self.pos_cur = 0 self.pos = np.zeros((count, self.dim), dtype=np.float32) def append_pos(self, feat): num = feat.shape[0] self.pos[self.pos_cur:self.pos_cur + num, :] = feat self.pos_cur += num def train(self): print('>>> Updating {} detector <<<'.format(self.cls)) num_pos = self.pos.shape[0] num_neg = self.neg.shape[0] print('Cache holds {} pos examples and {} neg examples'. format(num_pos, num_neg)) X = np.vstack((self.pos, self.neg)) * self.feature_scale y = np.hstack((np.ones(num_pos), -np.ones(num_neg))) self.svm.fit(X, y) w = self.svm.coef_ b = self.svm.intercept_[0] scores = self.svm.decision_function(X) pos_scores = scores[:num_pos] neg_scores = scores[num_pos:] pos_loss = (self.C * self.pos_weight * np.maximum(0, 1 - pos_scores).sum()) neg_loss = self.C * np.maximum(0, 1 + neg_scores).sum() reg_loss = 0.5 * np.dot(w.ravel(), w.ravel()) + 0.5 * b ** 2 tot_loss = pos_loss + neg_loss + reg_loss self.loss_history.append((tot_loss, pos_loss, neg_loss, reg_loss)) for i, losses in enumerate(self.loss_history): print((' {:d}: obj val: {:.3f} = {:.3f} ' '(pos) + {:.3f} (neg) + {:.3f} (reg)').format(i, *losses)) # Sanity check scores_ret = ( X * 1.0 / self.feature_scale).dot(w.T * self.feature_scale) + b assert np.allclose(scores, scores_ret[:, 0], atol=1e-5), \ "Scores from returned model don't match decision function" return ((w * self.feature_scale, b), pos_scores, neg_scores) def append_neg_and_retrain(self, feat=None, force=False): if feat is not None: num = feat.shape[0] self.neg = np.vstack((self.neg, feat)) self.num_neg_added += num if self.num_neg_added > self.retrain_limit or force: self.num_neg_added = 0 new_w_b, pos_scores, neg_scores = self.train() # scores = np.dot(self.neg, new_w_b[0].T) + new_w_b[1] # easy_inds = np.where(neg_scores < self.evict_thresh)[0] not_easy_inds = np.where(neg_scores >= self.evict_thresh)[0] if len(not_easy_inds) > 0: self.neg = self.neg[not_easy_inds, :] # self.neg = np.delete(self.neg, easy_inds) print(' Pruning easy negatives') print(' Cache holds {} pos examples and {} neg examples'. format(self.pos.shape[0], self.neg.shape[0])) print(' {} pos support vectors'.format((pos_scores <= 1).sum())) print(' {} neg support vectors'.format((neg_scores >= -1).sum())) return new_w_b else: return None def parse_args(): """ Parse input arguments """ parser = argparse.ArgumentParser(description='Train SVMs (old skool)') parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', default=0, type=int) parser.add_argument('--def', dest='prototxt', help='prototxt file defining the network', default=None, type=str) parser.add_argument('--net', dest='caffemodel', help='model to test', default=None, type=str) parser.add_argument('--cfg', dest='cfg_file', help='optional config file', default=None, type=str) parser.add_argument('--imdb', dest='imdb_name', help='dataset to train on', default='voc_2007_trainval', type=str) if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() return args if __name__ == '__main__': # Must turn this off to prevent issues when digging into the net blobs to # pull out features (tricky!) cfg.DEDUP_BOXES = 0 # Must turn this on because we use the test im_detect() method to harvest # hard negatives cfg.TEST.SVM = True args = parse_args() print('Called with args:') print(args) if args.cfg_file is not None: cfg_from_file(args.cfg_file) print('Using config:') pprint.pprint(cfg) # fix the random seed for reproducibility np.random.seed(cfg.RNG_SEED) # set up caffe caffe.set_mode_gpu() if args.gpu_id is not None: caffe.set_device(args.gpu_id) net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svm' out_dir = os.path.dirname(args.caffemodel) imdb = get_imdb(args.imdb_name) print 'Loaded dataset `{:s}` for training'.format(imdb.name) # enhance roidb to contain flipped examples if cfg.TRAIN.USE_FLIPPED: print 'Appending horizontally-flipped training examples...' imdb.append_flipped_images() print 'done' SVMTrainer(net, imdb).train() filename = '{}/{}.caffemodel'.format(out_dir, out) net.save(filename) print 'Wrote svm model to: {:s}'.format(filename)