Repository: DeNA/Chainer_Mask_R-CNN
Branch: master
Commit: 315a5b098978
Files: 25
Total size: 113.7 KB

Directory structure:
gitextract_iecqmj1q/

├── .gitignore
├── LICENSE
├── README.md
├── README_JP.md
├── coco_dataset.py
├── demo.py
├── getcoco.sh
├── mask_rcnn.py
├── mask_rcnn_resnet.py
├── mask_rcnn_train_chain.py
├── mask_rcnn_train_chain_batch.py
├── train.py
└── utils/
    ├── __init__.py
    ├── bn_utils.py
    ├── box_utils.py
    ├── cocoapi_evaluator.py
    ├── detection_coco_evaluator.py
    ├── detectron_parser.py
    ├── eval_detection_coco.py
    ├── makecocolist.py
    ├── proposal_target_creator.py
    ├── region_proposal_network.py
    ├── roi_align_2d.py
    ├── updater.py
    └── vis_bbox.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
__pycache*
result
*.png

================================================
FILE: LICENSE
================================================
Copyright (c) 2018 DeNA Co., Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal 
in the Software without restriction, including without limitation the rights 
to use, copy, modify, merge, publish, distribute, and/or sublicense 
copies of the Software, and to permit persons to whom the Software is 
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all 
copies or substantial portions of the Software; and

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#####################################################################################
# Chainer_Mask_R-CNN is designed based on chainercv's API.
# Chainer_Mask_R-CNN's source code and documents contain the original chainercv ones.
#####################################################################################
Copyright (c) 2017 Yusuke Niitani.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above
       copyright notice, this list of conditions and the following
       disclaimer in the documentation and/or other materials provided
       with the distribution.

    * Neither the name of the chainercv Developers nor the names of any
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#####################################################################################
# Chainer_Mask_R-CNN is designed based on chainer's API.
# Chainer_Mask_R-CNN's source code and documents contain the original chainer ones.
#####################################################################################
Copyright (c) 2015 Preferred Infrastructure, Inc.
Copyright (c) 2015 Preferred Networks, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above
       copyright notice, this list of conditions and the following
       disclaimer in the documentation and/or other materials provided
       with the distribution.

    * Neither the name of the chainer Developers nor the names of any
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
~                                                                      


================================================
FILE: README.md
================================================
# Chainer\_Mask\_R-CNN   
Chainer implementation of Mask R-CNN - the multi-task network for object detection, object classification, and instance segmentation.
(https://arxiv.org/abs/1703.06870)   
<a href="README_JP.md">日本語版 README</a>   

## What's New

- Training result for R-50-C4 model has been evaluated!
- COCO box AP = 0.346 using our trainer (0.355 with official boxes) 
- COCO mask AP = 0.287 using our trainer (0.314 with official boxes) 

## Examples
- to be updated

## Requirements
- [Chainer](https://github.com/pfnet/chainer)
- [Chainercv](https://github.com/chainer/chainercv)
- [Cupy](https://github.com/cupy/cupy)   
(operable if your environment can run chainer > v3 with cuda and cudnn.)   
(verified as operable: chainer==3.1.0, chainercv==0.7.0, cupy==1.0.3)
```
$ pip install chainer   
$ pip install chainercv
$ pip install cupy
```   
- Python 3.0+   
- NumPy   
- Matplotlib   
- OpenCV   

## TODOs
- [x] Precision Evaluator (bbox, COCO metric)
- [x] Detectron Model Parser 
- [x] Modify ROIAlign
- [x] Mask inference using refined ROIs
- [x] Precision Evaluator (mask, COCO metric)
- [ ] Improve segmentation AP for R-50-C4 model
- [ ] Feature Pyramid Network (R-50-FPN)
- [ ] Keypoint Detection (R-50-FPN, Keypoints)

## Benchmark Results

<table><tbody>
<tr><th align="left" bgcolor=#f8f8f8> </th>     <td bgcolor=white> Box AP 50:95</td><td bgcolor=white> Segm AP 50:95</td></tr>
<tr><th align="left" bgcolor=#f8f8f8>Ours (1 GPU)</th> <td bgcolor=white> 0.346 </td><td bgcolor=white> 0.287 </td></tr>
<tr><th align="left" bgcolor=#f8f8f8>Detectron model</th> <td bgcolor=white> 0.350 </td><td bgcolor=white> 0.295 </td></tr>
<tr><th align="left" bgcolor=#f8f8f8>Detectron caffe2</th> <td bgcolor=white> 0.355 </td><td bgcolor=white> 0.314 </td></tr>
</tbody></table>

## Inference with Pretrained Models

- Download the pretrained model from the [Model Zoo](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md)   
 (`model` link of `R-50-C4	Mask` at `End-to-End Faster & Mask R-CNN Baselines`)   
- Make `modelfiles` directory and put the downloaded file `model_final.pkl` in it   
- Execute:  
```   
python utils/detectron_parser.py
```
- And the converted model file is saved in `modelfiles`
- Run the demo:
```
python demo.py --bn2affine --modelfile modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz --image <input image>
```

## Prerequisites for training
- Download 'ResNet-50-model.caffemodel' from the "OneDrive download" of [ResNet pretrained models](https://github.com/KaimingHe/deep-residual-networks#models) 
for model initialization and place it in ~/.chainer/dataset/pfnet/chainer/models/

- COCO 2017 dataset :
the COCO dataset can be downloaded and unzipped by:
```
bash getcoco.sh
```   
Setup the COCO API:   
```
git clone https://github.com/waleedka/coco
cd coco/PythonAPI/
make
python setup.py install
cd ../../
```
Note: the official COCO API repository is not Python 3 compatible.    
Use the repository above in order to run our evaluation.    

## Train

```
python train.py 
```
arguments and the default conditions are defined as follows:
```
'--dataset', choices=('coco2017'), default='coco2017'   
'--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network'
'--gpu', '-g', type=int, default=0   
'--lr', '-l', type=float, default=1e-4   
'--batchsize', '-b', type=int, default=8   
'--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta'
'--bn2affine', action='store_true', default=False, help='batchnorm to affine'
'--out', '-o', default='result',  help='output directory'   
'--seed', '-s', type=int, default=0   
'--roialign', action='store_true', default=True, help='True: ROIAlign, False: ROIpooling'
'--step_size', '-ss', type=int, default=400000  
'--lr_step', '-ls', type=int, default=480000    
'--lr_initialchange', '-li', type=int, default=800     
'--pretrained', '-p', type=str, default='imagenet'   
'--snapshot', type=int, default=4000   
'--validation', type=int, default=30000   
'--resume', type=str   
'--iteration', '-i', type=int, default=800000   
'--roi_size', '-r', type=int, default=14, help='ROI size for mask head input'
'--gamma', type=float, default=1, help='mask loss balancing factor'   
```

Note that we use a subdivision-based updater to enable training with a large batch size.


## Demo
Segment the objects in the input image by executing:   
```
python demo.py --image <input image> --modelfile result/snapshot_model.npz --contour
```

## Evaluation

Evaluate the trained model with COCO metric (bounding box, segmentation) :   
```
python train.py --lr 0 --iteration 1 --validation 1 --resume <trained_model> 
```

## Citation
Please cite the original paper in your publications if it helps your research:    

    @article{DBLP:journals/corr/HeGDG17,
      author    = {Kaiming He and
                  Georgia Gkioxari and
                  Piotr Doll{\'{a}}r and
                  Ross B. Girshick},
      title     = {Mask {R-CNN}},
      journal   = {CoRR},
      volume    = {abs/1703.06870},
      year      = {2017},
      url       = {http://arxiv.org/abs/1703.06870},
      archivePrefix = {arXiv},
      eprint    = {1703.06870},
      timestamp = {Wed, 07 Jun 2017 14:42:32 +0200},
      biburl    = {http://dblp.org/rec/bib/journals/corr/HeGDG17},
      bibsource = {dblp computer science bibliography, http://dblp.org}
    }


================================================
FILE: README_JP.md
================================================
# Chainer\_Mask\_R-CNN   
マルチタスク検出器Mask R-CNNのchainer実装
(https://arxiv.org/abs/1703.06870)   

## 実行例
- 準備中

## 必要環境
- [Chainer](https://github.com/pfnet/chainer)
- [Chainercv](https://github.com/chainer/chainercv)
- [Cupy](https://github.com/cupy/cupy)   
 (動作確認済み: chainer==3.1.0, chainercv==0.7.0, cupy==1.0.3)
```
$ pip install chainer   
$ pip install chainercv
$ pip install cupy==1.0.3
```   
- Python 3.0+   
- NumPy   
- Matplotlib   
- OpenCV   

## TODOs
- [x] Precision Evaluator (bbox, COCO metric)
- [x] Detectron Model Parser 
- [x] Modify ROIAlign
- [x] Mask inference using refined ROIs
- [x] Precision Evaluator (mask, COCO metric)
- [ ] Feature Pyramid Network (R-50-FPN)
- [ ] Keypoint Detection (R-50-FPN, Keypoints)

## 学習済みモデルの使用

- [Model Zoo](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md) からモデルファイルをダウンロード
 ( `End-to-End Faster & Mask R-CNN Baselines` の `R-50-C4	Mask` 行の `model` リンク)   
- `modelfiles` ディレクトリを作り、ダウンロードした `model_final.pkl` を置く
- 以下を実行
```   
python utils/detectron_parser.py
```
- `modelfiles` の中に変換されたモデルファイルが保存されます。
- 以下によりデモを実行
```
python demo.py --bn2affine --modelfile modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz --image <input image>
```

## 学習のための準備
- 学習済みモデルのダウンロード  
・以下リンク先の'OneDrive download'から、ResNet-50-model.caffemodelをダウンロード
 [ResNet pretrained models](https://github.com/KaimingHe/deep-residual-networks#models)
・~/.chainer/dataset/pfnet/chainer/models/　に置く

- COCO 2017 データセット
COCOデータセットのダウンロードと解凍:   
```
bash getcoco.sh
```
- COCO APIのセットアップ:   
```
git clone https://github.com/waleedka/coco
cd coco/PythonAPI/
make
python setup.py install
cd ../../
```

## 学習

```
python train.py 
```
引数は以下です:
```
'--dataset', choices=('coco2017'), default='coco2017'   
'--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network'
'--gpu', '-g', type=int, default=0   
'--lr', '-l', type=float, default=1e-4   
'--batchsize', '-b', type=int, default=8   
'--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta'
'--bn2affine', action='store_true', default=False, help='batchnorm to affine'
'--out', '-o', default='result',  help='output directory'   
'--seed', '-s', type=int, default=0   
'--roialign', action='store_true', default=True, help='True: ROIAlign, False: ROIpooling'
'--step_size', '-ss', type=int, default=400000  
'--lr_step', '-ls', type=int, default=480000    
'--lr_initialchange', '-li', type=int, default=800     
'--pretrained', '-p', type=str, default='imagenet'   
'--snapshot', type=int, default=4000   
'--validation', type=int, default=30000   
'--resume', type=str   
'--iteration', '-i', type=int, default=800000   
'--roi_size', '-r', type=int, default=14, help='ROI size for mask head input'
'--gamma', type=float, default=1, help='mask loss balancing factor'   
```

本実装ではsubdivisionを用いたupdateを行なっているため、batch size = 1 相当のGPUメモリでbatch size=8等を指定可能です

## デモ
入力画像のインスタンス・セグメンテーションを実行します:   
```
python demo.py --image <input image> --modelfile result/snapshot_model.npz --contour  
```

### 評価

COCO metric (Bounding Box, Segmentation) によるモデルの評価を実行します。

```
python train.py --lr 0 --iteration 1 --validation 1 --resume <trained_model> 
```

## 引用
Please cite the original paper in your publications if it helps your research:    

    @article{DBLP:journals/corr/HeGDG17,
      author    = {Kaiming He and
                  Georgia Gkioxari and
                  Piotr Doll{\'{a}}r and
                  Ross B. Girshick},
      title     = {Mask {R-CNN}},
      journal   = {CoRR},
      volume    = {abs/1703.06870},
      year      = {2017},
      url       = {http://arxiv.org/abs/1703.06870},
      archivePrefix = {arXiv},
      eprint    = {1703.06870},
      timestamp = {Wed, 07 Jun 2017 14:42:32 +0200},
      biburl    = {http://dblp.org/rec/bib/journals/corr/HeGDG17},
      bibsource = {dblp computer science bibliography, http://dblp.org}
    }


================================================
FILE: coco_dataset.py
================================================
import numpy as np
from skimage.draw import polygon
import json
import os
import cv2
import pycocotools
from pycocotools.coco import COCO

import chainer
from chainercv.utils import read_image
class COCODataset(chainer.dataset.DatasetMixin):
    """COCO instance-segmentation dataset read through the pycocotools API.

    Args:
        data_dir: Root directory that contains ``annotations/`` and the
            image sub-directory ``name``.
        json_file: Annotation file name inside ``<data_dir>/annotations``.
        name: Image sub-directory name below ``data_dir``.
        id_list_file: Not used by this class; kept so existing callers
            that pass it keep working.
        sizemin: An annotation is kept only when both ``bbox[2]`` and
            ``bbox[3]`` (width and height in the COCO box layout) are
            strictly greater than this value.
    """

    def __init__(self, data_dir='COCO/', json_file='instances_train2017.json',
                 name='train2017', id_list_file='train2017.txt', sizemin=10):
        self.data_dir = data_dir
        self.json_file = json_file
        # Join path components instead of concatenating strings so that a
        # data_dir given without a trailing slash still resolves correctly.
        self.coco = COCO(
            os.path.join(self.data_dir, 'annotations', self.json_file))
        self.ids = self.coco.getImgIds()
        self.name = name
        self.sizemin = sizemin
        self.class_ids = sorted(self.coco.getCatIds())

    def __len__(self):
        """Return the number of image ids known to the annotation file."""
        return len(self.ids)

    def ann2rle(self, ann, height, width):
        """Convert one ``segmentation`` entry to a run-length encoding.

        * a list (polygons) is converted per polygon and merged into one RLE;
        * a dict whose ``counts`` is a list is converted with ``frPyObjects``;
        * anything else is returned unchanged.
        """
        if isinstance(ann, list):
            rles = pycocotools.mask.frPyObjects(ann, height, width)
            rle = pycocotools.mask.merge(rles)
        elif isinstance(ann['counts'], list):
            rle = pycocotools.mask.frPyObjects(ann, height, width)
        else:
            rle = ann
        return rle

    def get_example(self, i):
        """Load the ``i``-th image together with its filtered annotations.

        Annotations are kept when they are not ``iscrowd`` and pass the
        ``sizemin`` filter. While ``chainer.config.train`` is true, an image
        without any kept annotation is skipped by stepping the index
        backwards until one with annotations is found.

        Returns:
            tuple ``(img, labels, bboxes, masks, i)``. ``img`` comes from
            ``chainercv.utils.read_image``. With at least one annotation,
            ``labels`` is an int32 array of ``category_id`` values,
            ``bboxes`` a float32 array of the raw annotation ``bbox``
            entries and ``masks`` a uint8 array of decoded masks; otherwise
            all three are empty lists. ``i`` is the index actually used.
        """
        numofboxes = 0
        while True:
            id_ = self.ids[i]
            annot_labels, annot_bboxes, annot_segs = list(), list(), list()
            anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None)
            annotations = self.coco.loadAnns(anno_ids)
            for a in annotations:
                if a['bbox'][2] > self.sizemin and a['bbox'][3] > self.sizemin \
                        and a['iscrowd'] == 0:
                    annot_labels.append(a['category_id'])
                    annot_bboxes.append(a['bbox'])
                    annot_segs.append(a['segmentation'])
            numofboxes = len(annot_labels)
            if numofboxes > 0 or not chainer.config.train:
                break
            # Training needs at least one target: fall back to the
            # previous image (negative indices wrap to the end of the list).
            i = i - 1
        img_file = os.path.join(self.data_dir, self.name, '{:012}'.format(id_) + '.jpg')
        img = read_image(img_file, color=True)
        _, h, w = img.shape
        annot_masks = []
        for annot_seg_polygons in annot_segs:
            rle = self.ann2rle(annot_seg_polygons, h, w)
            annot_masks.append(pycocotools.mask.decode(rle))
        if numofboxes > 0:
            annot_masks = np.stack(annot_masks).astype(np.uint8)  # y, x
            annot_bboxes = np.stack(annot_bboxes).astype(np.float32)
            annot_labels = np.stack(annot_labels).astype(np.int32)
        else:
            annot_labels, annot_bboxes, annot_masks = [], [], []

        return img, annot_labels, annot_bboxes, annot_masks, i


================================================
FILE: demo.py
================================================
import argparse
import chainer
import numpy as np
from mask_rcnn_train_chain import MaskRCNNTrainChain
from utils.bn_utils import freeze_bn, bn_to_affine

def main():
    """Run Mask R-CNN on a single image, save the result and display it.

    Command-line options select the GPU, the model file, the input image,
    the backbone depth and visualisation switches. The rendered detection
    is written to ``output.png`` in the current directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--modelfile')
    parser.add_argument('--image', type=str)
    parser.add_argument('--roi_size', '-r', type=int, default=14, help='ROI size for mask head input')
    parser.add_argument('--roialign', action='store_false', default=True, help='default: True')
    parser.add_argument('--contour', action='store_true', default=False, help='visualize contour')
    parser.add_argument('--background', action='store_true', default=False, help='background(no-display mode)')
    parser.add_argument('--bn2affine', action='store_true', default=False, help='batchnorm to affine')
    parser.add_argument('--extractor', choices=('resnet50','resnet101'),
                        default='resnet50', help='extractor network')
    args = parser.parse_args()

    # network class id --> coco label id
    test_class_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, \
    27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, \
    57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]

    # The backend has to be selected before pyplot is imported, which is
    # why these imports live inside the function.
    if args.background:
        import matplotlib
        matplotlib.use('Agg')
    import matplotlib.pyplot as plot
    from utils.vis_bbox import vis_bbox
    from mask_rcnn_resnet import MaskRCNNResNet
    from chainercv import utils

    # argparse restricts --extractor to these two keys.
    n_layers = {'resnet50': 50, 'resnet101': 101}[args.extractor]
    model = MaskRCNNResNet(n_fg_class=80, roi_size=args.roi_size, pretrained_model=args.modelfile, n_layers=n_layers, roi_align=args.roialign, class_ids=test_class_ids)

    chainer.serializers.load_npz(args.modelfile, model)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
    if args.bn2affine:
        bn_to_affine(model)
    img = utils.read_image(args.image, color=True)
    bboxes, labels, scores, masks = model.predict([img])
    bbox, label, score, mask = bboxes[0], np.asarray(labels[0],dtype=np.int32), scores[0], masks[0]

    # Indexed by COCO label id, so the tuple also contains the ids that
    # never occur in test_class_ids.
    coco_label_names=('background',  # class zero
            'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
            'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
            'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee',
            'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
            'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
            'mirror', 'dining table', 'window', 'desk','toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
            'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'

    )
    vis_bbox(
        img, bbox, label=label, score=score, mask=mask, label_names=coco_label_names, contour=args.contour, labeldisplay=True)
    # Save before show(): on interactive backends the figure is released
    # once the window is closed, so saving afterwards writes a blank image.
    filename = "output.png"
    plot.savefig(filename)
    plot.show()

if __name__ == '__main__':
    main()


================================================
FILE: getcoco.sh
================================================
#!/usr/bin/env bash
# Download and unpack the COCO 2017 train/val images and annotations
# into ./COCO (relative to the current directory).
# Usage: bash getcoco.sh

# Stop at the first failing command so that a failed cd or download is
# never followed by unzip/rm running on missing or partial files.
set -e

# -p: do not fail when the directory is left over from an earlier run.
mkdir -p COCO
cd COCO

wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

unzip train2017.zip
unzip val2017.zip
unzip annotations_trainval2017.zip

# Archives are only removed after every unzip above has succeeded.
rm -f train2017.zip
rm -f val2017.zip
rm -f annotations_trainval2017.zip

================================================
FILE: mask_rcnn.py
================================================
from __future__ import division

import numpy as np

import chainer
from chainer import cuda
import chainer.functions as F
from chainercv.links.model.faster_rcnn.utils.loc2bbox import loc2bbox
from chainercv.utils import non_maximum_suppression
from chainercv.transforms.image.resize import resize
import cv2
import pycocotools
from utils.box_utils import bbox_yxyx2xywh, im_mask

class MaskRCNN(chainer.Chain):
    """Mask R-CNN base chain combining an extractor, an RPN and a head.

    Args:
        extractor: Link called as ``extractor(x)`` to produce the feature map.
        rpn: Link called as ``rpn(h, img_size, scale)``.
        head: Link providing ``res5head``, ``boxhead``, ``maskhead`` and
            the attribute ``n_class``.
        mean: Value subtracted from the resized image in :meth:`prepare`.
        min_size: Target length of the shorter image side in :meth:`prepare`.
        max_size: Upper bound for the longer image side in :meth:`prepare`.
        loc_normalize_mean: Per-coordinate mean used to de-normalise the
            predicted box offsets.
        loc_normalize_std: Per-coordinate std used to de-normalise the
            predicted box offsets.
        class_ids: Sequence indexed by the predicted foreground class; the
            looked-up values are what :meth:`predict` returns as labels.
            Required.

    Raises:
        ValueError: If ``class_ids`` is missing or empty.
    """

    def __init__(self, extractor, rpn, head, mean,
                 min_size=600, max_size=1000,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
                 class_ids=None
                 ):
        print("MaskRCNN initialization")
        super(MaskRCNN, self).__init__()
        with self.init_scope():
            self.extractor = extractor
            self.rpn = rpn
            self.head = head

        self.mean = mean
        self.min_size = min_size
        self.max_size = max_size
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std
        # Sets nms_thresh, score_thresh and preset.
        self.use_preset('visualize')
        if class_ids is None or len(class_ids) == 0:
            raise ValueError('set class ids')
        self.class_ids = class_ids

    @property
    def n_class(self):
        """Number of classes reported by the head."""
        return self.head.n_class

    def __call__(self, x, scale=1.):
        """Run extractor, RPN and box head on an image batch.

        Returns:
            tuple ``(roi_cls_locs, roi_scores, rois, roi_indices, h)`` where
            ``h`` is the extractor feature map (reused by the mask head).
        """
        img_size = x.shape[2:]
        h = self.extractor(x)  # backbone feature map
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.rpn(h, img_size, scale)  # Region Proposal Network
        hres5 = self.head.res5head(h, rois, roi_indices)
        roi_cls_locs, roi_scores = self.head.boxhead(hres5)
        return roi_cls_locs, roi_scores, rois, roi_indices, h

    def use_preset(self, preset):
        """Select NMS/score thresholds and the output format of predict.

        Args:
            preset: ``'visualize'`` or ``'evaluate'``.

        Raises:
            ValueError: For any other value.
        """
        if preset == 'visualize':
            self.nms_thresh = 0.3
            self.score_thresh = 0.7
            self.preset = 'visualize'
        elif preset == 'evaluate':
            self.nms_thresh = 0.5
            self.score_thresh = 0.05
            self.preset = 'evaluate'
        else:
            raise ValueError('preset must be visualize or evaluate')

    def prepare(self, img):
        """Resize a CHW image, subtract the mean and reverse the channels.

        The scale makes the shorter side ``min_size`` unless that would
        push the longer side beyond ``max_size``.
        """
        _, H, W = img.shape
        scale = self.min_size / min(H, W)
        if scale * max(H, W) > self.max_size:
            scale = self.max_size / max(H, W)
        # cv2.resize works on HWC arrays.
        img = img.transpose((1,2,0))
        img = cv2.resize(img, None, None, fx=scale, fy=scale,
                    interpolation=cv2.INTER_LINEAR)
        img = img.transpose((2,0,1))
        # NOTE(review): the mean is subtracted before the channel flip, so
        # self.mean has to be in the input's channel order - confirm.
        img = (img - self.mean).astype(np.float32, copy=False)
        img = img[::-1, :, :] # RGB to BGR order for resnet pretrained model
        return img

    def _suppress(self, raw_cls_bbox, raw_cls_roi, raw_prob):
        """Apply score thresholding and per-class NMS.

        Class 0 is skipped; the returned labels are ``class index - 1``.

        Returns:
            tuple ``(bbox, roi, label, score)`` of float32 arrays.
        """
        bbox = list()
        roi = list()
        label = list()
        score = list()
        # The reshapes do not depend on the class, so do them once.
        cls_bbox = raw_cls_bbox.reshape((-1, self.n_class, 4))
        cls_roi = raw_cls_roi.reshape((-1, self.n_class, 4))
        for l in range(1, self.n_class):
            prob_l = raw_prob[:, l]
            lmask = prob_l > self.score_thresh
            cls_bbox_l = cls_bbox[:, l, :][lmask]
            cls_roi_l = cls_roi[:, l, :][lmask]
            prob_l = prob_l[lmask]
            keep = non_maximum_suppression(cls_bbox_l, self.nms_thresh, prob_l)
            bbox.append(cls_bbox_l[keep])
            roi.append(cls_roi_l[keep])
            label.append((l - 1) * np.ones((len(keep),)))
            score.append(prob_l[keep])
        bbox = np.concatenate(bbox, axis=0).astype(np.float32)
        roi = np.concatenate(roi, axis=0).astype(np.float32)
        label = np.concatenate(label, axis=0).astype(np.float32)
        score = np.concatenate(score, axis=0).astype(np.float32)
        return bbox, roi, label, score

    def predict(self, imgs):
        """Detect objects and their masks in a list of CHW images.

        Each image is processed on its own. Boxes are refined and
        suppressed first; the mask head is then evaluated on the surviving
        boxes.

        Returns:
            tuple ``(bboxes, labels, scores, masks)`` of lists with one
            entry per input image. With the ``'evaluate'`` preset the boxes
            are converted by ``bbox_yxyx2xywh`` and the masks are
            pycocotools RLE dicts whose ``counts`` is an ASCII string; with
            ``'visualize'`` the boxes are left as produced by suppression
            and the masks are the outputs of ``im_mask``. Labels are
            ``class_ids`` entries.
        """
        prepared_imgs = list()
        sizes = list()
        for img in imgs:
            size = img.shape[1:]
            img = self.prepare(img.astype(np.float32))
            prepared_imgs.append(img)
            sizes.append(size)
        bboxes = list()
        labels = list()
        scores = list()
        masks = list()
        for img, size in zip(prepared_imgs, sizes):
            with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
                img_var = chainer.Variable(self.xp.asarray(img[None]))
                # Ratio of the resized width to the original width.
                scale = img_var.shape[3] / size[1]
                roi_cls_locs, roi_scores, rois, _,  h = self.__call__(img_var, scale=scale)
            # assuming batch size = 1
            roi_cls_loc = roi_cls_locs.data
            roi_score = roi_scores.data
            roi = rois / scale
            mean = self.xp.tile(self.xp.asarray(self.loc_normalize_mean), self.n_class)
            std = self.xp.tile(self.xp.asarray(self.loc_normalize_std), self.n_class)
            roi_cls_loc = (roi_cls_loc * std + mean).astype(np.float32)
            roi_cls_loc = roi_cls_loc.reshape((-1, self.n_class, 4))
            roi = self.xp.broadcast_to(roi[:, None], roi_cls_loc.shape).reshape((-1, 4))
            cls_bbox = loc2bbox(roi, roi_cls_loc.reshape((-1, 4)))
            cls_bbox = cls_bbox.reshape((-1, self.n_class * 4))
            cls_roi = roi.reshape((-1, self.n_class * 4))
            # clip the boxes to the original image size
            cls_bbox[:, 0::2] = self.xp.clip(cls_bbox[:, 0::2], 0, size[0])
            cls_bbox[:, 1::2] = self.xp.clip(cls_bbox[:, 1::2], 0, size[1])
            cls_roi[:, 0::2] = self.xp.clip(cls_roi[:, 0::2], 0, size[0])
            cls_roi[:, 1::2] = self.xp.clip(cls_roi[:, 1::2], 0, size[1])

            prob = F.softmax(roi_score).data
            raw_cls_bbox = cuda.to_cpu(cls_bbox)
            raw_cls_roi = cuda.to_cpu(cls_roi)
            raw_prob = cuda.to_cpu(prob)
            bbox, out_roi, label, score = self._suppress(raw_cls_bbox, raw_cls_roi, raw_prob)
            mask = []
            if len(bbox) > 0:
                # mask head
                roi_indices = self.xp.zeros((len(bbox),), dtype=np.int32)
                with chainer.using_config('train', False), \
                    chainer.function.no_backprop_mode():
                    # self.xp keeps the boxes on the same device as the
                    # model, so this also works when the model is on CPU.
                    hres5 = self.head.res5head(h, self.xp.asarray(bbox * scale), roi_indices)
                    roi_masks = self.head.maskhead(hres5)
                roi_mask = F.sigmoid(roi_masks).data
                raw_mask = cuda.to_cpu(roi_mask)
                # postprocess: the mask channel of a detection is label + 1
                # because _suppress returns labels without class 0.
                if self.preset == 'evaluate':
                    bboxes.append(bbox_yxyx2xywh(bbox))
                    for m, b, l in zip(raw_mask, bbox, label):
                        wm = im_mask(m[int(l+1)], size, b)
                        # encode the mask
                        # NOTE(review): relies on the pycocotools.mask
                        # submodule having been imported elsewhere.
                        wm = pycocotools.mask.encode(np.asfortranarray(wm))
                        wm['counts'] = wm['counts'].decode('ascii')
                        mask.append(wm)
                elif self.preset == 'visualize':
                    bboxes.append(bbox)
                    for m, b, l in zip(raw_mask, bbox, label):
                        wm = im_mask(m[int(l+1)], size, b)
                        mask.append(wm)
            elif self.preset == 'evaluate':
                # no detection: emit one all-zero mask
                wm = np.zeros((size[0], size[1]), dtype=np.uint8)
                wm = pycocotools.mask.encode(np.asfortranarray(wm))
                wm['counts'] = wm['counts'].decode('ascii')
                mask.append(wm)
                bboxes.append(bbox_yxyx2xywh(bbox))
            elif self.preset == 'visualize':
                # no detection: still append the empty box array so that
                # all four returned lists keep one entry per image.
                bboxes.append(bbox)
            labels.append([self.class_ids[int(l)] for l in label.tolist()])
            scores.append(score)
            masks.append(mask)

        return bboxes, labels, scores, masks


================================================
FILE: mask_rcnn_resnet.py
================================================
import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from mask_rcnn import MaskRCNN
#from chainercv.links.model.faster_rcnn.region_proposal_network import \
#    RegionProposalNetwork
from utils.region_proposal_network import RegionProposalNetwork
from utils import roi_align_2d
from chainer.links.model.vision.resnet import BuildingBlock, _retrieve
from chainer.links.connection.convolution_2d import Convolution2D
from chainer.links.connection.linear import Linear
from chainer.links.normalization.batch_normalization import BatchNormalization
from chainer.initializers import constant

class ExtractorResNet(chainer.link.Chain):
    """ResNet backbone (conv1 .. res4) used as the Mask R-CNN feature extractor.

    ``res5`` and ``fc6`` are also constructed here before the weights are
    loaded; ``fc6`` is deleted again right after loading and ``__call__``
    stops after ``res4``.

    Args:
        pretrained_model: ``'auto'`` selects the Caffe model file name that
            matches ``n_layers``. A value ending in ``.caffemodel`` is handed
            to chainer's ``_retrieve`` helper, any other non-empty value is
            loaded as an ``.npz`` file, and a falsy value skips loading.
        n_layers (int): ResNet depth; 50 and 101 are supported.
        roi_size (int): RoI feature size; ``roi_size // 7`` is used as the
            stride argument of ``res5``.

    Raises:
        ValueError: if ``n_layers`` is neither 50 nor 101.
    """
    def __init__(self, pretrained_model='auto', n_layers=50, roi_size=14):
        super(ExtractorResNet, self).__init__()
        print('Extractor ResNet',n_layers,' initialization')
        kwargs = {'initialW': constant.Zero()}
        # Number of units in res2..res5 per supported depth.  This is
        # resolved independently of ``pretrained_model`` so the layers can be
        # built even when no (or a custom) weight file is requested.
        blocks_per_depth = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3]}
        if n_layers not in blocks_per_depth:
            raise ValueError(
                'n_layers must be 50 or 101, got {!r}'.format(n_layers))
        block = blocks_per_depth[n_layers]
        if pretrained_model == 'auto':
            pretrained_model = 'ResNet-{}-model.caffemodel'.format(n_layers)
        with self.init_scope():
            self.conv1 = Convolution2D(3, 64, 7, 2, 3, nobias=True, **kwargs)
            self.bn1 = BatchNormalization(64)
            self.res2 = BuildingBlock(block[0], 64, 64, 256, 1, **kwargs)
            self.res3 = BuildingBlock(block[1], 256, 128, 512, 2, **kwargs)
            self.res4 = BuildingBlock(block[2], 512, 256, 1024, 2, **kwargs)
            self.res5 = BuildingBlock(block[3], 1024, 512, 2048, roi_size//7, **kwargs)
            self.fc6 = Linear(2048, 1000)
        if pretrained_model and pretrained_model.endswith('.caffemodel'):
            _retrieve(n_layers, 'ResNet-{}-model.npz'.format(n_layers),
                      pretrained_model, self)
        elif pretrained_model:
            # ``chainer`` is imported at file level; the bare ``npz`` name
            # used previously was never imported.
            chainer.serializers.load_npz(pretrained_model, self)
        # fc6 is not used by the extractor after the weights are loaded.
        del self.fc6

    def __call__(self, x):
        """Return the ``res4`` feature map computed from image batch ``x``."""
        h = F.relu(self.bn1(self.conv1(x)))
        _, _, H, W = h.shape
        # Expected pooled size: ceil(H / 2) x ceil(W / 2).
        Hpool = (H + 1)//2
        Wpool = (W + 1)//2
        h = F.max_pooling_2d(h, ksize=3, stride=2, pad=1)
        # Crop any extra row/column produced by the pooling.
        h = h[:, :, :Hpool, :Wpool]
        h = self.res2(h)
        h = self.res3(h)
        h = self.res4(h)
        return h

class MaskRCNNResNet(MaskRCNN):
    """Mask R-CNN assembled from a ResNet extractor, an RPN and a res5 head.

    Args:
        n_fg_class (int): number of foreground classes (required).
        pretrained_model: ``None`` builds the extractor without loading
            weights; ANY other value is replaced by ``'auto'`` before it is
            passed to ``ExtractorResNet``.
        min_size, max_size: forwarded to the ``MaskRCNN`` base class.
        ratios (list): anchor aspect ratios; defaults to ``[0.5, 1, 2]``.
        anchor_scales (list): anchor scales; defaults to
            ``[2, 4, 8, 16, 32]``.
        initialW, rpn_initialW, loc_initialW, score_initialW: weight
            initializers; each defaults to a ``Normal`` initializer when
            ``None``.
        proposal_creator_params (dict): forwarded to the RPN; defaults to
            ``{"n_test_pre_nms": 6000, "n_test_post_nms": 1000,
            "min_size": 4}``.
        roi_size (int): RoI feature size used by the extractor and the head.
        class_ids (list): forwarded to the ``MaskRCNN`` base class; defaults
            to an empty list.
        n_layers (int): ResNet depth.
        roi_align (bool): forwarded to ``MaskRCNNHead``.

    Raises:
        ValueError: if ``n_fg_class`` is ``None``.
    """
    feat_stride = 16

    def __init__(self,
                 n_fg_class=None,
                 pretrained_model=None,
                 min_size=800, max_size=1333,
                 ratios=None, anchor_scales=None,
                 initialW=None, rpn_initialW=None,
                 loc_initialW=None, score_initialW=None,
                 proposal_creator_params=None,
                 roi_size=14,
                 class_ids=None,
                 n_layers=50,
                 roi_align=True
                 ):
        print("MaskRNNResNet initialization")
        if n_fg_class is None:
            raise ValueError('supply n_fg_class!')
        # Build the container defaults per call: literal list/dict defaults
        # in the signature would be shared by every instance.
        if ratios is None:
            ratios = [0.5, 1, 2]
        if anchor_scales is None:
            anchor_scales = [2, 4, 8, 16, 32]
        if proposal_creator_params is None:
            proposal_creator_params = {"n_test_pre_nms": 6000,
                                       "n_test_post_nms": 1000,
                                       "min_size": 4}
        if class_ids is None:
            class_ids = []
        if loc_initialW is None:
            loc_initialW = chainer.initializers.Normal(0.001)
        if score_initialW is None:
            score_initialW = chainer.initializers.Normal(0.01)
        if rpn_initialW is None:
            rpn_initialW = chainer.initializers.Normal(0.01)
        if initialW is None:# and pretrained_model:
            print("setting initialW")
            initialW = chainer.initializers.Normal(0.01)
        self.roi_size=roi_size
        # Any explicit value (e.g. 'imagenet') selects the bundled ResNet
        # weights; the value itself is not used as a path.
        if pretrained_model is not None:
            pretrained_model = 'auto'
        extractor = ExtractorResNet(pretrained_model, n_layers=n_layers, roi_size=roi_size)
        rpn = RegionProposalNetwork(
            1024, 1024,
            ratios=ratios, anchor_scales=anchor_scales,
            feat_stride=self.feat_stride,
            initialW=rpn_initialW,
            proposal_creator_params=proposal_creator_params,
        )
        # The head takes over the extractor's res5 block ...
        head = MaskRCNNHead(
            n_fg_class + 1,
            roi_size=self.roi_size, spatial_scale=1. / self.feat_stride,
            initialW=initialW, loc_initialW=loc_initialW, score_initialW=score_initialW,
            roi_align=roi_align, reslayer=extractor.res5
        )
        # ... so it is removed from the extractor afterwards.
        del extractor.res5
        super(MaskRCNNResNet, self).__init__(
            extractor, rpn, head,
            mean=np.array([122.7717, 115.9465, 102.9801], dtype=np.float32)[:, None, None],
            min_size=min_size, max_size=max_size, class_ids=class_ids
        )

class MaskRCNNHead(chainer.Chain):
    """RoI head: RoI pooling/align -> res5 -> box branch and mask branch.

    Args:
        n_class (int): number of output classes (the caller in this file
            passes ``n_fg_class + 1``).
        roi_size (int): output height/width of the RoI pooling step.
        spatial_scale (float): scale passed to the RoI pooling function.
        initialW: initializer for ``cls_loc`` and the mask branch layers.
        loc_initialW: accepted but not used by this class.
            NOTE(review): ``cls_loc`` is initialised with ``initialW`` --
            confirm whether ``loc_initialW`` was intended.
        score_initialW: initializer for the ``score`` layer.
        roi_align (bool): use RoI align when true, RoI pooling otherwise.
        reslayer: link registered as ``res5``.
    """
    def __init__(self, n_class, roi_size, spatial_scale,
                 initialW=None, loc_initialW=None, score_initialW=None, roi_align=True, reslayer=None):
        super(MaskRCNNHead, self).__init__()
        with self.init_scope():
            # shared trunk applied to every pooled RoI
            self.res5 = reslayer
            # box branch: per-class box regression and class scores
            self.cls_loc = L.Linear(2048, n_class * 4, initialW=initialW)
            self.score = L.Linear(2048, n_class, initialW=score_initialW)
            # mask branch: 2x deconvolution followed by a 1x1 convolution
            self.deconvm1 = L.Deconvolution2D(2048, 256, 2, 2, initialW=initialW)
            self.convm2 = L.Convolution2D(256, n_class, 1, 1, pad=0,initialW=initialW)

        self.n_class, self.roi_size = n_class, roi_size
        self.spatial_scale, self.roi_align = spatial_scale, roi_align
        print("ROI Align=",roi_align)

    def res5head(self, x, rois, roi_indices):
        """Pool each RoI from feature map ``x`` and run it through ``res5``."""
        # Prepend the (float) batch index of each RoI as the first column.
        index_column = roi_indices.astype(np.float32)[:, None]
        indices_and_rois = self.xp.concatenate((index_column, rois), axis=1)
        pool_fn = _roi_align_2d_yx if self.roi_align else _roi_pooling_2d_yx
        pooled = pool_fn(
            x, indices_and_rois, self.roi_size, self.roi_size,
            self.spatial_scale)
        return self.res5(pooled)

    def maskhead(self, hres5):
        """Return the per-class mask output for the res5 features."""
        upsampled = F.relu(self.deconvm1(hres5))
        return self.convm2(upsampled)

    def boxhead(self, hres5):
        """Return ``(roi_cls_locs, roi_scores)`` for the res5 features."""
        pooled = F.average_pooling_2d(hres5, self.roi_size//2, stride=7)
        roi_cls_locs = self.cls_loc(pooled)
        roi_scores = self.score(pooled)
        return roi_cls_locs, roi_scores

def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
    """Apply ``F.roi_pooling_2d`` after swapping the RoI coordinate columns.

    Columns 1<->2 and 3<->4 of ``indices_and_rois`` are exchanged (column 0
    is kept first) -- presumably converting (y, x) ordered boxes into the
    (x, y) order the pooling function expects.
    """
    column_order = [0, 2, 1, 4, 3]
    return F.roi_pooling_2d(
        x, indices_and_rois[:, column_order], outh, outw, spatial_scale)

def _roi_align_2d_yx(x, indices_and_rois, outh, outw, spatial_scale):
    """Apply ``roi_align_2d.roi_align_2d`` after swapping the RoI columns.

    Columns 1<->2 and 3<->4 of ``indices_and_rois`` are exchanged (column 0
    is kept first) -- presumably converting (y, x) ordered boxes into the
    (x, y) order the align function expects.
    """
    column_order = [0, 2, 1, 4, 3]
    return roi_align_2d.roi_align_2d(
        x, indices_and_rois[:, column_order], outh, outw, spatial_scale)


================================================
FILE: mask_rcnn_train_chain.py
================================================
import numpy as np

import chainer
from chainer import cuda
import chainer.functions as F

from chainercv.links.model.faster_rcnn.utils.anchor_target_creator import AnchorTargetCreator
from utils.proposal_target_creator import ProposalTargetCreator
from chainer import computational_graph as c
from chainercv.links import PixelwiseSoftmaxClassifier

class MaskRCNNTrainChain(chainer.Chain):
    """Training wrapper that computes the Mask R-CNN losses for one image.

    Args:
        mask_rcnn: model providing ``extractor``, ``rpn``, ``head``,
            ``loc_normalize_mean`` and ``loc_normalize_std``.
        rpn_sigma (float): sigma of the smooth L1 loss for the RPN boxes.
        roi_sigma (float): sigma of the smooth L1 loss for the head boxes.
        gamma (float): weight of the mask loss in the total loss.
        anchor_target_creator: callable producing the RPN targets.
        roi_size (int): RoI size; ``roi_size // 2`` is passed to
            ``ProposalTargetCreator``.
    """
    def __init__(self, mask_rcnn, rpn_sigma=3., roi_sigma=1., gamma=1,
                 anchor_target_creator=AnchorTargetCreator(),
                 roi_size=14):
        super(MaskRCNNTrainChain, self).__init__()
        with self.init_scope():
            self.mask_rcnn = mask_rcnn
        self.rpn_sigma = rpn_sigma
        self.roi_sigma = roi_sigma
        self.anchor_target_creator = anchor_target_creator
        self.proposal_target_creator = ProposalTargetCreator(roi_size=roi_size//2)
        self.loc_normalize_mean = mask_rcnn.loc_normalize_mean
        self.loc_normalize_std = mask_rcnn.loc_normalize_std
        # Decay rate of the exponential moving average kept in ``avg_loss``.
        self.decayrate=0.99
        self.avg_loss = None
        self.gamma=gamma

    def __call__(self, imgs, bboxes, labels, scale, masks, i):
        """Compute and report the total training loss.

        Args:
            imgs: image batch of shape ``(1, C, H, W)``.
            bboxes, labels, masks: ground truth, batched along axis 0.
            scale: single-element array holding the image scale factor.
            i: accepted for interface compatibility; not used here.

        Returns:
            The total loss (sum of RPN, head and weighted mask losses).

        Raises:
            ValueError: if the batch size is not 1.
        """
        if isinstance(bboxes, chainer.Variable):
            bboxes = bboxes.data
        if isinstance(labels, chainer.Variable):
            labels = labels.data
        if isinstance(scale, chainer.Variable):
            scale = scale.data
        if isinstance(masks, chainer.Variable):
            masks = masks.data
        # ``np.asscalar`` is deprecated and absent from recent NumPy
        # releases; ``.item()`` is the equivalent conversion.
        scale = cuda.to_cpu(scale).item()
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('only batch size 1 is supported')
        _, _, H, W = imgs.shape
        img_size = (H, W)
        #Extractor : img -> features (run with the 'train' config disabled)
        with chainer.using_config('train', False):
            features = self.mask_rcnn.extractor(imgs)

        #Region Proposal Network : features -> rpn_locs, rpn_scores, rois
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.mask_rcnn.rpn(
            features, img_size, scale)
        bbox, label, mask, rpn_score, rpn_loc, roi = \
            bboxes[0], labels[0], masks[0], rpn_scores[0], rpn_locs[0], rois # batch size=1

        #proposal target : roi(proposed) , bbox(GT), label(GT) -> sample_roi, gt_roi_loc, gt_roi_label
        #the targets are compared with the head output.
        sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask = self.proposal_target_creator(
            roi, bbox, label, mask, self.loc_normalize_mean, self.loc_normalize_std)
        # Every sampled RoI belongs to image 0 of the batch.
        sample_roi_index = self.xp.zeros((len(sample_roi),), dtype=np.int32)

        #Head Network : features, sample_roi -> roi_cls_loc, roi_score
        with chainer.using_config('train', False):
            hres5 = self.mask_rcnn.head.res5head(features, sample_roi, sample_roi_index)
            roi_cls_loc, roi_score = self.mask_rcnn.head.boxhead(hres5)
            roi_cls_mask = self.mask_rcnn.head.maskhead(hres5)
            del(hres5)

        #RPN losses
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size)
        rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma)
        rpn_cls_loss = F.sigmoid_cross_entropy(rpn_score, gt_rpn_label)

        #Head output losses
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.reshape((n_sample, -1, 4))
        # Select the box / mask predicted for each RoI's target class.
        roi_loc = roi_cls_loc[self.xp.arange(n_sample), gt_roi_label]
        roi_mask = roi_cls_mask[self.xp.arange(n_sample), gt_roi_label]
        roi_loc_loss = _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label, self.roi_sigma)
        roi_cls_loss = F.softmax_cross_entropy(roi_score, gt_roi_label)

        #mask loss:  average binary cross-entropy loss
        # Only the first ``len(gt_roi_mask)`` RoIs have a mask target.
        mask_loss = F.sigmoid_cross_entropy(roi_mask[0:gt_roi_mask.shape[0]], gt_roi_mask)

        #total loss
        loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss + self.gamma * mask_loss

        #avg loss calculation (exponential moving average of the total loss)
        if self.avg_loss is None:
            self.avg_loss = loss.data
        else:
            self.avg_loss = self.avg_loss * self.decayrate + loss.data*(1-self.decayrate)
        chainer.reporter.report({'rpn_loc_loss':rpn_loc_loss,
                                 'rpn_cls_loss':rpn_cls_loss,
                                 'roi_loc_loss':roi_loc_loss,
                                 'roi_cls_loss':roi_cls_loss,
                                 'roi_mask_loss':self.gamma * mask_loss,
                                 'avg_loss':self.avg_loss,
                                 'loss':loss}, self)
        return loss


def _smooth_l1_loss(x, t, in_weight, sigma):
    """Return the summed smooth L1 loss of the weighted difference ``x - t``.

    Elements whose absolute weighted difference is below ``1 / sigma**2`` use
    the quadratic term ``sigma**2 / 2 * d**2``; the rest use the linear term
    ``|d| - 0.5 / sigma**2``.
    """
    sigma2 = sigma ** 2
    weighted_diff = in_weight * (x - t)
    magnitude = F.absolute(weighted_diff)
    # 1.0 where the quadratic branch applies, 0.0 elsewhere.
    is_small = (magnitude.data < (1. / sigma2)).astype(np.float32)
    quadratic = is_small * (sigma2 / 2.) * F.square(weighted_diff)
    linear = (1 - is_small) * (magnitude - 0.5 / sigma2)
    return F.sum(quadratic + linear)

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Return the smooth L1 localization loss, normalised by label count.

    Only rows whose label is greater than 0 contribute to the loss; the sum
    is divided by the number of labels that are greater than or equal to 0.
    """
    xp = chainer.cuda.get_array_module(pred_loc)
    # Weight 1 for rows with a positive label, 0 for every other row.
    in_weight = xp.zeros_like(gt_loc)
    positive = gt_label > 0
    in_weight[positive] = 1
    total = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma)
    return total / xp.sum(gt_label >= 0)


================================================
FILE: mask_rcnn_train_chain_batch.py
================================================
import numpy as np

import chainer
from chainer import cuda
import chainer.functions as F

from chainercv.links.model.faster_rcnn.utils.anchor_target_creator import AnchorTargetCreator
from utils.proposal_target_creator import ProposalTargetCreator
from chainer import computational_graph as c
from chainercv.links import PixelwiseSoftmaxClassifier

class MaskRCNNTrainChain(chainer.Chain):
    """Training wrapper that averages the Mask R-CNN losses over a batch.

    Args:
        mask_rcnn: model providing ``extractor``, ``rpn``, ``head``,
            ``loc_normalize_mean`` and ``loc_normalize_std``.
        rpn_sigma (float): sigma of the smooth L1 loss for the RPN boxes.
        roi_sigma (float): sigma of the smooth L1 loss for the head boxes.
        gamma (float): weight of the mask loss in the total loss.
        anchor_target_creator: callable producing the RPN targets.
        roi_size (int): passed to ``ProposalTargetCreator``.
    """
    def __init__(self, mask_rcnn, rpn_sigma=3., roi_sigma=1., gamma=1,
                 anchor_target_creator=AnchorTargetCreator(),
                 roi_size=7):
        super(MaskRCNNTrainChain, self).__init__()
        with self.init_scope():
            self.mask_rcnn = mask_rcnn
        self.rpn_sigma = rpn_sigma
        self.roi_sigma = roi_sigma
        self.anchor_target_creator = anchor_target_creator
        self.proposal_target_creator = ProposalTargetCreator(roi_size=roi_size)
        self.loc_normalize_mean = mask_rcnn.loc_normalize_mean
        self.loc_normalize_std = mask_rcnn.loc_normalize_std
        # Decay rate of the exponential moving average kept in ``avg_loss``.
        self.decayrate=0.99
        self.avg_loss = None
        self.gamma=gamma

    def __call__(self, imgs, bboxes, labels, scale, masks):
        """Compute and report the batch-averaged training loss.

        Args:
            imgs: image batch of shape ``(N, C, H, W)``.
            bboxes, labels, masks: ground truth batched along axis 0; label
                entries below 0 are treated as padding and dropped.
            scale: per-image scale factors; only ``scale[0]`` is used.

        Returns:
            The total loss divided by the batch size.
        """
        if isinstance(bboxes, chainer.Variable):
            bboxes = bboxes.data
        if isinstance(labels, chainer.Variable):
            labels = labels.data
        if isinstance(scale, chainer.Variable):
            scale = scale.data
        if isinstance(masks, chainer.Variable):
            masks = masks.data
        # ``np.asscalar`` is deprecated and absent from recent NumPy
        # releases; ``.item()`` is the equivalent conversion.
        scale = cuda.to_cpu(scale[0]).item()
        n = bboxes.shape[0]
        _, _, H, W = imgs.shape
        img_size = (H, W)
        #Extractor : img -> features
        features = self.mask_rcnn.extractor(imgs)

        #Region Proposal Network : features -> rpn_locs, rpn_scores, rois
        rpn_loc_loss,rpn_cls_loss, roi_loc_loss, roi_cls_loss, mask_loss= 0,0,0,0,0
        for i in range(n):
            rpn_locs, rpn_scores, rois, roi_indices, anchor = self.mask_rcnn.rpn(
                features[i:i+1], img_size, scale)
            bbox, label, mask, rpn_score, rpn_loc, roi = \
                bboxes[i], labels[i], masks[i], rpn_scores[0], rpn_locs[0], rois
            # Mask values above 1 are reset to 0 (in place, on the input).
            mask[mask>1]=0
            # Count the non-padding annotations with an array reduction
            # rather than Python's element-by-element builtin ``sum``.
            numdata = int((label >= 0).sum())
            label = label[0:numdata]
            bbox = bbox[0:numdata]
            mask = mask[0:numdata]
            #proposal target : roi(proposed) , bbox(GT), label(GT) -> sample_roi, gt_roi_loc, gt_roi_label
            #the targets are compared with the head output.
            sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask = self.proposal_target_creator(
                roi, bbox, label, mask, self.loc_normalize_mean, self.loc_normalize_std)
            # Features are sliced to a single image, so every RoI has index 0.
            sample_roi_index = self.xp.zeros((len(sample_roi),), dtype=np.int32)

            #Head Network : features, sample_roi -> roi_cls_loc, roi_score
            # NOTE(review): this requires the head to be callable and to
            # return (cls_loc, score, mask) -- confirm against the head in use.
            roi_cls_loc, roi_score, roi_cls_mask = self.mask_rcnn.head(
                features[i:i+1], sample_roi, sample_roi_index)

            #RPN losses
            gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size)
            rpn_loc_loss += _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma)
            rpn_cls_loss += F.softmax_cross_entropy(rpn_score, gt_rpn_label)

            #Head output losses
            n_sample = roi_cls_loc.shape[0]
            roi_cls_loc = roi_cls_loc.reshape((n_sample, -1, 4))
            # Select the box / mask predicted for each RoI's target class.
            roi_loc = roi_cls_loc[self.xp.arange(n_sample), gt_roi_label]
            roi_mask = roi_cls_mask[self.xp.arange(n_sample), gt_roi_label]
            roi_loc_loss += _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label, self.roi_sigma)
            roi_cls_loss += F.softmax_cross_entropy(roi_score, gt_roi_label)

            #mask loss:  average binary cross-entropy loss
            mask_loss += F.sigmoid_cross_entropy(roi_mask[0:gt_roi_mask.shape[0]], gt_roi_mask)

        #total loss
        loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss + self.gamma * mask_loss
        loss /= n

        #avg loss calculation (exponential moving average of the total loss)
        if self.avg_loss is None:
            self.avg_loss = loss.data
        else:
            self.avg_loss = self.avg_loss * self.decayrate + loss.data*(1-self.decayrate)
        chainer.reporter.report({'rpn_loc_loss':rpn_loc_loss/n,
                                 'rpn_cls_loss':rpn_cls_loss/n,
                                 'roi_loc_loss':roi_loc_loss/n,
                                 'roi_cls_loss':roi_cls_loss/n,
                                 'roi_mask_loss':self.gamma * mask_loss/n,
                                 'avg_loss':self.avg_loss,
                                 'loss':loss}, self)
        return loss


def _smooth_l1_loss(x, t, in_weight, sigma):
    """Return the summed smooth L1 loss of the weighted difference ``x - t``.

    Elements whose absolute weighted difference is below ``1 / sigma**2`` use
    the quadratic term ``sigma**2 / 2 * d**2``; the rest use the linear term
    ``|d| - 0.5 / sigma**2``.
    """
    sigma2 = sigma ** 2
    weighted_diff = in_weight * (x - t)
    magnitude = F.absolute(weighted_diff)
    # 1.0 where the quadratic branch applies, 0.0 elsewhere.
    is_small = (magnitude.data < (1. / sigma2)).astype(np.float32)
    quadratic = is_small * (sigma2 / 2.) * F.square(weighted_diff)
    linear = (1 - is_small) * (magnitude - 0.5 / sigma2)
    return F.sum(quadratic + linear)

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Return the smooth L1 localization loss, normalised by label count.

    Only rows whose label is greater than 0 contribute to the loss; the sum
    is divided by the number of labels that are greater than or equal to 0.
    """
    xp = chainer.cuda.get_array_module(pred_loc)
    # Weight 1 for rows with a positive label, 0 for every other row.
    in_weight = xp.zeros_like(gt_loc)
    positive = gt_label > 0
    in_weight[positive] = 1
    total = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma)
    return total / xp.sum(gt_label >= 0)


================================================
FILE: train.py
================================================
import chainer
from chainer import training
from chainer.training import extensions, ParallelUpdater
from chainer.training.triggers import ManualScheduleTrigger
from chainer.datasets import TransformDataset
from chainercv.datasets import VOCBboxDataset, voc_bbox_label_names
from chainercv import transforms
from chainercv.transforms.image.resize import resize

import argparse
import numpy as np
import time
#from mask_rcnn_vgg import MaskRCNNVGG16
from mask_rcnn_resnet import MaskRCNNResNet
from coco_dataset import COCODataset
from mask_rcnn_train_chain import MaskRCNNTrainChain
from utils.bn_utils import freeze_bn, bn_to_affine
from utils.cocoapi_evaluator import COCOAPIEvaluator
from utils.detection_coco_evaluator import DetectionCOCOEvaluator
import logging
import traceback
from utils.updater import SubDivisionUpdater
import cv2

def resize_bbox(bbox, in_size, out_size):
    """Convert ``(x, y, w, h)`` boxes to scaled ``(y0, x0, y1, x1)`` boxes.

    Args:
        bbox: array of shape ``(R, 4)`` with columns ``x, y, w, h``.
        in_size: ``(height, width)`` the boxes currently refer to.
        out_size: ``(height, width)`` of the target image.

    Returns:
        A new array with the dtype of ``bbox`` and columns
        ``y_min, x_min, y_max, x_max``; the input is not modified.
    """
    sy = float(out_size[0]) / in_size[0]
    sx = float(out_size[1]) / in_size[1]
    # Column views of the untouched input.
    x, y, w, h = bbox[:, 0], bbox[:, 1], bbox[:, 2], bbox[:, 3]
    resized = bbox.copy()
    resized[:, 0] = sy * y
    resized[:, 1] = sx * x
    resized[:, 2] = sy * (y + h)
    resized[:, 3] = sx * (x + w)
    return resized

def parse():
    """Parse the command-line options of the Mask R-CNN trainer.

    Returns:
        argparse.Namespace: the parsed options.

    Note:
        ``--roialign`` is a ``store_false`` flag: the value defaults to
        ``True`` and passing the flag sets it to ``False``.
    """
    parser = argparse.ArgumentParser(
        description='Mask RCNN trainer')
    # ``choices`` must be a real tuple: the bare string ('coco2017') made
    # argparse match substrings or single characters instead of the name.
    parser.add_argument('--dataset', choices=('coco2017',),
                        default='coco2017')
    parser.add_argument('--extractor', choices=('resnet50','resnet101'),
                        default='resnet50', help='extractor network')
    parser.add_argument('--gpu', '-g', type=int, default=0)
    parser.add_argument('--lr', '-l', type=float, default=1e-4)
    parser.add_argument('--batchsize', '-b', type=int, default=8)
    parser.add_argument('--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta')
    parser.add_argument('--bn2affine', action='store_true', default=False, help='batchnorm to affine')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--roialign', action='store_false', default=True, help='default: True')
    parser.add_argument('--lr_step', '-ls', type=int, default=120000)
    parser.add_argument('--lr_initialchange', '-li', type=int, default=400)
    parser.add_argument('--pretrained', '-p', type=str, default='imagenet')
    parser.add_argument('--snapshot', type=int, default=4000)
    parser.add_argument('--validation', type=int, default=30000)
    parser.add_argument('--resume', type=str)
    parser.add_argument('--iteration', '-i', type=int, default=180000)
    parser.add_argument('--roi_size', '-r', type=int, default=14, help='ROI size for mask head input')
    parser.add_argument('--gamma', type=float, default=1, help='mask loss weight')
    return parser.parse_args()

class Transform(object):
    """Per-sample preprocessing applied through ``TransformDataset``.

    Args:
        net: model whose ``prepare`` method preprocesses the image.
        labelids: list of dataset class ids; each label is replaced by its
            position in this list.
    """
    def __init__(self, net, labelids):
        self.net = net
        self.labelids = labelids

    def __call__(self, in_data):
        """Transform one dataset sample.

        Args:
            in_data: ``(img, label, bbox, mask, i)`` or
                ``(img, bbox, label, i)``.

        Returns:
            ``(img, bbox, label, scale, mask, i)``; ``mask`` is ``None`` for
            4-element samples. When the sample has no boxes the short tuple
            ``(img, [], [], 1)`` is returned instead.

        Raises:
            ValueError: if ``in_data`` has neither 4 nor 5 elements.
        """
        if len(in_data)==5:
            img, label, bbox, mask, i = in_data
        elif len(in_data)==4:
            img, bbox, label, i= in_data
            # No mask in this sample layout; previously ``mask`` stayed
            # unbound and the resize below failed with a NameError.
            mask = None
        else:
            raise ValueError(
                'expected a sample with 4 or 5 elements, got {}'.format(
                    len(in_data)))
        label = [self.labelids.index(l) for l in label]
        _, H, W = img.shape
        if chainer.config.train:
            img = self.net.prepare(img)
        _, o_H, o_W = img.shape
        scale = o_H / H
        if len(bbox)==0:
            return img, [],[],1
        bbox = resize_bbox(bbox, (H, W), (o_H, o_W))
        if mask is not None:
            mask = resize(mask,(o_H, o_W))
        if chainer.config.train:
            #horizontal flip, applied consistently to image, boxes and mask
            img, params = transforms.random_flip(
                img, x_random=True, return_param=True)
            bbox = transforms.flip_bbox(
                bbox, (o_H, o_W), x_flip=params['x_flip'])
            if mask is not None:
                mask = transforms.flip(mask, x_flip=params['x_flip'])
        return img, bbox, label, scale, mask, i

def convert(batch, device):
    """Concatenate a batch with ``concat_examples``, padding with ``-1``."""
    concat = chainer.dataset.convert.concat_examples
    return concat(batch, device, padding=-1)

def main():
    """Build the model, datasets and trainer from the CLI options and train.

    An exception raised by ``trainer.run()`` is printed with its traceback
    instead of being propagated; ``KeyboardInterrupt`` and ``SystemExit`` are
    no longer intercepted.
    """
    args = parse()
    np.random.seed(args.seed)
    print('arguments: ', args)

    # Model setup
    if args.dataset == 'coco2017':
        train_data = COCODataset()
    test_data = COCODataset(json_file='instances_val2017.json', name='val2017', id_list_file='val2017.txt')
    train_class_ids =train_data.class_ids
    test_ids = test_data.ids
    cocoanns = test_data.coco
    # NOTE: 'vgg16' is not among the --extractor choices and MaskRCNNVGG16 is
    # not imported, so the first branch is unreachable from the CLI.
    if args.extractor=='vgg16':
        mask_rcnn = MaskRCNNVGG16(n_fg_class=80, pretrained_model=args.pretrained, roi_size=args.roi_size, roi_align = args.roialign)
    elif args.extractor=='resnet50':
        mask_rcnn = MaskRCNNResNet(n_fg_class=80, pretrained_model=args.pretrained,roi_size=args.roi_size, n_layers=50, roi_align = args.roialign, class_ids=train_class_ids)
    elif args.extractor=='resnet101':
        mask_rcnn = MaskRCNNResNet(n_fg_class=80, pretrained_model=args.pretrained,roi_size=args.roi_size, n_layers=101, roi_align = args.roialign, class_ids=train_class_ids)
    mask_rcnn.use_preset('evaluate')
    model = MaskRCNNTrainChain(mask_rcnn, gamma=args.gamma, roi_size=args.roi_size)

    # Trainer setup
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    #optimizer = chainer.optimizers.Adam()#alpha=0.001, beta1=0.9, beta2=0.999 , eps=0.00000001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0001))

    train_data=TransformDataset(train_data, Transform(mask_rcnn, train_class_ids))
    test_data=TransformDataset(test_data, Transform(mask_rcnn, train_class_ids))
    train_iter = chainer.iterators.SerialIterator(
        train_data, batch_size=args.batchsize)
    test_iter = chainer.iterators.SerialIterator(
        test_data, batch_size=1, repeat=False, shuffle=False)
    updater = SubDivisionUpdater(train_iter, optimizer, device=args.gpu, subdivisions=args.batchsize)
    #updater = ParallelUpdater(train_iter, optimizer, devices={"main": 0, "second": 1}, converter=convert ) #for training with multiple GPUs
    trainer = training.Trainer(
        updater, (args.iteration, 'iteration'), out=args.out)

    # Extensions
    trainer.extend(
        extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'),
        trigger=(args.snapshot, 'iteration'))
    # Multiply lr by 10 once, at iteration ``lr_initialchange`` ...
    trainer.extend(extensions.ExponentialShift('lr', 10),
                       trigger=ManualScheduleTrigger(
                          [args.lr_initialchange], 'iteration'))
    # ... then multiply it by 0.1 every ``lr_step`` iterations.
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=(args.lr_step, 'iteration'))
    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, model.mask_rcnn)
    if args.freeze_bn:
        freeze_bn(model.mask_rcnn)
    if args.bn2affine:
        bn_to_affine(model.mask_rcnn)
    log_interval = 40, 'iteration'
    print_interval = 40, 'iteration'

    #trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu), trigger=(args.validation, 'iteration'))
    #trainer.extend(DetectionCOCOEvaluator(test_iter, model.mask_rcnn), trigger=(args.validation, 'iteration')) #COCO AP Evaluator with VOC metric
    trainer.extend(COCOAPIEvaluator(test_iter, model.mask_rcnn, test_ids, cocoanns), trigger=(args.validation, 'iteration')) #COCO AP Evaluator
    trainer.extend(chainer.training.extensions.observe_lr(),
                   trigger=log_interval)
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.PrintReport(
        ['iteration', 'epoch', 'elapsed_time', 'lr',
         'main/loss',
         'main/avg_loss',
         'main/roi_loc_loss',
         'main/roi_cls_loss',
         'main/roi_mask_loss',
         'main/rpn_loc_loss',
         'main/rpn_cls_loss',
         'validation/main/loss',
         'validation/main/map',
         ]), trigger=print_interval)
    trainer.extend(extensions.ProgressBar(update_interval=1000))
    #trainer.extend(extensions.dump_graph('main/loss'))
    try:
        trainer.run()
    except Exception:
        # Report training failures without a hard crash, but let
        # KeyboardInterrupt / SystemExit propagate (a bare ``except:``
        # swallowed those as well).
        traceback.print_exc()

if __name__ == '__main__':
    main()


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/bn_utils.py
================================================
import numpy as np
import cupy

def freeze_bn(model):
    """Call ``disable_update()`` on every batch-norm link of the backbone.

    Covers ``model.extractor.bn1`` and, for each unit listed in ``_forward``
    of ``extractor.res2``/``res3``/``res4`` and ``head.res5``, the links
    ``bn1``..``bn3`` (plus ``bn4`` for the unit named ``'a'``).
    """
    def _freeze_block(res_block):
        for unit_name in res_block._forward:
            unit = getattr(res_block, unit_name)
            bn_names = ['bn1', 'bn2', 'bn3']
            # only the unit named 'a' carries a fourth batch-norm link
            if unit_name == 'a':
                bn_names.append('bn4')
            for bn_name in bn_names:
                getattr(unit, bn_name).disable_update()

    model.extractor.bn1.disable_update()
    for owner_name, block_name in (('extractor', 'res2'),
                                   ('extractor', 'res3'),
                                   ('extractor', 'res4'),
                                   ('head', 'res5')):
        _freeze_block(getattr(getattr(model, owner_name), block_name))
    print("batchnorm update disabled!")

def bn_to_affine(model):
    """Reset the running statistics of every backbone batch-norm link.

    For ``model.extractor.bn1`` and every batch-norm link in
    ``extractor.res2``/``res3``/``res4`` and ``head.res5``, ``avg_mean`` is
    replaced by zeros and ``avg_var`` by ``1 - eps`` (so ``avg_var + eps`` is
    1), both as float32 arrays.  The new arrays are created with the array
    module (numpy or cupy) of the existing statistics, so the conversion
    works for CPU and GPU models, and each link uses its own ``eps``.
    """
    def _reset_stats(bn):
        # Stay on the array backend the link currently uses.
        xp = cupy.get_array_module(bn.avg_mean)
        bn.avg_mean = xp.zeros(bn.avg_mean.shape, dtype=np.float32)
        bn.avg_var = xp.ones(bn.avg_var.shape, dtype=np.float32) - bn.eps

    def bn_to_affine_block(block):
        for name in block._forward:
            l = getattr(block, name)
            _reset_stats(l.bn1)
            _reset_stats(l.bn2)
            _reset_stats(l.bn3)
            # only the unit named 'a' carries a fourth batch-norm link
            if name=='a':
                _reset_stats(l.bn4)

    _reset_stats(model.extractor.bn1)
    bn_to_affine_block(model.extractor.res2)
    bn_to_affine_block(model.extractor.res3)
    bn_to_affine_block(model.extractor.res4)
    bn_to_affine_block(model.head.res5)
    print("converted batchnorm to affine")

================================================
FILE: utils/box_utils.py
================================================
import numpy as np
import cupy
import cv2

def resize_bbox(bbox, in_size, out_size):
    """Convert ``(x, y, w, h)`` boxes to scaled ``(y0, x0, y1, x1)`` boxes.

    Args:
        bbox: array of shape ``(R, 4)`` with columns ``x, y, w, h``.
        in_size: ``(height, width)`` the boxes currently refer to.
        out_size: ``(height, width)`` of the target image.

    Returns:
        A new array with the dtype of ``bbox`` and columns
        ``y_min, x_min, y_max, x_max``; the input is not modified.
    """
    sy = float(out_size[0]) / in_size[0]
    sx = float(out_size[1]) / in_size[1]
    # Column views of the untouched input.
    x, y, w, h = bbox[:, 0], bbox[:, 1], bbox[:, 2], bbox[:, 3]
    resized = bbox.copy()
    resized[:, 0] = sy * y
    resized[:, 1] = sx * x
    resized[:, 2] = sy * (y + h)
    resized[:, 3] = sx * (x + w)
    return resized

def bbox_yxyx2xywh(bbox):
    """Convert ``(y_min, x_min, y_max, x_max)`` boxes to ``(x, y, w, h)``.

    Args:
        bbox: array of shape ``(R, 4)``.

    Returns:
        A new array with the dtype of ``bbox``; the input is not modified.
    """
    y_min, x_min, y_max, x_max = bbox[:, 0], bbox[:, 1], bbox[:, 2], bbox[:, 3]
    converted = bbox.copy()
    converted[:, 0] = x_min
    converted[:, 1] = y_min
    converted[:, 2] = x_max - x_min
    converted[:, 3] = y_max - y_min
    return converted

def im_mask(mask, size, bbox):
    """Paste a square RoI mask into a full-image ``uint8`` mask.

    Args:
        mask: 2-D square array holding the mask predicted for the box.
        size: ``(height, width)`` of the output image mask.
        bbox: box as ``(y_min, x_min, y_max, x_max)``.

    Returns:
        ``uint8`` array of shape ``(size[0], size[1])``. It is all zeros
        when the (expanded, clipped) box covers no pixel of the image.
    """
    # bboxes are already clipped to [0, w], [0, h]
    masksize = mask.shape[0]
    # pad the mask with a one-pixel zero border to avoid cv2.resize artifacts
    pmask = np.zeros((masksize + 2, masksize + 2), dtype=np.float32)
    pmask[1:-1, 1:-1] = mask
    # extend the box by the same ratio as the padding, keeping its centre
    scale = (masksize + 2) / masksize
    ex_w = (bbox[3] - bbox[1]) * scale
    ex_h = (bbox[2] - bbox[0]) * scale
    ex_x0 = (bbox[3] + bbox[1] - ex_w) / 2
    ex_y0 = (bbox[2] + bbox[0] - ex_h) / 2
    ex_x1 = (bbox[3] + bbox[1] + ex_w) / 2
    ex_y1 = (bbox[2] + bbox[0] + ex_h) / 2
    ex_bbox = np.asarray([ex_y0, ex_x0, ex_y1, ex_x1], dtype=np.int32)
    # whole-image-sized mask
    immask = np.zeros((size[0],size[1]), dtype=np.uint8)
    # Plain Python ints: used both as slice bounds and as the cv2 size.
    x0, x1 = int(max(ex_bbox[1], 0)), int(min(ex_bbox[3] + 1, size[1]))
    y0, y1 = int(max(ex_bbox[0], 0)), int(min(ex_bbox[2] + 1, size[0]))
    if x1 <= x0 or y1 <= y0:
        # Nothing to paste; cv2.resize rejects a non-positive target size.
        return immask
    immask_roi = cv2.resize(pmask, (x1 - x0, y1 - y0))
    # Round the interpolated values to the nearest integer before the cast.
    immask[y0:y1, x0:x1] = np.round(immask_roi).astype(np.uint8)
    return immask


================================================
FILE: utils/cocoapi_evaluator.py
================================================
import copy
import numpy as np

from chainer import reporter
import chainer.training.extensions

from utils import eval_detection_coco
from chainercv.utils import apply_prediction_to_iterator
import pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

class COCOAPIEvaluator(chainer.training.extensions.Evaluator):
    """Evaluator extension that scores predictions with the COCO API.

    The predictions of ``target.predict`` over the iterator are converted to
    COCO result dictionaries and evaluated with ``COCOeval`` for the
    ``'segm'`` and ``'bbox'`` annotation types. The value reported under
    ``'map'`` is ``stats[0]`` of the evaluation that ran last (``'bbox'``),
    or 0 when there are no predictions.

    Args:
        iterator: dataset iterator to evaluate on.
        target: link providing a ``predict`` method.
        ids: image ids; ``ids[i]`` is used as the image id of the i-th
            prediction.
        cocoanns: ground-truth ``COCO`` object (``loadRes`` is called on it).
        label_names: accepted but not used.
    """
    trigger = 1, 'epoch'
    default_name = 'validation'
    priority = chainer.training.PRIORITY_WRITER

    def __init__(
            self, iterator, target, ids, cocoanns, label_names=None):
        super(COCOAPIEvaluator, self).__init__(
            iterator, target)
        self.ids = ids
        self.cocoanns = cocoanns

    def evaluate(self):
        """Run prediction and COCO evaluation; return the observation dict."""
        iterator = self._iterators['main']
        target = self._targets['main']

        # Only the first two entries are evaluated below.
        annType = ['segm','bbox','keypoints']
        if hasattr(iterator, 'reset'):
            iterator.reset()
            it = iterator
        else:
            it = copy.copy(iterator)

        in_values, out_values, rest_values = apply_prediction_to_iterator(
            target.predict, it)
        # delete unused iterators explicitly
        del in_values

        pred_bboxes, pred_labels, pred_scores, pred_masks = out_values

        # The ground-truth values are unpacked according to the sample layout
        # but are not used for scoring (the COCO annotations are used).
        if len(rest_values) == 3:
            gt_bboxes, gt_labels, gt_difficults = rest_values
        elif len(rest_values) == 2:
            gt_bboxes, gt_labels = rest_values
            gt_difficults = None
        elif len(rest_values) == 5:
            gt_bboxes, gt_labels, _, _, i = rest_values
            gt_difficults = None
        # Materialise the prediction / ground-truth streams.
        pred_bboxes = iter(list(pred_bboxes))
        pred_labels = iter(list(pred_labels))
        pred_scores = iter(list(pred_scores))
        gt_bboxes = iter(list(gt_bboxes))
        gt_labels = iter(list(gt_labels))
        # One COCO result record per predicted instance.
        data_dict = []
        for i, (pred_bbox, pred_label, pred_score, pred_mask) in \
            enumerate(zip(pred_bboxes, pred_labels, pred_scores, pred_masks)):
            for bbox, label, score, mask in zip(pred_bbox, pred_label, pred_score, pred_mask):
                A={"image_id":int(self.ids[i]), "category_id":int(label), "bbox":bbox.tolist(),
                 "score":float(score), "segmentation": mask}
                data_dict.append(A)
        if len(data_dict)>0:
            for i in range(2):  # 'segm','bbox'
                cocoGt=self.cocoanns
                cocoDt=cocoGt.loadRes(data_dict)
                cocoEval = COCOeval(self.cocoanns, cocoDt, annType[i])
                cocoEval.params.imgIds  = [int(id_) for id_ in self.ids]
                cocoEval.evaluate()
                cocoEval.accumulate()
                cocoEval.summarize()
            # cocoEval is the evaluation from the last loop pass ('bbox').
            report = {'map': cocoEval.stats[0]} # report COCO AP (IoU=0.5:0:95)
        else:
            report = {'map': 0}
        observation = {}
        with reporter.report_scope(observation):
            reporter.report(report, target)
        return observation

================================================
FILE: utils/detection_coco_evaluator.py
================================================
import copy
import numpy as np

from chainer import reporter
import chainer.training.extensions

from utils import eval_detection_coco
from chainercv.utils import apply_prediction_to_iterator


class DetectionCOCOEvaluator(chainer.training.extensions.Evaluator):

    """An extension that evaluates a detection model on COCO categories.

    This extension iterates over an iterator, runs :meth:`predict` of the
    target on every image and evaluates the predicted bounding boxes with
    :func:`utils.eval_detection_coco.eval_detection_coco`, i.e. by average
    precisions (APs) and the mean of them (mean Average Precision, mAP).
    This extension reports the following values with keys.

    * :obj:`'map'`: Mean of average precisions (mAP).
    * :obj:`'ap/<label_names[l]>'`: Average precision for class \
        :obj:`label_names[l]`, where :math:`l` is the index of the class. \
        If the evaluation result has no entry for index :math:`l`, \
        :obj:`numpy.nan` is reported for that class.

    The report is also printed to the standard output.

    Args:
        iterator (chainer.Iterator): An iterator. After the image, each
            sample must carry two, three or five more values. The first two
            are used as ground truth bounding boxes and labels. With three
            values the third one is used as the :obj:`difficult` flags.
            With five values the last three are ignored.
        target (chainer.Link): A detection link. This link must have
            :meth:`predict` method that takes a list of images and returns
            five values, of which the first, third and fourth are used as
            :obj:`bboxes`, :obj:`labels` and :obj:`scores`.
        use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
            for calculating average precision. The default value is
            :obj:`False`.
        label_names (iterable of strings): An iterable of names of classes,
            indexed by label id. If :obj:`None` (default), a built-in list of
            COCO category names with :obj:`'background'` at index zero is
            used.

    """

    trigger = 1, 'epoch'
    default_name = 'validation'
    priority = chainer.training.PRIORITY_WRITER

    # Names used when the caller does not supply ``label_names``.
    _default_label_names = (
        'background',  # class zero
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
        'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
        'mirror', 'dining table', 'window', 'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
        'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
    )

    def __init__(
            self, iterator, target, use_07_metric=False, label_names=None):
        super(DetectionCOCOEvaluator, self).__init__(
            iterator, target)
        self.use_07_metric = use_07_metric
        # Honour an explicit ``label_names``; previously the argument was
        # accepted but always overwritten by the built-in list.
        if label_names is None:
            label_names = self._default_label_names
        self.label_names = list(label_names)

    def evaluate(self):
        """Run the target over the iterator and report mAP and per-class AP.

        Returns:
            dict: The observation filled by :func:`chainer.reporter.report`.

        Raises:
            ValueError: If the samples carry an unsupported number of
                values besides the image.

        """
        iterator = self._iterators['main']
        target = self._targets['main']

        if hasattr(iterator, 'reset'):
            iterator.reset()
            it = iterator
        else:
            it = copy.copy(iterator)

        in_values, out_values, rest_values = apply_prediction_to_iterator(
            target.predict, it)
        # delete unused iterators explicitly
        del in_values

        pred_bboxes, _, pred_labels, pred_scores, _ = out_values

        n_rest = len(rest_values)
        if n_rest == 3:
            gt_bboxes, gt_labels, gt_difficults = rest_values
        elif n_rest == 2 or n_rest == 5:
            # Only boxes and labels are used; any extra values are ignored.
            gt_bboxes, gt_labels = rest_values[0], rest_values[1]
            gt_difficults = None
        else:
            raise ValueError(
                'Each sample must provide 2, 3 or 5 values besides the '
                'image, but got {:d}.'.format(n_rest))

        result = eval_detection_coco.eval_detection_coco(
            pred_bboxes, pred_labels, pred_scores,
            gt_bboxes, gt_labels, gt_difficults,
            use_07_metric=self.use_07_metric)

        report = {'map': result['map']}

        if self.label_names is not None:
            for l, label_name in enumerate(self.label_names):
                try:
                    report['ap/{:s}'.format(label_name)] = result['ap'][l]
                except IndexError:
                    # The result only covers labels up to the largest one
                    # seen during evaluation.
                    report['ap/{:s}'.format(label_name)] = np.nan
        print(report)

        observation = {}
        with reporter.report_scope(observation):
            reporter.report(report, target)
        return observation

================================================
FILE: utils/detectron_parser.py
================================================
import numpy as np
import os
path = os.path.join(os.path.dirname(__file__), '../')
import sys
sys.path.append(path)
from mask_rcnn_resnet import MaskRCNNResNet
from chainer import serializers
import pickle

# Chainer model whose parameters are overwritten with the Detectron weights.
model = MaskRCNNResNet(n_fg_class=80, roi_size=14, pretrained_model='auto', anchor_scales=[2, 4, 8, 16, 32], n_layers=50, class_ids=[[1]])

modeldir = "modelfiles"
if not os.path.exists(modeldir):
    os.mkdir(modeldir)

# resnet50, end-to-end, C4
d_model_file = "modelfiles/model_final.pkl"
c_model_file = "modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz"

# SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
# Only run this script on a model file obtained from a trusted source.
with open(d_model_file, 'rb') as f:
    d = pickle.load(f, encoding='latin-1')['blobs']
d_key = sorted(d)

# Number of parameter arrays copied into the Chainer model so far.
parsecount = 0
for bl in d_key:
    if 'res' not in bl:
        continue
    stage = bl[3]  # resnet stage, 2, 3, 4, 5
    block = bl[5]  # block identifier within the stage ('0' is the first)
    if stage == '_':  # non-resnet layers
        continue

    stage = int(stage) - 1
    # The last stage lives in the head, the earlier ones in the extractor.
    netname = 'head' if stage == 4 else 'extractor'

    if 'branch2a' in bl:
        c_nlayer = 1
    elif 'branch2b' in bl:
        c_nlayer = 2
    elif 'branch2c' in bl:
        c_nlayer = 3
    elif 'branch1' in bl:
        c_nlayer = 4
    else:
        c_nlayer = 0

    # do not copy
    if bl.endswith('_b') and 'bn_b' not in bl:
        continue
    if 'momentum' in bl:
        continue

    # conv / bn gamma / bn beta
    if '_w' in bl:
        c_kind = 'conv%d.W' % c_nlayer
    elif 'bn_s' in bl:
        c_kind = 'bn%d.gamma' % c_nlayer
    elif 'bn_b' in bl:
        c_kind = 'bn%d.beta' % c_nlayer
    else:
        # Without this branch the kind of the previous blob would be
        # reused silently (or be undefined on the first iteration).
        print("unrecognized blob name, skipped:", bl)
        continue

    # chainer block kind
    if block == '0':
        c_block = 'a'
    else:
        c_block = 'b' + block

    # Resolve the target parameter by walking the attribute path instead
    # of building and exec-ing source code.
    c_path = "%s.res%d.%s.%s" % (netname, stage + 1, c_block, c_kind)
    c_param = model
    for attr_name in c_path.split('.'):
        c_param = getattr(c_param, attr_name)

    # shape checker
    c_shape = c_param.data.shape
    d_shape = d[bl].shape
    if c_shape == d_shape:
        # execute copy
        txt = "model.%s.data = d['%s']" % (c_path, bl)
        print(txt)
        c_param.data = d[bl]
        parsecount += 1
    else:
        print("shape mismatch error!")

# copy the other layers
# Each pair is (parameter path inside the Chainer model, Detectron blob name).
layer_pairs = \
[('extractor.conv1.W', 'conv1_w'), ('extractor.bn1.gamma', 'res_conv1_bn_s'), ('extractor.bn1.beta', 'res_conv1_bn_b'),
 ('rpn.conv1.W', 'conv_rpn_w'), ('rpn.conv1.b', 'conv_rpn_b'), 
 ('rpn.loc.W', 'rpn_bbox_pred_w'), ('rpn.loc.b', 'rpn_bbox_pred_b'), 
 ('rpn.score.W', 'rpn_cls_logits_w'), ('rpn.score.b', 'rpn_cls_logits_b'), 
 ('head.score.W', 'cls_score_w'), ('head.score.b', 'cls_score_b'), 
 ('head.cls_loc.W', 'bbox_pred_w'), ('head.cls_loc.b', 'bbox_pred_b'), 
 ('head.deconvm1.W', 'conv5_mask_w'), ('head.deconvm1.b', 'conv5_mask_b'),
 ('head.convm2.W', 'mask_fcn_logits_w'), ('head.convm2.b', 'mask_fcn_logits_b'),
]

def xytrans(src):
    """Swap neighbouring entries inside every group of four along axis 0.

    The first axis is split into consecutive groups of four entries and,
    within each group, entries 0/1 and entries 2/3 exchange places. The
    result has the same shape as the input.

    Presumably this converts bounding-box regression weights between an
    x-first and a y-first coordinate order -- confirm against the caller.

    Args:
        src (numpy.ndarray): Array whose first axis length is a multiple
            of four.

    Returns:
        numpy.ndarray: The re-ordered array with the shape of :obj:`src`.

    """
    original_shape = src.shape
    n_groups = original_shape[0] // 4
    grouped = src.reshape(n_groups, 4, -1)
    swapped = grouped[:, [1, 0, 3, 2]]
    return swapped.reshape(original_shape)

# Copy the layers listed in ``layer_pairs`` and save the converted model.
for layer_pair in layer_pairs:
    c_name, d_name = layer_pair

    # Resolve the target parameter by walking the attribute path instead
    # of building and exec-ing source code.
    c_param = model
    for attr_name in c_name.split('.'):
        c_param = getattr(c_param, attr_name)

    c_shape = c_param.data.shape
    d_shape = d[d_name].shape
    if 'bbox_pred' in d_name:
        # Applies to every blob whose name contains 'bbox_pred'
        # (this includes the 'rpn_bbox_pred_*' blobs).
        d[d_name] = xytrans(d[d_name])
    if c_shape == d_shape:
        txt = "model.%s.data = d['%s']" % layer_pair
        print(txt)
        c_param.data = d[d_name]
        parsecount += 1
    else:
        print("shape mismatch error!")

print(parsecount, " layers copied")
serializers.save_npz(c_model_file, model)
print("save weights file to a chainer model", c_model_file)

================================================
FILE: utils/eval_detection_coco.py
================================================
from __future__ import division

from collections import defaultdict
import itertools
import numpy as np
import six

from chainercv.utils.bbox.bbox_iou import bbox_iou


def eval_detection_coco(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5, use_07_metric=False):
    """Compute per-class average precision and its mean for detections.

    Precision and recall are obtained from
    :func:`calc_detection_coco_prec_rec` and turned into average precisions
    by :func:`calc_detection_coco_ap`. The mean ignores classes whose
    average precision is :obj:`numpy.nan`.

    Args:
        pred_bboxes (iterable of numpy.ndarray): Predicted bounding boxes,
            one array of shape :math:`(R, 4)` per image, where the second
            axis is :math:`y_{min}, x_{min}, y_{max}, x_{max}`.
        pred_labels (iterable of numpy.ndarray): Predicted labels, one
            array per image, aligned with :obj:`pred_bboxes`.
        pred_scores (iterable of numpy.ndarray): Confidence scores of the
            predicted boxes, aligned with :obj:`pred_bboxes`.
        gt_bboxes (iterable of numpy.ndarray): Ground truth bounding boxes,
            one array of shape :math:`(R, 4)` per image. The number of
            boxes may differ from the number of predictions.
        gt_labels (iterable of numpy.ndarray): Ground truth labels,
            aligned with :obj:`gt_bboxes`.
        gt_difficults (iterable of numpy.ndarray): Boolean arrays aligned
            with :obj:`gt_bboxes` that mark difficult boxes. If
            :obj:`None` (default), no box is treated as difficult.
        iou_thresh (float): IoU threshold forwarded to
            :func:`calc_detection_coco_prec_rec`.
        use_07_metric (bool): Whether to use the PASCAL VOC 2007 11-point
            metric. The default value is :obj:`False`.

    Returns:
        dict:

        * **ap** (*numpy.ndarray*): Average precision per class; the \
            :math:`l`-th value belongs to class :math:`l` and is \
            :obj:`numpy.nan` when it could not be computed.
        * **map** (*float*): :func:`numpy.nanmean` of **ap**.

    """
    precisions, recalls = calc_detection_coco_prec_rec(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        iou_thresh=iou_thresh)

    average_precisions = calc_detection_coco_ap(
        precisions, recalls, use_07_metric=use_07_metric)

    result = {}
    result['ap'] = average_precisions
    result['map'] = np.nanmean(average_precisions)
    return result


def calc_detection_coco_prec_rec(
        pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
        gt_difficults=None,
        iou_thresh=0.5):
    """Calculate precision and recall based on evaluation code of PASCAL VOC.

    This function calculates precision and recall of
    predicted bounding boxes obtained from a dataset which has :math:`N`
    images.
    The code is based on the evaluation code used in PASCAL VOC Challenge.

    Args:
        pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
            sets of bounding boxes.
            Its index corresponds to an index for the base dataset.
            Each element of :obj:`pred_bboxes` is a set of coordinates
            of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
            where :math:`R` corresponds
            to the number of bounding boxes, which may vary among boxes.
            The second axis corresponds to
            :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
        pred_labels (iterable of numpy.ndarray): An iterable of labels.
            Similar to :obj:`pred_bboxes`, its index corresponds to an
            index for the base dataset. Its length is :math:`N`.
        pred_scores (iterable of numpy.ndarray): An iterable of confidence
            scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
            its index corresponds to an index for the base dataset.
            Its length is :math:`N`.
        gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
            bounding boxes
            whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
            bounding box whose shape is :math:`(R, 4)`. Note that the number of
            bounding boxes in each image does not need to be same as the number
            of corresponding predicted boxes.
        gt_labels (iterable of numpy.ndarray): An iterable of ground truth
            labels which are organized similarly to :obj:`gt_bboxes`.
        gt_difficults (iterable of numpy.ndarray): An iterable of boolean
            arrays which is organized similarly to :obj:`gt_bboxes`.
            This tells whether the
            corresponding ground truth bounding box is difficult or not.
            By default, this is :obj:`None`. In that case, this function
            considers all bounding boxes to be not difficult.
        iou_thresh (float): A prediction can only be matched to a ground
            truth box if their Intersection over Union is not below this
            value.

    Returns:
        tuple of two lists:
        This function returns two lists: :obj:`prec` and :obj:`rec`.
        Both are empty if no label occurs in the predictions or in the
        ground truth (for example when the inputs are empty).

        * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
            for class :math:`l`. If class :math:`l` does not exist in \
            either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
            set to :obj:`None`.
        * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
            for class :math:`l`. If class :math:`l` that is not marked as \
            difficult does not exist in \
            :obj:`gt_labels`, :obj:`rec[l]` is \
            set to :obj:`None`.

    Raises:
        ValueError: If the input iterables do not have the same length.

    """

    pred_bboxes = iter(list(pred_bboxes))
    pred_labels = iter(list(pred_labels))
    pred_scores = iter(list(pred_scores))
    gt_bboxes = iter(list(gt_bboxes))
    gt_labels = iter(list(gt_labels))
    if gt_difficults is None:
        gt_difficults = itertools.repeat(None)
    else:
        gt_difficults = iter(gt_difficults)

    # Per-class accumulators over the whole dataset:
    #   n_pos: number of non-difficult ground truth boxes,
    #   score: scores of all predictions,
    #   match: 1 (true positive), 0 (false positive) or -1 (ignored,
    #          matched to a difficult box) for each prediction.
    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)

    for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
        six.moves.zip(
            pred_bboxes, pred_labels, pred_scores,
            gt_bboxes, gt_labels, gt_difficults):

        if gt_difficult is None:
            gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # Every prediction of this class is a false positive.
                match[l].extend((0,) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1

            iou = bbox_iou(pred_bbox_l, gt_bbox_l)
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            # selec[j] becomes True once ground truth j has been claimed;
            # predictions are visited in descending score order, so later
            # claims of the same box count as false positives.
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx >= 0:
                    if gt_difficult_l[gt_idx]:
                        match[l].append(-1)
                    else:
                        if not selec[gt_idx]:
                            match[l].append(1)
                        else:
                            match[l].append(0)
                    selec[gt_idx] = True
                else:
                    match[l].append(0)

    # zip stops at the shortest input; anything left over means the
    # inputs had different lengths.
    for iter_ in (
            pred_bboxes, pred_labels, pred_scores,
            gt_bboxes, gt_labels, gt_difficults):
        if next(iter_, None) is not None:
            raise ValueError('Length of input iterables need to be same.')

    if not n_pos:
        # No label was seen at all; ``max`` below would raise on an empty
        # sequence, so report "no classes" instead.
        return [], []

    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec


def calc_detection_coco_ap(prec, rec, use_07_metric=False):
    """Calculate average precisions based on evaluation code of PASCAL VOC.

    This function calculates average precisions
    from given precisions and recalls.
    The code is based on the evaluation code used in PASCAL VOC Challenge.

    Args:
        prec (list of numpy.array): A list of arrays.
            :obj:`prec[l]` indicates precision for class :math:`l`.
            If :obj:`prec[l]` is :obj:`None`, this function returns
            :obj:`numpy.nan` for class :math:`l`.
        rec (list of numpy.array): A list of arrays.
            :obj:`rec[l]` indicates recall for class :math:`l`.
            If :obj:`rec[l]` is :obj:`None`, this function returns
            :obj:`numpy.nan` for class :math:`l`.
        use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
            for calculating average precision. The default value is
            :obj:`False`.

    Returns:
        ~numpy.ndarray:
        This function returns an array of average precisions.
        The :math:`l`-th value corresponds to the average precision
        for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
        :obj:`None`, the corresponding value is set to :obj:`numpy.nan`.

    """

    n_fg_class = len(prec)
    # Start from NaN so classes without precision or recall need no
    # separate assignment.
    ap = np.full(n_fg_class, np.nan)
    for l in range(n_fg_class):
        if prec[l] is None or rec[l] is None:
            continue

        # NaN precision values (no counted prediction yet) count as zero.
        prec_l = np.nan_to_num(prec[l])
        rec_l = rec[l]

        if use_07_metric:
            # 11 point metric
            ap[l] = 0
            for t in np.arange(0., 1.1, 0.1):
                reached = rec_l >= t
                if np.sum(reached) == 0:
                    p = 0
                else:
                    p = np.max(prec_l[reached])
                ap[l] += p / 11
        else:
            # correct AP calculation
            # first append sentinel values at the end
            mpre = np.concatenate(([0], prec_l, [0]))
            mrec = np.concatenate(([0], rec_l, [1]))

            # make precision monotonically non-increasing along recall
            mpre = np.maximum.accumulate(mpre[::-1])[::-1]

            # to calculate area under PR curve, look for points
            # where X axis (recall) changes value
            i = np.where(mrec[1:] != mrec[:-1])[0]

            # and sum (delta recall) * prec
            ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])

    return ap

================================================
FILE: utils/makecocolist.py
================================================
import glob
# Write one image id per line (file name without directory and extension)
# for every training image. The order is whatever glob returns.
train_paths = glob.glob('COCO/train2017/*.jpg')
train_ids = [path.split('/')[-1].split('.')[0] for path in train_paths]

with open("COCO/train2017.txt", "w") as f:
    f.writelines(image_id + '\n' for image_id in train_ids)

# Same for the validation images, but only the first 1002 paths returned
# by glob are listed.
val_paths = glob.glob('COCO/val2017/*.jpg')
val_ids = [path.split('/')[-1].split('.')[0] for path in val_paths[:1002]]

with open("COCO/val2017.txt", "w") as f:
    f.writelines(image_id + '\n' for image_id in val_ids)



================================================
FILE: utils/proposal_target_creator.py
================================================
import numpy as np

from chainer import cuda

from chainercv.links.model.faster_rcnn.utils.bbox2loc import bbox2loc
from chainercv.utils.bbox.bbox_iou import bbox_iou
import cv2


class ProposalTargetCreator(object):
    """Assign ground truth bounding boxes and masks to given RoIs.

    The :meth:`__call__` of this class generates training targets
    for each object proposal.
    This follows the target assignment used to train Faster RCNN [#]_ and
    additionally produces a mask target for every foreground RoI.

    .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
    Faster R-CNN: Towards Real-Time Object Detection with \
    Region Proposal Networks. NIPS 2015.

    Args:
        n_sample (int): The number of sampled regions.
        pos_ratio (float): Fraction of regions that is labeled as a
            foreground.
        pos_iou_thresh (float): IoU threshold for a RoI to be considered as a
            foreground.
        neg_iou_thresh_hi (float): RoI is considered to be the background
            if IoU is in
            [:obj:`neg_iou_thresh_lo`, :obj:`neg_iou_thresh_hi`).
        neg_iou_thresh_lo (float): See above.
        roi_size (int): Mask targets are resized to
            :obj:`(2 * roi_size, 2 * roi_size)`.

    """

    def __init__(self,
                 n_sample=128,
                 pos_ratio=0.25, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0,
                 roi_size=7
                 ):
        self.roi_size = roi_size
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo

    def __call__(self, roi, bbox, label, mask,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        """Assigns ground truth to sampled proposals.

        This function samples at most :obj:`self.n_sample` RoIs
        from the combination of :obj:`roi` and :obj:`bbox`.
        The RoIs are assigned with the ground truth class labels as well as
        bounding box offsets and scales to match the ground truth bounding
        boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
        sampled as foregrounds.

        Offsets and scales of bounding boxes are calculated using
        :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
        All computation is done on the CPU; the outputs are moved to the GPU
        when :obj:`roi` is a GPU array.

        Here are notations.

        * :math:`S` is the total number of sampled RoIs, which is at most \
            :obj:`self.n_sample`.
        * :math:`P` is the number of sampled foreground RoIs.
        * :math:`L` is number of object classes possibly including the \
            background.

        Args:
            roi (array): Region of Interests (RoIs) from which we sample.
                Its shape is :math:`(R, 4)`
            bbox (array): The coordinates of ground truth bounding boxes.
                Its shape is :math:`(R', 4)`.
            label (array): Ground truth bounding box labels. Its shape
                is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
                :math:`L` is the number of foreground classes.
            mask (array): Ground truth masks, one per ground truth bounding
                box. Its shape is :math:`(R', H, W)`.
            loc_normalize_mean (tuple of four floats): Mean values to normalize
                coordinates of bounding boxes.
            loc_normalize_std (tuple of four floats): Standard deviation of
                the coordinates of bounding boxes.

        Returns:
            (array, array, array, array):

            * **sample_roi**: Regions of interests that are sampled. \
                Its shape is :math:`(S, 4)`.
            * **gt_roi_loc**: Offsets and scales to match \
                the sampled RoIs to the ground truth bounding boxes. \
                Its shape is :math:`(S, 4)`.
            * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \
                :math:`(S,)`. Its range is :math:`[0, L]`. The label with \
                value 0 is the background.
            * **gt_roi_mask**: Mask targets of the foreground RoIs, which \
                come first in :obj:`sample_roi`. Its shape is \
                :math:`(P, 2 \\cdot roi\\_size, 2 \\cdot roi\\_size)` and \
                its dtype is :obj:`numpy.int32`.

        """
        xp = cuda.get_array_module(roi)
        roi = cuda.to_cpu(roi)
        bbox = cuda.to_cpu(bbox)
        label = cuda.to_cpu(label)
        mask = cuda.to_cpu(mask)

        # Ground truth boxes are candidates too.
        roi = np.concatenate((roi, bbox), axis=0)

        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
        iou = bbox_iou(roi, bbox)
        gt_assignment = iou.argmax(axis=1)
        max_iou = iou.max(axis=1)

        # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
        # The label with value 0 is the background.
        gt_roi_label = label[gt_assignment] + 1

        # Select foreground RoIs as those with >= pos_iou_thresh IoU.
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        if pos_index.size > 0:
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)

        # Select background RoIs as those within
        # [neg_iou_thresh_lo, neg_iou_thresh_hi).
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
        neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                         neg_index.size))
        if neg_index.size > 0:
            neg_index = np.random.choice(
                neg_index, size=neg_roi_per_this_image, replace=False)

        # The indices that we're selecting (both positive and negative).
        # Positives come first, so row i of sample_roi with
        # i < len(pos_index) is the i-th positive RoI.
        keep_index = np.append(pos_index, neg_index)
        gt_roi_label = gt_roi_label[keep_index]
        gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
        sample_roi = roi[keep_index]  # sampled <- proposed

        # Compute offsets and scales to match sampled RoIs to the GTs.
        gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))

        # Prepare groundtruth masks
        gt_roi_mask = self._make_gt_roi_mask(
            mask, sample_roi, gt_assignment[pos_index])

        if xp != np:
            sample_roi = cuda.to_gpu(sample_roi)
            gt_roi_loc = cuda.to_gpu(gt_roi_loc)
            gt_roi_label = cuda.to_gpu(gt_roi_label)
            gt_roi_mask = cuda.to_gpu(gt_roi_mask)
        return sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask

    def _make_gt_roi_mask(self, mask, sample_roi, pos_gt_index):
        """Crop the assigned ground truth mask of every positive RoI.

        Row ``i`` of ``sample_roi`` is paired with ``pos_gt_index[i]``. Each
        crop is clipped to the mask extent and resized to
        ``(2 * roi_size, 2 * roi_size)``.
        """
        out_size = self.roi_size * 2
        _, h, w = mask.shape

        gt_roi_mask = []
        for i, idx in enumerate(pos_gt_index):
            y_min = max(int(sample_roi[i, 0]), 0)
            y_max = min(int(sample_roi[i, 2]), h)
            x_min = max(int(sample_roi[i, 1]), 0)
            x_max = min(int(sample_roi[i, 3]), w)
            crop = mask[idx, y_min:y_max, x_min:x_max]
            if crop.size == 0:
                # cv2.resize rejects an empty source; a RoI that covers no
                # whole pixel gets an all-zero target instead of crashing.
                gt_roi_mask.append(
                    np.zeros((out_size, out_size), dtype=mask.dtype))
            else:
                gt_roi_mask.append(cv2.resize(crop, (out_size, out_size)))

        if not gt_roi_mask:
            # np.stack raises on an empty list; no positive RoI simply means
            # an empty batch of mask targets.
            return np.zeros((0, out_size, out_size), dtype=np.int32)
        return np.stack(gt_roi_mask).astype(np.int32)


================================================
FILE: utils/region_proposal_network.py
================================================
import numpy as np

import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links as L

from chainercv.links.model.faster_rcnn.utils.generate_anchor_base import \
    generate_anchor_base
from chainercv.links.model.faster_rcnn.utils.proposal_creator import \
    ProposalCreator

class RegionProposalNetwork(chainer.Chain):

    """Region Proposal Network introduced in Faster R-CNN.

    This is Region Proposal Network introduced in Faster R-CNN [#]_.
    This takes features extracted from images and propose
    class agnostic bounding boxes around "objects".

    Unlike the ChainerCV implementation this one is derived from, the score
    branch emits a single value per anchor (``n_anchor * 1`` channels)
    instead of a background/foreground pair.

    .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
    Faster R-CNN: Towards Real-Time Object Detection with \
    Region Proposal Networks. NIPS 2015.

    Args:
        in_channels (int): The channel size of input.
        mid_channels (int): The channel size of the intermediate tensor.
        ratios (sequence of floats): This is ratios of width to height of
            the anchors.
        anchor_scales (sequence of numbers): This is areas of anchors.
            Those areas will be the product of the square of an element in
            :obj:`anchor_scales` and the original area of the reference
            window.
        feat_stride (int): Stride size after extracting features from an
            image.
        initialW (callable): Initial weight value. It is passed unchanged
            to each of the three :class:`chainer.links.Convolution2D`
            links created here.
        proposal_creator_params (dict): Key valued parameters for
            :class:`~chainercv.links.model.faster_rcnn.ProposalCreator`.
            :obj:`None` (default) means no extra parameters.

    .. seealso::
        :class:`~chainercv.links.model.faster_rcnn.ProposalCreator`

    """

    def __init__(
            self, in_channels=512, mid_channels=512, ratios=(0.5, 1, 2),
            anchor_scales=(8, 16, 32), feat_stride=16,
            initialW=None,
            proposal_creator_params=None,
    ):
        # Immutable/None defaults avoid sharing one mutable object between
        # every instance created with the default arguments.
        if proposal_creator_params is None:
            proposal_creator_params = {}
        self.anchor_base = generate_anchor_base(
            anchor_scales=list(anchor_scales), ratios=list(ratios))
        self.feat_stride = feat_stride
        self.proposal_layer = ProposalCreator(**proposal_creator_params)

        n_anchor = self.anchor_base.shape[0]
        super(RegionProposalNetwork, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(
                in_channels, mid_channels, 3, 1, 1, initialW=initialW)
            # One score channel per anchor (modified from chainercv).
            self.score = L.Convolution2D(
                mid_channels, n_anchor * 1, 1, 1, 0, initialW=initialW)
            self.loc = L.Convolution2D(
                mid_channels, n_anchor * 4, 1, 1, 0, initialW=initialW)

    def __call__(self, x, img_size, scale=1.):
        """Forward Region Proposal Network.

        Here are notations.

        * :math:`N` is batch size.
        * :math:`C` channel size of the input.
        * :math:`H` and :math:`W` are height and width of the input feature.
        * :math:`A` is number of anchors assigned to each pixel.

        Args:
            x (~chainer.Variable): The Features extracted from images.
                Its shape is :math:`(N, C, H, W)`.
            img_size (tuple of ints): A tuple :obj:`height, width`,
                which contains image size after scaling.
            scale (float): The amount of scaling done to the input images after
                reading them from files.

        Returns:
            (~chainer.Variable, ~chainer.Variable, array, array, array):

            This is a tuple of five following values.

            * **rpn_locs**: Predicted bounding box offsets and scales for \
                anchors. Its shape is :math:`(N, H W A, 4)`.
            * **rpn_scores**:  Predicted foreground scores for \
                anchors, one value per anchor. \
                Its shape is :math:`(N, H W A)`.
            * **rois**: A bounding box array containing coordinates of \
                proposal boxes.  This is a concatenation of bounding box \
                arrays from multiple images in the batch. \
                Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \
                bounding boxes from the :math:`i` th image, \
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            * **roi_indices**: An array containing indices of images to \
                which RoIs correspond to. Its shape is :math:`(R',)`.
            * **anchor**: Coordinates of enumerated shifted anchors. \
                Its shape is :math:`(H W A, 4)`.

        """
        n, _, hh, ww = x.shape
        anchor = _enumerate_shifted_anchor(
            self.xp.array(self.anchor_base), self.feat_stride, hh, ww)
        h = F.relu(self.conv1(x))

        # Move channels last so that flattening orders values as
        # (row, column, anchor), matching the order of ``anchor``.
        rpn_locs = self.loc(h).transpose((0, 2, 3, 1)).reshape((n, -1, 4))
        # With a single score per anchor the flattened scores are used both
        # as the returned ``rpn_scores`` and as the foreground scores fed to
        # the proposal layer (modified from chainercv).
        rpn_scores = self.score(h).transpose((0, 2, 3, 1)).reshape((n, -1))

        rois = []
        roi_indices = []
        for i in range(n):
            roi = self.proposal_layer(
                rpn_locs[i].array, rpn_scores[i].array, anchor, img_size,
                scale=scale)
            batch_index = i * self.xp.ones((len(roi),), dtype=np.int32)
            rois.append(roi)
            roi_indices.append(batch_index)
        rois = self.xp.concatenate(rois, axis=0)
        roi_indices = self.xp.concatenate(roi_indices, axis=0)
        return rpn_locs, rpn_scores, rois, roi_indices, anchor


def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
    """Place every base anchor at every cell of a ``height x width`` grid.

    Each grid cell contributes an offset ``(y, x, y, x)`` where ``y`` and
    ``x`` are multiples of ``feat_stride``.  The ``A`` base anchors are added
    to each of the ``K = height * width`` offsets by broadcasting
    ``(1, A, 4) + (K, 1, 4)``, and the result is flattened.

    Args:
        anchor_base (array): Base anchors, reshaped here to ``(1, A, 4)``.
        feat_stride (int): Distance between neighbouring grid cells.
        height (int): Number of grid rows.
        width (int): Number of grid columns.

    Returns:
        array: ``float32`` array of shape ``(K * A, 4)`` created with the
        same array module as ``anchor_base``.

    """
    xp = cuda.get_array_module(anchor_base)

    ys = xp.arange(0, height * feat_stride, feat_stride)
    xs = xp.arange(0, width * feat_stride, feat_stride)
    grid_x, grid_y = xp.meshgrid(xs, ys)
    flat_y = grid_y.ravel()
    flat_x = grid_x.ravel()
    offsets = xp.stack((flat_y, flat_x, flat_y, flat_x), axis=1)

    n_base = anchor_base.shape[0]
    n_cell = offsets.shape[0]
    # (1, A, 4) + (K, 1, 4) broadcasts to (K, A, 4).
    shifted = (anchor_base.reshape((1, n_base, 4))
               + offsets.reshape((n_cell, 1, 4)))
    return shifted.reshape((n_cell * n_base, 4)).astype(np.float32)

================================================
FILE: utils/roi_align_2d.py
================================================
# Modified work as ROIAlign:
# -----------------------------------------------------------------------------
# Copyright (c) 2018 DeNA
# -----------------------------------------------------------------------------

# Modified work:
# -----------------------------------------------------------------------------
# Copyright (c) 2015 Preferred Infrastructure, Inc.
# Copyright (c) 2015 Preferred Networks, Inc.
# -----------------------------------------------------------------------------

# Original work of forward_gpu and backward_gpu:
# -----------------------------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see fast-rcnn/LICENSE for details]
# Written by Ross Girshick
# -----------------------------------------------------------------------------

import numpy
import six

from chainer import cuda
from chainer import function
from chainer.utils import type_check

class ROIAlign2D(function.Function):

    """RoI align over a set of 2d planes."""

    def __init__(self, outh, outw, spatial_scale):
        self.outh, self.outw = outh, outw
        self.spatial_scale = spatial_scale

    def check_type_forward(self, in_types):
        type_check.expect(in_types.size() == 2)

        x_type, roi_type = in_types
        type_check.expect(
            x_type.dtype == numpy.float32,
            x_type.ndim == 4,
            roi_type.dtype == numpy.float32,
            roi_type.ndim == 2,
            roi_type.shape[1] == 5,
        )

    def forward_gpu(self, inputs):
        self.retain_inputs((1,))
        self._bottom_data_shape = inputs[0].shape

        bottom_data, bottom_rois = inputs
        #e.g. (batch, channel, h, w)=(1, 512, 38, 53) (n_rois, )=(128, 5)
        channels, height, width = bottom_data.shape[1:]
        n_rois = bottom_rois.shape[0]
        top_data = cuda.cupy.empty((n_rois, channels, self.outh,
                                    self.outw), dtype=numpy.float32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 bottom_data, float32 spatial_scale, int32 channels,
            int32 height, int32 width, int32 pooled_height, int32 pooled_width,
            raw float32 bottom_rois
            ''',
            'float32 top_data',
            '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int num = i / pooled_width / pooled_height / channels;

            // scale the ROI coordinates (1/16)
            float roi_batch_ind = bottom_rois[num * 5 + 0];
            float roi_start_w = bottom_rois[num * 5 + 1] * spatial_scale;
            float roi_start_h = bottom_rois[num * 5 + 2] * spatial_scale;
            float roi_end_w = bottom_rois[num * 5 + 3] * spatial_scale;
            float roi_end_h = bottom_rois[num * 5 + 4] * spatial_scale;

            // Force malformed ROIs to be 1x1
            float roi_width = max(roi_end_w - roi_start_w, 1.0);
            float roi_height = max(roi_end_h - roi_start_h, 1.0);

            // float bin size 
            float bin_size_h = roi_height / static_cast<float>(pooled_height);
            float bin_size_w = roi_width / static_cast<float>(pooled_width);
            float maxval = 0;
            int maxidx = -1;
            
            for (int j = 0; j < 4; j++) {
                int ih = j / 2;
                int iw = j % 2;
                float val = 0;
                // ROIAlign using the center of the bin
                float fh = roi_start_h + (static_cast<float>(ph) + 0.25 + static_cast<float>(ih) * 0.5f) * bin_size_h;
                float fw = roi_start_w + (static_cast<float>(pw) + 0.25 + static_cast<float>(iw) * 0.5f) * bin_size_w;
                
                if (fh < -1.0 || fh > height || fw < -1.0 || fw > width) {
                    continue;
                }

                int hstart = static_cast<int>(floor(fh));
                int wstart = static_cast<int>(floor(fw));
                int hend = hstart + 1;
                int wend = wstart + 1;

                if (hstart >= height - 1) {
                    hend = hstart = height - 1;
                    fh = static_cast<float>(hstart);
                } else {
                    hend = hstart + 1;
                }

                if (wstart >= width - 1) {
                    wend = wstart = width - 1;
                    fw = static_cast<float>(wstart);
                } else {
                    wend = wstart + 1;
                }
                float dh = fh - static_cast<float>(hstart);
                float dw = fw - static_cast<float>(wstart);

                //compute the max value in the bin
                int data_offset = (roi_batch_ind * channels + c) * height * width;

                val += (1.0 - dh) * (1.0 - dw) * bottom_data[data_offset + hstart * width + wstart];
                val += (1.0 - dh) * dw         * bottom_data[data_offset + hstart * width + wend];
                val += dh * (1.0 - dw)         * bottom_data[data_offset + hend * width + wstart];
                val += dh * dw                 * bottom_data[data_offset + hend * width + wend];

                maxval += val;
            }
            top_data = maxval / 4;
            
            ''', 'roi_pooling_2d_fwd'
        )(bottom_data, self.spatial_scale, channels, height, width,
          self.outh, self.outw, bottom_rois, top_data)
        return top_data,

    def backward_gpu(self, inputs, gy):
        bottom_rois = inputs[1]
        channels, height, width = self._bottom_data_shape[1:]
        bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, numpy.float32)
        cuda.cupy.ElementwiseKernel(
            '''
            raw float32 top_diff, int32 num_rois,
            float32 spatial_scale, int32 channels, int32 height, int32 width,
            int32 pooled_height, int32 pooled_width, raw float32 bottom_rois
            ''',
            'raw float32 bottom_diff',
            '''
            // pos in output filter
            int pw = i % pooled_width;
            int ph = (i / pooled_width) % pooled_height;
            int c = (i / pooled_width / pooled_height) % channels;
            int num = i / pooled_width / pooled_height / channels;

            // scale the ROI coordinates (1/16)
            float roi_batch_ind = bottom_rois[num * 5 + 0];
            float roi_start_w = bottom_rois[num * 5 + 1] * spatial_scale;
            float roi_start_h = bottom_rois[num * 5 + 2] * spatial_scale;
            float roi_end_w = bottom_rois[num * 5 + 3] * spatial_scale;
            float roi_end_h = bottom_rois[num * 5 + 4] * spatial_scale;

            // Force malformed ROIs to be 1x1
            float roi_width = max(roi_end_w - roi_start_w, 1.0);
            float roi_height = max(roi_end_h - roi_start_h, 1.0);

            // float bin size 
            float bin_size_h = roi_height / static_cast<float>(pooled_height);
            float bin_size_w = roi_width / static_cast<float>(pooled_width);
            int data_offset = (roi_batch_ind * channels + c) * height * width;
            
            for (int j = 0; j < 4; j++) {
                int ih = j / 2;
                int iw = j % 2;
                // ROIAlign using the center of the bin
                float fh = roi_start_h + (static_cast<float>(ph) + 0.25 + static_cast<float>(ih) * 0.5f) * bin_size_h;
                float fw = roi_start_w + (static_cast<float>(pw) + 0.25 + static_cast<float>(iw) * 0.5f) * bin_size_w;
                
                if (fh < -1.0 || fh > height || fw < -1.0 || fw > width) {
                    continue;
                }

                int hstart = static_cast<int>(floor(fh));
                int wstart = static_cast<int>(floor(fw));
                int hend = hstart + 1;
                int wend = wstart + 1;

                if (hstart >= height - 1) {
                    hend = hstart = height - 1;
                    fh = static_cast<float>(hstart);
                } else {
                    hend = hstart + 1;
                }

                if (wstart >= width - 1) {
                    wend = wstart = width - 1;
                    fw = static_cast<float>(wstart);
                } else {
                    wend = wstart + 1;
                }
                float dh = fh - static_cast<float>(hstart);
                float dw = fw - static_cast<float>(wstart);

                //atomic add: pointer, value
                atomicAdd(&bottom_diff[data_offset + hstart * width + wstart], top_diff[i] * (1.0 - dh) * (1.0 - dw) / 4);
                atomicAdd(&bottom_diff[data_offset + hstart * width + wend], top_diff[i] * (1.0 - dh) * dw         / 4);
                atomicAdd(&bottom_diff[data_offset + hend * width + wstart], top_diff[i] * dh         * (1.0 - dw) / 4);
                atomicAdd(&bottom_diff[data_offset + hend * width + wend], top_diff[i] * dh         * dw         / 4);
            }

            ''', 'roi_pooling_2d_bwd'
        )(gy[0], bottom_rois.shape[0], self.spatial_scale,
          channels, height, width, self.outh, self.outw,
          bottom_rois, bottom_diff, size=gy[0].size)
        
        return bottom_diff, None


def roi_align_2d(x, rois, outh, outw, spatial_scale):
    """Spatial Region of Interest (ROI) align function.

    This builds a :class:`ROIAlign2D` function from ``outh``, ``outw`` and
    ``spatial_scale`` and applies it to ``x`` and ``rois``.  Each output bin
    is the average of bilinearly interpolated samples of the input, not a
    maximum as in ROI pooling.

    Args:
        x (~chainer.Variable): Input variable. The shape is expected to be
            4 dimensional: (n: batch, c: channel, h: height, w: width).
        rois (~chainer.Variable): Input roi variable. The shape is expected to
            be (n: data size, 5), and each datum is set as below:
            (batch_index, x_min, y_min, x_max, y_max).
        outh (int): Height of the output for each roi.
        outw (int): Width of the output for each roi.
        spatial_scale (float): Factor applied to the roi coordinates before
            the input is sampled.

    Returns:
        ~chainer.Variable: Output variable.

    See the paper that introduced ROIPooling, which this layer refines:
    `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_.

    """
    align = ROIAlign2D(outh, outw, spatial_scale)
    return align(x, rois)


================================================
FILE: utils/updater.py
================================================
import copy
import six

from chainer.dataset import convert
from chainer.dataset import iterator as iterator_module
from chainer import function, variable
from chainer.training.updater import StandardUpdater
from chainer import reporter
from chainer import cuda

class SubDivisionUpdater(StandardUpdater):

    """Updater that processes each mini-batch in several sub-batches.

    One batch is drawn from the ``'main'`` iterator and split into
    ``subdivisions`` interleaved slices.  The loss is computed and
    back-propagated for every slice in turn, so gradients accumulate across
    slices, and the optimizer is stepped once afterwards.

    Args:
        iterator: Dataset iterator (forwarded to ``StandardUpdater``).
        optimizer: Optimizer (forwarded to ``StandardUpdater``).
        converter: Function turning a list of examples into arrays on
            ``device``.
        subdivisions (int): Number of sub-batches per update.  It must
            divide the iterator's ``batch_size`` evenly.
        device: Device passed to ``converter``.
        loss_func: Loss function.  If it is falsy, the optimizer's target
            link is used.

    """

    def __init__(self, iterator, optimizer, converter=convert.concat_examples,
                 subdivisions=1, device=None, loss_func=None):
        super(SubDivisionUpdater, self).__init__(
            iterator=iterator,
            optimizer=optimizer,
            converter=converter,
            device=device,
            loss_func=loss_func,
        )
        self._batchsize = self._iterators['main'].batch_size
        self._subdivisions = subdivisions
        # Number of examples per sub-batch.
        self._n = int(self._batchsize / self._subdivisions)
        assert self._batchsize % self._subdivisions == 0, (
            self._batchsize, self._subdivisions)

    def update_core(self):
        """Run one update: accumulate gradients per sub-batch, then step.

        Raises:
            TypeError: If the converter returns something that is neither a
                tuple nor a dict.

        """
        batch = self._iterators['main'].next()
        # Example i goes to sub-batch (i mod subdivisions).
        in_arrays_list = [
            self.converter(batch[i::self._subdivisions], self.device)
            for i in range(self._subdivisions)
        ]
        optimizer = self._optimizers['main']
        loss_func = self.loss_func or optimizer.target
        loss_func.cleargrads()

        losses = []
        for in_arrays in in_arrays_list:
            if isinstance(in_arrays, tuple):
                in_vars = [variable.Variable(x) for x in in_arrays]
                loss = loss_func(*in_vars)
            elif isinstance(in_arrays, dict):
                in_vars = {key: variable.Variable(x)
                           for key, x in six.iteritems(in_arrays)}
                loss = loss_func(in_vars)
            else:
                # Fail loudly instead of continuing with an undefined (or a
                # stale, previous-iteration) loss.
                raise TypeError(
                    'converter must return a tuple or a dict, got {}'.format(
                        type(in_arrays)))
            # Gradients are not cleared between sub-batches, so they add up.
            loss.backward()
            losses.append(cuda.to_cpu(loss.data))

        optimizer.update()

        # Report the mean loss over the sub-batches.  ``losses`` only ever
        # holds arrays returned by ``cuda.to_cpu``.
        avg_loss = 0.
        for loss in losses:
            avg_loss += loss
        avg_loss /= float(self._subdivisions)
        reporter.report({'loss': avg_loss}, loss_func)

================================================
FILE: utils/vis_bbox.py
================================================
from chainercv.visualizations.vis_image import vis_image
import numpy as np
from skimage.measure import find_contours
from matplotlib.patches import Polygon
import cv2

def vis_bbox(img, bbox, label=None, score=None, mask=None, label_names=None, ax=None, contour=False, labeldisplay=True):
    """Visualize bounding boxes and instance masks inside image.

    Example:

        >>> from chainercv.datasets import VOCDetectionDataset
        >>> from chainercv.datasets import voc_bbox_label_names
        >>> from utils.vis_bbox import vis_bbox
        >>> import matplotlib.pyplot as plot
        >>> dataset = VOCDetectionDataset()
        >>> img, bbox, label = dataset[60]
        >>> vis_bbox(img, bbox, label,
        ...         label_names=voc_bbox_label_names)
        >>> plot.show()

    Args:
        img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
            This is in RGB format and the range of its value is
            :math:`[0, 255]`.
        bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
            :math:`R` is the number of bounding boxes in the image.
            Each element is organized
            by :obj:`(y_min, x_min, y_max, x_max)` in the second axis.
        label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
            The values correspond to id for label names stored in
            :obj:`label_names`. This is optional.
        score (~numpy.ndarray): A float array of shape :math:`(R,)`.
             Each value indicates how confident the prediction is.
             This is optional.
        mask (iterable of ~numpy.ndarray): One mask per instance. Each mask
            is rounded and blended over the image with opacity 0.4 using a
            colour from a fixed cycle. Presumably each mask is
            :math:`(height, width)` with values in :math:`[0, 1]` -- confirm
            against the caller. If this is :obj:`None` (default), no mask
            is drawn.
        label_names (iterable of strings): Name of labels ordered according
            to label ids. If this is :obj:`None`, labels will be skipped.
        ax (matplotlib.axes.Axis): The visualization is displayed on this
            axis. If this is :obj:`None` (default), a new axis is created.
        contour (bool): If :obj:`True`, the 0.5-level contours of
            ``mask[i]`` are drawn for the ``i``-th box. Requires
            :obj:`mask`.
        labeldisplay (bool): If :obj:`False`, the label/score caption is not
            drawn.

    Returns:
        ~matploblib.axes.Axes:
        Returns the Axes object with the plot for further tweaking.

    Raises:
        ValueError: If :obj:`label` or :obj:`score` does not have the same
            length as :obj:`bbox`, if :obj:`contour` is set without a
            :obj:`mask`, or if a label id has no entry in
            :obj:`label_names`.

    """
    from matplotlib import pyplot as plot

    if label is not None and not len(bbox) == len(label):
        raise ValueError('The length of label must be same as that of bbox')
    if score is not None and not len(bbox) == len(score):
        raise ValueError('The length of score must be same as that of bbox')
    if contour and mask is None:
        raise ValueError('mask must be given when contour is True')

    # alpha-blend the masks
    COLOR = [(1, 1, 0), (1, 0, 1), (0, 1, 1), (0, 0, 1), (0, 1, 0),
             (1, 0, 0), (0.1, 1, 0.2)]
    dst = img.astype(float)
    if mask is not None:
        for i, m in enumerate(mask):
            # Per-pixel opacity: 0.4 inside the (rounded) mask, 0 outside.
            alpha = np.tile(np.round(m), (3, 1, 1)).astype(float) * 0.4
            # Solid image filled with this instance's colour.
            src1 = np.ones(dst.shape).astype(float)
            for j, col in enumerate(COLOR[i % len(COLOR)]):
                src1[j] *= col * 255
            dst = cv2.multiply(src1, alpha) + cv2.multiply(dst, 1 - alpha)

    # Returns newly instantiated matplotlib.axes.Axes object if ax is None
    ax = vis_image(dst, ax=ax)

    # If there is no bounding box to display, visualize the image and exit.
    if len(bbox) == 0:
        return ax

    # add boxes, contours and labels
    for i, bb in enumerate(bbox):
        # boxes
        xy = (bb[1], bb[0])
        height = int(bb[2]) - int(bb[0])
        width = int(bb[3]) - int(bb[1])
        ax.add_patch(plot.Rectangle(
            xy, width, height, fill=False, edgecolor='red', linewidth=1))

        # contours
        if contour:
            # The mask is transposed so that contour vertices come out in
            # (x, y) order, which is what Polygon expects.
            for verts in find_contours(mask[i].T, 0.5):
                ax.add_patch(Polygon(
                    verts, facecolor="none", edgecolor=[0.5, 0.5, 0.5]))

        # labels
        caption = []
        if label is not None and label_names is not None:
            lb = label[i]
            if not (0 <= lb < len(label_names)):
                raise ValueError('No corresponding name is given')
            caption.append(label_names[lb])
        if score is not None:
            caption.append('{:.2f}'.format(score[i]))

        if len(caption) > 0 and labeldisplay:
            ax.text(bb[1], bb[0],
                    ': '.join(caption),
                    style='italic',
                    fontsize=8,
                    color='white'
                    )
    return ax