Repository: DeNA/Chainer_Mask_R-CNN Branch: master Commit: 315a5b098978 Files: 25 Total size: 113.7 KB Directory structure: gitextract_iecqmj1q/ ├── .gitignore ├── LICENSE ├── README.md ├── README_JP.md ├── coco_dataset.py ├── demo.py ├── getcoco.sh ├── mask_rcnn.py ├── mask_rcnn_resnet.py ├── mask_rcnn_train_chain.py ├── mask_rcnn_train_chain_batch.py ├── train.py └── utils/ ├── __init__.py ├── bn_utils.py ├── box_utils.py ├── cocoapi_evaluator.py ├── detection_coco_evaluator.py ├── detectron_parser.py ├── eval_detection_coco.py ├── makecocolist.py ├── proposal_target_creator.py ├── region_proposal_network.py ├── roi_align_2d.py ├── updater.py └── vis_bbox.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ __pycache* result *.png ================================================ FILE: LICENSE ================================================ Copyright (c) 2018 DeNA Co., Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software; and THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ##################################################################################### # Chainer_Mask_R-CNN is designed based on chainercv's API. # Chainer_Mask_R-CNN's source code and documents contain the original chainercv ones. ##################################################################################### Copyright (c) 2017 Yusuke Niitani. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the chainercv Developers nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ##################################################################################### # Chainer_Mask_R-CNN is designed based on chainer's API. # Chainer_Mask_R-CNN's source code and documents contain the original chainer ones. ##################################################################################### Copyright (c) 2015 Preferred Infrastructure, Inc. Copyright (c) 2015 Preferred Networks, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the chainer Developers nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ~ ================================================ FILE: README.md ================================================ # Chainer\_Mask\_R-CNN Chainer implementation of Mask R-CNN - the multi-task network for object detection, object classification, and instance segmentation. (https://arxiv.org/abs/1703.06870) 日本語版 README ## What's New - Training result for R-50-C4 model has been evaluated! - COCO box AP = 0.346 using our trainer (0.355 with official boxes) - COCO mask AP = 0.287 using our trainer (0.314 with official boxes) ## Examples - to be updated ## Requirements - [Chainer](https://github.com/pfnet/chainer) - [Chainercv](https://github.com/chainer/chainercv) - [Cupy](https://github.com/cupy/cupy) (operable if your environment can run chainer > v3 with cuda and cudnn.) (verified as operable: chainer==3.1.0, chainercv==0.7.0, cupy==1.0.3) ``` $ pip install chainer $ pip install chainercv $ pip install cupy ``` - Python 3.0+ - NumPy - Matplotlib - OpenCV ## TODOs - [x] Precision Evaluator (bbox, COCO metric) - [x] Detectron Model Parser - [x] Modify ROIAlign - [x] Mask inference using refined ROIs - [x] Precision Evaluator (mask, COCO metric) - [ ] Improve segmentation AP for R-50-C4 model - [ ] Feature Pyramid Network (R-50-FPN) - [ ] Keypoint Detection (R-50-FPN, Keypoints) ## Benchmark Results
Box AP 50:95 Segm AP 50:95
Ours (1 GPU) 0.346 0.287
Detectron model 0.350 0.295
Detectron caffe2 0.355 0.314
## Inference with Pretrained Models - Download the pretrained model from the [Model Zoo] (https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md) (`model` link of `R-50-C4 Mask` at `End-to-End Faster & Mask R-CNN Baselines`) - Make `modelfiles` directory and put the downloaded file `model_final.pkl` in it - Execute: ``` python utils/detectron_parser.py ``` - And the converted model file is saved in `modelfiles` - Run the demo: ``` python demo.py --bn2affine --modelfile modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz --image ``` ## Prerequisites for training - Download 'ResNet-50-model.caffemodel' from the "OneDrive download" of [ResNet pretrained models](https://github.com/KaimingHe/deep-residual-networks#models) for model initialization and place it in ~/.chainer/dataset/pfnet/chainer/models/ - COCO 2017 dataset : the COCO dataset can be downloaded and unzipped by: ``` bash getcoco.sh ``` Setup the COCO API: ``` git clone https://github.com/waleedka/coco cd coco/PythonAPI/ make python setup.py install cd ../../ ``` note: the official coco repository is not python3 compatible. Use the repository above in order to run our evaluation. ## Train ``` python train.py ``` arguments and the default conditions are defined as follows: ``` '--dataset', choices=('coco2017'), default='coco2017' '--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network' '--gpu', '-g', type=int, default=0 '--lr', '-l', type=float, default=1e-4 '--batchsize', '-b', type=int, default=8 '--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta' '--bn2affine', action='store_true', default=False, help='batchnorm to affine' '--out', '-o', default='result', help='output directory' '--seed', '-s', type=int, default=0 '--roialign', action='store_true', default=True, help='True: ROIAlign, False: ROIpooling' '--step_size', '-ss', type=int, default=400000 '--lr_step', '-ls', type=int, default=480000 '--lr_initialchange', '-li', type=int, default=800 '--pretrained', '-p', type=str, default='imagenet' '--snapshot', type=int, default=4000 '--validation', type=int, default=30000 '--resume', type=str '--iteration', '-i', type=int, default=800000 '--roi_size', '-r', type=int, default=14, help='ROI size for mask head input' '--gamma', type=float, default=1, help='mask loss balancing factor' ``` note that we use a subdivision-based updater to enable training with large batch size. ## Demo Segment the objects in the input image by executing: ``` python demo.py --image --modelfile result/snapshot_model.npz --contour ``` ## Evaluation Evaluate the trained model with COCO metric (bounding box, segmentation) : ``` python train.py --lr 0 --iteration 1 --validation 1 --resume ``` ## Citation Please cite the original paper in your publications if it helps your research: @article{DBLP:journals/corr/HeGDG17, author = {Kaiming He and Georgia Gkioxari and Piotr Doll{\'{a}}r and Ross B. Girshick}, title = {Mask {R-CNN}}, journal = {CoRR}, volume = {abs/1703.06870}, year = {2017}, url = {http://arxiv.org/abs/1703.06870}, archivePrefix = {arXiv}, eprint = {1703.06870}, timestamp = {Wed, 07 Jun 2017 14:42:32 +0200}, biburl = {http://dblp.org/rec/bib/journals/corr/HeGDG17}, bibsource = {dblp computer science bibliography, http://dblp.org} } ================================================ FILE: README_JP.md ================================================ # Chainer\_Mask\_R-CNN マルチタスク検出器Mask R-CNNのchainer実装 (https://arxiv.org/abs/1703.06870) ## 実行例 - 準備中 ## 必要環境 - [Chainer](https://github.com/pfnet/chainer) - [Chainercv](https://github.com/chainer/chainercv) - [Cupy](https://github.com/cupy/cupy) (動作確認済み: chainer==3.1.0, chainercv==0.7.0, verified: cupy==1.0.3) ``` $ pip install chainer $ pip install chainercv $ pip install cupy==1.0.3 ``` - Python 3.0+ - NumPy - Matplotlib - OpenCV ## TODOs - [x] Precision Evaluator (bbox, COCO metric) - [x] Detectron Model Parser - [x] Modify ROIAlign - [x] Mask inference using refined ROIs - [x] Precision Evaluator (mask, COCO metric) - [ ] Feature Pyramid Network (R-50-FPN) - [ ] Keypoint Detection (R-50-FPN, Keypoints) ## 学習済みモデルの使用 - [Model Zoo] (https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md) からモデルファイルをダウンロード ( `End-to-End Faster & Mask R-CNN Baselines` の `R-50-C4 Mask` 行の `model` リンク) - `modelfiles` ディレクトリを作り、ダウンロードした `model_final.pkl` を置く - 以下を実行 ``` python utils/detectron_parser.py ``` - `modelfiles` の中に変換されたモデルファイルが保存されます。 - 以下によりデモを実行 ``` python demo.py --bn2affine --modelfile modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz --image ``` ## 学習のための準備 - 学習済みモデルのダウンロード ・以下リンク先の'OneDrive download'から、ResNet-50-model.caffemodelをダウンロード [ResNet pretrained models](https://github.com/KaimingHe/deep-residual-networks#models) ・~/.chainer/dataset/pfnet/chainer/models/ に置く - COCO 2017 データセット COCOデータセットのダウンロードと解凍: ``` bash getcoco.sh ``` - COCO APIのセットアップ: ``` git clone https://github.com/waleedka/coco cd coco/PythonAPI/ make python setup.py install cd ../../ ``` ## 学習 ``` python train.py ``` 引数は以下です: ``` '--dataset', choices=('coco2017'), default='coco2017' '--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network' '--gpu', '-g', type=int, default=0 '--lr', '-l', type=float, default=1e-4 '--batchsize', '-b', type=int, default=8 '--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta' '--bn2affine', action='store_true', default=False, help='batchnorm to affine' '--out', '-o', default='result', help='output directory' '--seed', '-s', type=int, default=0 '--roialign', action='store_true', default=True, help='True: ROIAlign, False: ROIpooling' '--step_size', '-ss', type=int, default=400000 '--lr_step', '-ls', type=int, default=480000 '--lr_initialchange', '-li', type=int, default=800 '--pretrained', '-p', type=str, default='imagenet' '--snapshot', type=int, default=4000 '--validation', type=int, default=30000 '--resume', type=str '--iteration', '-i', type=int, default=800000 '--roi_size', '-r', type=int, default=14, help='ROI size for mask head input' '--gamma', type=float, default=1, help='mask loss balancing factor' ``` 本実装ではsubdivisionを用いたupdateを行なっているため、batch size = 1 相当のGPUメモリでbatch size=8等を指定可能です ## デモ 入力画像のインスタンス・セグメンテーションを実行します: ``` python demo.py --image --modelfile result/snapshot_model.npz --contour ``` ### 評価 COCO metric (Bounding Box, Segmentation) によるモデルの評価を実行します。 ``` python train.py --lr 0 --iteration 1 --validation 1 --resume ``` ## 引用 Please cite the original paper in your publications if it helps your research: @article{DBLP:journals/corr/HeGDG17, author = {Kaiming He and Georgia Gkioxari and Piotr Doll{\'{a}}r and Ross B. Girshick}, title = {Mask {R-CNN}}, journal = {CoRR}, volume = {abs/1703.06870}, year = {2017}, url = {http://arxiv.org/abs/1703.06870}, archivePrefix = {arXiv}, eprint = {1703.06870}, timestamp = {Wed, 07 Jun 2017 14:42:32 +0200}, biburl = {http://dblp.org/rec/bib/journals/corr/HeGDG17}, bibsource = {dblp computer science bibliography, http://dblp.org} } ================================================ FILE: coco_dataset.py ================================================ import numpy as np from skimage.draw import polygon import json import os import cv2 import pycocotools from pycocotools.coco import COCO import chainer from chainercv.utils import read_image class COCODataset(chainer.dataset.DatasetMixin): def __init__(self, data_dir='COCO/', json_file='instances_train2017.json', name='train2017', id_list_file='train2017.txt', sizemin=10): self.data_dir = data_dir self.json_file = json_file self.coco = COCO(self.data_dir + 'annotations/'+self.json_file) self.ids = self.coco.getImgIds() self.name = name self.sizemin = sizemin self.class_ids = sorted(self.coco.getCatIds()) def __len__(self): return len(self.ids) def ann2rle(self, ann, height, width): if isinstance(ann, list): rles = pycocotools.mask.frPyObjects(ann, height, width) rle = pycocotools.mask.merge(rles) elif isinstance(ann['counts'], list): rle = pycocotools.mask.frPyObjects(ann, height, width) else: rle = ann return rle def get_example(self, i): #i = i % 500 # for limiting data size numofboxes=0 while True: id_ = self.ids[i] annot_labels, annot_bboxes, annot_segs= list(), list(), list() anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) annotations = self.coco.loadAnns(anno_ids) for a in annotations: if a['bbox'][2] > self.sizemin and a['bbox'][3] > self.sizemin \ and a['iscrowd']==0: annot_labels.append(a['category_id']) annot_bboxes.append(a['bbox']) annot_segs.append(a['segmentation']) numofboxes=len(annot_labels) if numofboxes > 0 or chainer.config.train == False: break else: i = i - 1 img_file = os.path.join(self.data_dir, self.name, '{:012}'.format(id_) + '.jpg') img = read_image(img_file, color=True) _, h, w = img.shape annot_masks = [] for annot_seg_polygons in annot_segs: rle = self.ann2rle(annot_seg_polygons, h, w) annot_masks.append(pycocotools.mask.decode(rle)) if numofboxes > 0: annot_masks = np.stack(annot_masks).astype(np.uint8) #y,x annot_bboxes = np.stack(annot_bboxes).astype(np.float32) annot_labels = np.stack(annot_labels).astype(np.int32) else: annot_labels, annot_bboxes, annot_masks = [], [], [] return img, annot_labels, annot_bboxes, annot_masks, i ================================================ FILE: demo.py ================================================ import argparse import chainer import numpy as np from mask_rcnn_train_chain import MaskRCNNTrainChain from utils.bn_utils import freeze_bn, bn_to_affine def main(): parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--modelfile') parser.add_argument('--image', type=str) parser.add_argument('--roi_size', '-r', type=int, default=14, help='ROI size for mask head input') parser.add_argument('--roialign', action='store_false', default=True, help='default: True') parser.add_argument('--contour', action='store_true', default=False, help='visualize contour') parser.add_argument('--background', action='store_true', default=False, help='background(no-display mode)') parser.add_argument('--bn2affine', action='store_true', default=False, help='batchnorm to affine') parser.add_argument('--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network') args = parser.parse_args() #network class id --> coco label id test_class_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, \ 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, \ 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] if args.background: import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plot from utils.vis_bbox import vis_bbox from chainercv.datasets import voc_bbox_label_names from mask_rcnn_resnet import MaskRCNNResNet from chainercv import utils if args.extractor=='resnet50': model = MaskRCNNResNet(n_fg_class=80, roi_size=args.roi_size, pretrained_model=args.modelfile, n_layers=50, roi_align=args.roialign, class_ids=test_class_ids) elif args.extractor=='resnet101': model = MaskRCNNResNet(n_fg_class=80, roi_size=args.roi_size, pretrained_model=args.modelfile, n_layers=101, roi_align=args.roialign, class_ids=test_class_ids) chainer.serializers.load_npz(args.modelfile, model) if args.gpu >= 0: chainer.cuda.get_device_from_id(args.gpu).use() model.to_gpu() if args.bn2affine: bn_to_affine(model) img = utils.read_image(args.image, color=True) bboxes, labels, scores, masks = model.predict([img]) bbox, label, score, mask = bboxes[0], np.asarray(labels[0],dtype=np.int32), scores[0], masks[0] #print(bbox, np.asarray(label,dtype=np.int32), score, mask) coco_label_names=('background', # class zero 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk','toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' ) vis_bbox( img, bbox, label=label, score=score, mask=mask, label_names=coco_label_names, contour=args.contour, labeldisplay=True) plot.show() filename = "output.png" plot.savefig(filename) if __name__ == '__main__': main() ================================================ FILE: getcoco.sh ================================================ # get COCO dataset mkdir COCO cd COCO wget http://images.cocodataset.org/zips/train2017.zip wget http://images.cocodataset.org/zips/val2017.zip wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip unzip train2017.zip unzip val2017.zip unzip annotations_trainval2017.zip rm -f train2017.zip rm -f val2017.zip rm -f annotations_trainval2017.zip ================================================ FILE: mask_rcnn.py ================================================ from __future__ import division import numpy as np import chainer from chainer import cuda import chainer.functions as F from chainercv.links.model.faster_rcnn.utils.loc2bbox import loc2bbox from chainercv.utils import non_maximum_suppression from chainercv.transforms.image.resize import resize import cv2 import pycocotools from utils.box_utils import bbox_yxyx2xywh, im_mask class MaskRCNN(chainer.Chain): def __init__(self, extractor, rpn, head, mean, min_size=600, max_size=1000, loc_normalize_mean=(0., 0., 0., 0.), loc_normalize_std=(0.1, 0.1, 0.2, 0.2), class_ids=[] ): print("MaskRCNN initialization") super(MaskRCNN, self).__init__() with self.init_scope(): self.extractor = extractor self.rpn = rpn self.head = head self.mean = mean self.min_size = min_size self.max_size = max_size self.loc_normalize_mean = loc_normalize_mean self.loc_normalize_std = loc_normalize_std self.use_preset('visualize') if class_ids==[]: raise ValueError('set class ids') self.class_ids = class_ids self.preset = 'visualize' @property def n_class(self): return self.head.n_class def __call__(self, x, scale=1.): img_size = x.shape[2:] h = self.extractor(x) #VGG rpn_locs, rpn_scores, rois, roi_indices, anchor = \ self.rpn(h, img_size, scale) #Region Proposal Network hres5 = self.head.res5head(h, rois, roi_indices) roi_cls_locs, roi_scores = self.head.boxhead(hres5) return roi_cls_locs, roi_scores, rois, roi_indices, h def use_preset(self, preset): if preset == 'visualize': self.nms_thresh = 0.3 self.score_thresh = 0.7 self.preset = 'visualize' elif preset == 'evaluate': self.nms_thresh = 0.5 self.score_thresh = 0.05 self.preset = 'evaluate' else: raise ValueError('preset must be visualize or evaluate') def prepare(self, img): _, H, W = img.shape scale = self.min_size / min(H, W) if scale * max(H, W) > self.max_size: scale = self.max_size / max(H, W) #img = resize(img, (int(H * scale), int(W * scale))) img = img.transpose((1,2,0)) img = cv2.resize(img, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) img = img.transpose((2,0,1)) img = (img - self.mean).astype(np.float32, copy=False) img = img[::-1, :, :] # RGB to BGR order for resnet pretrained model return img def _suppress(self, raw_cls_bbox, raw_cls_roi, raw_prob): bbox = list() roi = list() label = list() score = list() mask = list() for l in range(1, self.n_class): cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] cls_roi_l = raw_cls_roi.reshape((-1, self.n_class, 4))[:, l, :] prob_l = raw_prob[:, l] lmask = prob_l > self.score_thresh cls_bbox_l = cls_bbox_l[lmask] cls_roi_l = cls_roi_l[lmask] prob_l = prob_l[lmask] keep = non_maximum_suppression(cls_bbox_l, self.nms_thresh, prob_l) bbox.append(cls_bbox_l[keep]) roi.append(cls_roi_l[keep]) label.append((l - 1) * np.ones((len(keep),))) score.append(prob_l[keep]) bbox = np.concatenate(bbox, axis=0).astype(np.float32) roi = np.concatenate(roi, axis=0).astype(np.float32) label = np.concatenate(label, axis=0).astype(np.float32) score = np.concatenate(score, axis=0).astype(np.float32) return bbox, roi, label, score def predict(self, imgs): prepared_imgs = list() sizes = list() #print("predicting!") for img in imgs: size = img.shape[1:] img = self.prepare(img.astype(np.float32)) prepared_imgs.append(img) sizes.append(size) bboxes = list() out_rois = list() labels = list() scores = list() masks = list() for img, size in zip(prepared_imgs, sizes): with chainer.using_config('train', False), \ chainer.function.no_backprop_mode(): img_var = chainer.Variable(self.xp.asarray(img[None])) scale = img_var.shape[3] / size[1] roi_cls_locs, roi_scores, rois, _, h = self.__call__(img_var, scale=scale) #assuming batch size = 1 roi_cls_loc = roi_cls_locs.data roi_score = roi_scores.data roi = rois / scale mean = self.xp.tile(self.xp.asarray(self.loc_normalize_mean), self.n_class) std = self.xp.tile(self.xp.asarray(self.loc_normalize_std), self.n_class) roi_cls_loc = (roi_cls_loc * std + mean).astype(np.float32) roi_cls_loc = roi_cls_loc.reshape((-1, self.n_class, 4)) roi = self.xp.broadcast_to(roi[:, None], roi_cls_loc.shape).reshape((-1, 4)) cls_bbox = loc2bbox(roi, roi_cls_loc.reshape((-1, 4))) cls_bbox = cls_bbox.reshape((-1, self.n_class * 4)) cls_roi = roi.reshape((-1, self.n_class * 4)) #clip the bbox cls_bbox[:, 0::2] = self.xp.clip(cls_bbox[:, 0::2], 0, size[0]) cls_bbox[:, 1::2] = self.xp.clip(cls_bbox[:, 1::2], 0, size[1]) cls_roi[:, 0::2] = self.xp.clip(cls_roi[:, 0::2], 0, size[0]) cls_roi[:, 1::2] = self.xp.clip(cls_roi[:, 1::2], 0, size[1]) prob = F.softmax(roi_score).data raw_cls_bbox = cuda.to_cpu(cls_bbox) raw_cls_roi = cuda.to_cpu(cls_roi) raw_prob = cuda.to_cpu(prob) bbox, out_roi, label, score = self._suppress(raw_cls_bbox, raw_cls_roi, raw_prob) mask=[] if len(bbox) > 0: # mask head roi_indices = self.xp.zeros((len(bbox),), dtype=np.int32) with chainer.using_config('train', False), \ chainer.function.no_backprop_mode(): hres5 = self.head.res5head(h, cuda.to_gpu(bbox * scale), roi_indices) roi_masks = self.head.maskhead(hres5) roi_mask = F.sigmoid(roi_masks).data raw_mask = cuda.to_cpu(roi_mask) # postprocess if self.preset == 'evaluate': bboxes.append(bbox_yxyx2xywh(bbox)) wmasks = [] for m, b, l in zip(raw_mask, bbox, label): wm = im_mask(m[int(l+1)], size, b) # encode the mask wm = pycocotools.mask.encode(np.asfortranarray(wm)) wm['counts'] = wm['counts'].decode('ascii') mask.append(wm) elif self.preset == 'visualize': bboxes.append(bbox) wmasks = [] for m, b, l in zip(raw_mask, bbox, label): wm = im_mask(m[int(l+1)], size, b) mask.append(wm) elif self.preset == 'evaluate': # len(bbox) = 0 wm = np.zeros((size[0], size[1]), dtype=np.uint8) wm = pycocotools.mask.encode(np.asfortranarray(wm)) wm['counts'] = wm['counts'].decode('ascii') mask.append(wm) bboxes.append(bbox_yxyx2xywh(bbox)) labels.append([self.class_ids[int(l)] for l in label.tolist()]) scores.append(score) masks.append(mask) return bboxes, labels, scores, masks ================================================ FILE: mask_rcnn_resnet.py ================================================ import numpy as np import chainer import chainer.functions as F import chainer.links as L from mask_rcnn import MaskRCNN #from chainercv.links.model.faster_rcnn.region_proposal_network import \ # RegionProposalNetwork from utils.region_proposal_network import RegionProposalNetwork from utils import roi_align_2d from chainer.links.model.vision.resnet import BuildingBlock, _retrieve from chainer.links.connection.convolution_2d import Convolution2D from chainer.links.connection.linear import Linear from chainer.links.normalization.batch_normalization import BatchNormalization from chainer.initializers import constant class ExtractorResNet(chainer.link.Chain): def __init__(self, pretrained_model='auto', n_layers=50, roi_size=14): super(ExtractorResNet, self).__init__() print('Extractor ResNet',n_layers,' initialization') kwargs = {'initialW': constant.Zero()} if pretrained_model=='auto': if n_layers == 50: pretrained_model = 'ResNet-50-model.caffemodel' block = [3, 4, 6, 3] elif n_layers == 101: pretrained_model = 'ResNet-101-model.caffemodel' block = [3, 4, 23, 3] with self.init_scope(): self.conv1 = Convolution2D(3, 64, 7, 2, 3, **kwargs, nobias=True) self.bn1 = BatchNormalization(64) self.res2 = BuildingBlock(block[0], 64, 64, 256, 1, **kwargs) self.res3 = BuildingBlock(block[1], 256, 128, 512, 2, **kwargs) self.res4 = BuildingBlock(block[2], 512, 256, 1024, 2, **kwargs) self.res5 = BuildingBlock(block[3], 1024, 512, 2048, roi_size//7, **kwargs) self.fc6 = Linear(2048, 1000) if pretrained_model and pretrained_model.endswith('.caffemodel'): _retrieve(n_layers, 'ResNet-{}-model.npz'.format(n_layers), pretrained_model, self) elif pretrained_model: npz.load_npz(pretrained_model, self) del self.fc6 def __call__(self, x): h = F.relu(self.bn1(self.conv1(x))) _, _, H, W = h.shape Hpool = (H + 1)//2 Wpool = (W + 1)//2 h = F.max_pooling_2d(h, ksize=3, stride=2, pad=1) h = h[:, :, :Hpool, :Wpool] h = self.res2(h) h = self.res3(h) h = self.res4(h) return h class MaskRCNNResNet(MaskRCNN): feat_stride = 16 def __init__(self, n_fg_class=None, pretrained_model=None, min_size=800, max_size=1333, ratios=[0.5 ,1, 2], anchor_scales=[2, 4, 8, 16, 32], initialW=None, rpn_initialW=None, loc_initialW=None, score_initialW=None, proposal_creator_params={"n_test_pre_nms":6000,"n_test_post_nms": 1000,"min_size":4}, roi_size=14, class_ids=[], n_layers=50, roi_align=True ): print("MaskRNNResNet initialization") if n_fg_class is None: raise ValueError('supply n_fg_class!') if loc_initialW is None: loc_initialW = chainer.initializers.Normal(0.001) if score_initialW is None: score_initialW = chainer.initializers.Normal(0.01) if rpn_initialW is None: rpn_initialW = chainer.initializers.Normal(0.01) if initialW is None:# and pretrained_model: print("setting initialW") initialW = chainer.initializers.Normal(0.01) self.roi_size=roi_size if pretrained_model is not None: pretrained_model = 'auto' extractor = ExtractorResNet(pretrained_model, n_layers=n_layers, roi_size=roi_size) rpn = RegionProposalNetwork( 1024, 1024, ratios=ratios, anchor_scales=anchor_scales, feat_stride=self.feat_stride, initialW=rpn_initialW, proposal_creator_params=proposal_creator_params, ) head = MaskRCNNHead( n_fg_class + 1, roi_size=self.roi_size, spatial_scale=1. / self.feat_stride, initialW=initialW, loc_initialW=loc_initialW, score_initialW=score_initialW, roi_align=roi_align, reslayer=extractor.res5 ) del extractor.res5 super(MaskRCNNResNet, self).__init__( extractor, rpn, head, mean=np.array([122.7717, 115.9465, 102.9801], dtype=np.float32)[:, None, None], min_size=min_size, max_size=max_size, class_ids=class_ids ) class MaskRCNNHead(chainer.Chain): def __init__(self, n_class, roi_size, spatial_scale, initialW=None, loc_initialW=None, score_initialW=None, roi_align=True, reslayer=None): super(MaskRCNNHead, self).__init__() with self.init_scope(): self.res5 = reslayer#BuildingBlock(3, 1024, 512, 2048, 1, initialW=initialW) #class / loc branch self.cls_loc = L.Linear(2048, n_class * 4, initialW=initialW) self.score = L.Linear(2048, n_class, initialW=score_initialW) #Mask-RCNN branch self.deconvm1 = L.Deconvolution2D(2048, 256, 2, 2, initialW=initialW) self.convm2 = L.Convolution2D(256, n_class, 1, 1, pad=0,initialW=initialW) self.n_class = n_class self.roi_size = roi_size self.spatial_scale = spatial_scale self.roi_align = roi_align print("ROI Align=",roi_align) def res5head(self, x, rois, roi_indices): # extracted feature map -> pooling -> res5 block roi_indices = roi_indices.astype(np.float32) indices_and_rois = self.xp.concatenate( (roi_indices[:, None], rois), axis=1) #x: (batch, channel, w, h) #rois: (128, 4) (ROI indices) if self.roi_align: pool = _roi_align_2d_yx( x, indices_and_rois, self.roi_size,self.roi_size, self.spatial_scale) else: pool = _roi_pooling_2d_yx( x, indices_and_rois, self.roi_size,self.roi_size, self.spatial_scale) hres5 = self.res5(pool) return hres5 def maskhead(self, hres5): # mask branch h = F.relu(self.deconvm1(hres5)) masks=self.convm2(h) return masks def boxhead(self, hres5): # box branch h = F.average_pooling_2d(hres5, self.roi_size//2, stride=7) roi_cls_locs = self.cls_loc(h) roi_scores = self.score(h) return roi_cls_locs, roi_scores def _roi_pooling_2d_yx(x, indices_and_rois, outh, outw, spatial_scale): xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] pool = F.roi_pooling_2d( x, xy_indices_and_rois, outh, outw, spatial_scale) return pool def _roi_align_2d_yx(x, indices_and_rois, outh, outw, spatial_scale): xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] pool = roi_align_2d.roi_align_2d( x, xy_indices_and_rois, outh, outw, spatial_scale) return pool ================================================ FILE: mask_rcnn_train_chain.py ================================================ import numpy as np import chainer from chainer import cuda import chainer.functions as F from chainercv.links.model.faster_rcnn.utils.anchor_target_creator import AnchorTargetCreator from utils.proposal_target_creator import ProposalTargetCreator from chainer import computational_graph as c from chainercv.links import PixelwiseSoftmaxClassifier class MaskRCNNTrainChain(chainer.Chain): def __init__(self, mask_rcnn, rpn_sigma=3., roi_sigma=1., gamma=1, anchor_target_creator=AnchorTargetCreator(), roi_size=14): super(MaskRCNNTrainChain, self).__init__() with self.init_scope(): self.mask_rcnn = mask_rcnn self.rpn_sigma = rpn_sigma self.roi_sigma = roi_sigma self.anchor_target_creator = anchor_target_creator self.proposal_target_creator = ProposalTargetCreator(roi_size=roi_size//2) self.loc_normalize_mean = mask_rcnn.loc_normalize_mean self.loc_normalize_std = mask_rcnn.loc_normalize_std self.decayrate=0.99 self.avg_loss = None self.gamma=gamma def __call__(self, imgs, bboxes, labels, scale, masks, i): if isinstance(bboxes, chainer.Variable): bboxes = bboxes.data if isinstance(labels, chainer.Variable): labels = labels.data if isinstance(scale, chainer.Variable): scale = scale.data if isinstance(masks, chainer.Variable): masks = masks.data scale = np.asscalar(cuda.to_cpu(scale)) n = bboxes.shape[0] if n != 1: raise ValueError('only batch size 1 is supported') _, _, H, W = imgs.shape img_size = (H, W) #Extractor (VGG) : img -> features with chainer.using_config('train', False): features = self.mask_rcnn.extractor(imgs) #Region Proposal Network : features -> rpn_locs, rpn_scores, rois rpn_locs, rpn_scores, rois, roi_indices, anchor = self.mask_rcnn.rpn( features, img_size, scale) bbox, label, mask, rpn_score, rpn_loc, roi = \ bboxes[0], labels[0], masks[0], rpn_scores[0], rpn_locs[0], rois # batch size=1 #proposal target : roi(proposed) , bbox(GT), label(GT) -> sample_roi, gt_roi_loc, gt_roi_label #the targets are compared with the head output. sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask = self.proposal_target_creator( roi, bbox, label, mask, self.loc_normalize_mean, self.loc_normalize_std) sample_roi_index = self.xp.zeros((len(sample_roi),), dtype=np.int32) #Head Network : features, sample_roi -> roi_cls_loc, roi_score with chainer.using_config('train', False): hres5 = self.mask_rcnn.head.res5head(features, sample_roi, sample_roi_index) roi_cls_loc, roi_score = self.mask_rcnn.head.boxhead(hres5) roi_cls_mask = self.mask_rcnn.head.maskhead(hres5) del(hres5) #RPN losses gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size) rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma) rpn_cls_loss = F.sigmoid_cross_entropy(rpn_score, gt_rpn_label) #Head output losses n_sample = roi_cls_loc.shape[0] roi_cls_loc = roi_cls_loc.reshape((n_sample, -1, 4)) roi_loc = roi_cls_loc[self.xp.arange(n_sample), gt_roi_label] roi_mask = roi_cls_mask[self.xp.arange(n_sample), gt_roi_label] roi_loc_loss = _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label, self.roi_sigma) roi_cls_loss = F.softmax_cross_entropy(roi_score, gt_roi_label) #mask loss: average binary cross-entropy loss mask_loss = F.sigmoid_cross_entropy(roi_mask[0:gt_roi_mask.shape[0]], gt_roi_mask) #total loss loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss + self.gamma * mask_loss #avg loss calculation if self.avg_loss is None: self.avg_loss = loss.data else: self.avg_loss = self.avg_loss * self.decayrate + loss.data*(1-self.decayrate) chainer.reporter.report({'rpn_loc_loss':rpn_loc_loss, 'rpn_cls_loss':rpn_cls_loss, 'roi_loc_loss':roi_loc_loss, 'roi_cls_loss':roi_cls_loss, 'roi_mask_loss':self.gamma * mask_loss, 'avg_loss':self.avg_loss, 'loss':loss}, self) return loss def _smooth_l1_loss(x, t, in_weight, sigma): sigma2 = sigma ** 2 diff = in_weight * (x - t) abs_diff = F.absolute(diff) flag = (abs_diff.data < (1. / sigma2)).astype(np.float32) y = (flag * (sigma2 / 2.) * F.square(diff) + (1 - flag) * (abs_diff - 0.5 / sigma2)) return F.sum(y) def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): xp = chainer.cuda.get_array_module(pred_loc) in_weight = xp.zeros_like(gt_loc) in_weight[gt_label > 0] = 1 loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma) loc_loss /= xp.sum(gt_label >= 0) return loc_loss ================================================ FILE: mask_rcnn_train_chain_batch.py ================================================ import numpy as np import chainer from chainer import cuda import chainer.functions as F from chainercv.links.model.faster_rcnn.utils.anchor_target_creator import AnchorTargetCreator from utils.proposal_target_creator import ProposalTargetCreator from chainer import computational_graph as c from chainercv.links import PixelwiseSoftmaxClassifier class MaskRCNNTrainChain(chainer.Chain): def __init__(self, mask_rcnn, rpn_sigma=3., roi_sigma=1., gamma=1, anchor_target_creator=AnchorTargetCreator(), roi_size=7): super(MaskRCNNTrainChain, self).__init__() with self.init_scope(): self.mask_rcnn = mask_rcnn self.rpn_sigma = rpn_sigma self.roi_sigma = roi_sigma self.anchor_target_creator = anchor_target_creator self.proposal_target_creator = ProposalTargetCreator(roi_size=roi_size) self.loc_normalize_mean = mask_rcnn.loc_normalize_mean self.loc_normalize_std = mask_rcnn.loc_normalize_std self.decayrate=0.99 self.avg_loss = None self.gamma=gamma def __call__(self, imgs, bboxes, labels, scale, masks): if isinstance(bboxes, chainer.Variable): bboxes = bboxes.data if isinstance(labels, chainer.Variable): labels = labels.data if isinstance(scale, chainer.Variable): scale = scale.data if isinstance(masks, chainer.Variable): masks = masks.data scale = np.asscalar(cuda.to_cpu(scale[0])) n = bboxes.shape[0] #if n != 1: # raise ValueError('only batch size 1 is supported') _, _, H, W = imgs.shape img_size = (H, W) #Extractor (VGG) : img -> features features = self.mask_rcnn.extractor(imgs) #Region Proposal Network : features -> rpn_locs, rpn_scores, rois rpn_loc_loss,rpn_cls_loss, roi_loc_loss, roi_cls_loss, mask_loss= 0,0,0,0,0 for i in range(n): rpn_locs, rpn_scores, rois, roi_indices, anchor = self.mask_rcnn.rpn( features[i:i+1], img_size, scale) bbox, label, mask, rpn_score, rpn_loc, roi = \ bboxes[i], labels[i], masks[i], rpn_scores[0], rpn_locs[0], rois mask[mask>1]=0 numdata = sum(label>=0) label = label[0:numdata] bbox = bbox[0:numdata] mask = mask[0:numdata] #proposal target : roi(proposed) , bbox(GT), label(GT) -> sample_roi, gt_roi_loc, gt_roi_label #the targets are compared with the head output. sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask = self.proposal_target_creator( roi, bbox, label, mask, self.loc_normalize_mean, self.loc_normalize_std) sample_roi_index = self.xp.zeros((len(sample_roi),), dtype=np.int32) #Head Network : features, sample_roi -> roi_cls_loc, roi_score roi_cls_loc, roi_score, roi_cls_mask = self.mask_rcnn.head( features[i:i+1], sample_roi, sample_roi_index) #RPN losses gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size) rpn_loc_loss += _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma) rpn_cls_loss += F.softmax_cross_entropy(rpn_score, gt_rpn_label) #Head output losses n_sample = roi_cls_loc.shape[0] roi_cls_loc = roi_cls_loc.reshape((n_sample, -1, 4)) roi_loc = roi_cls_loc[self.xp.arange(n_sample), gt_roi_label] roi_mask = roi_cls_mask[self.xp.arange(n_sample), gt_roi_label] roi_loc_loss += _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label, self.roi_sigma) roi_cls_loss += F.softmax_cross_entropy(roi_score, gt_roi_label) #mask loss: average binary cross-entropy loss mask_loss += F.sigmoid_cross_entropy(roi_mask[0:gt_roi_mask.shape[0]], gt_roi_mask) #total loss loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss + self.gamma * mask_loss loss /= n #avg loss calculation if self.avg_loss is None: self.avg_loss = loss.data else: self.avg_loss = self.avg_loss * self.decayrate + loss.data*(1-self.decayrate) chainer.reporter.report({'rpn_loc_loss':rpn_loc_loss/n, 'rpn_cls_loss':rpn_cls_loss/n, 'roi_loc_loss':roi_loc_loss/n, 'roi_cls_loss':roi_cls_loss/n, 'roi_mask_loss':self.gamma * mask_loss/n, 'avg_loss':self.avg_loss, 'loss':loss}, self) return loss def _smooth_l1_loss(x, t, in_weight, sigma): sigma2 = sigma ** 2 diff = in_weight * (x - t) abs_diff = F.absolute(diff) flag = (abs_diff.data < (1. / sigma2)).astype(np.float32) y = (flag * (sigma2 / 2.) * F.square(diff) + (1 - flag) * (abs_diff - 0.5 / sigma2)) return F.sum(y) def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): xp = chainer.cuda.get_array_module(pred_loc) in_weight = xp.zeros_like(gt_loc) in_weight[gt_label > 0] = 1 loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight, sigma) loc_loss /= xp.sum(gt_label >= 0) return loc_loss ================================================ FILE: train.py ================================================ import chainer from chainer import training from chainer.training import extensions, ParallelUpdater from chainer.training.triggers import ManualScheduleTrigger from chainer.datasets import TransformDataset from chainercv.datasets import VOCBboxDataset, voc_bbox_label_names from chainercv import transforms from chainercv.transforms.image.resize import resize import argparse import numpy as np import time #from mask_rcnn_vgg import MaskRCNNVGG16 from mask_rcnn_resnet import MaskRCNNResNet from coco_dataset import COCODataset from mask_rcnn_train_chain import MaskRCNNTrainChain from utils.bn_utils import freeze_bn, bn_to_affine from utils.cocoapi_evaluator import COCOAPIEvaluator from utils.detection_coco_evaluator import DetectionCOCOEvaluator import logging import traceback from utils.updater import SubDivisionUpdater import cv2 def resize_bbox(bbox, in_size, out_size): bbox_o = bbox.copy() y_scale = float(out_size[0]) / in_size[0] x_scale = float(out_size[1]) / in_size[1] bbox_o[:, 0] = y_scale * bbox[:, 1] bbox_o[:, 2] = y_scale * (bbox[:, 1]+bbox[:, 3]) bbox_o[:, 1] = x_scale * bbox[:, 0] bbox_o[:, 3] = x_scale * (bbox[:, 0]+bbox[:, 2]) return bbox_o def parse(): parser = argparse.ArgumentParser( description='Mask RCNN trainer') parser.add_argument('--dataset', choices=('coco2017'), default='coco2017') parser.add_argument('--extractor', choices=('resnet50','resnet101'), default='resnet50', help='extractor network') parser.add_argument('--gpu', '-g', type=int, default=0) parser.add_argument('--lr', '-l', type=float, default=1e-4) parser.add_argument('--batchsize', '-b', type=int, default=8) parser.add_argument('--freeze_bn', action='store_true', default=False, help='freeze batchnorm gamma/beta') parser.add_argument('--bn2affine', action='store_true', default=False, help='batchnorm to affine') parser.add_argument('--out', '-o', default='result', help='Output directory') parser.add_argument('--seed', '-s', type=int, default=0) parser.add_argument('--roialign', action='store_false', default=True, help='default: True') parser.add_argument('--lr_step', '-ls', type=int, default=120000) parser.add_argument('--lr_initialchange', '-li', type=int, default=400) parser.add_argument('--pretrained', '-p', type=str, default='imagenet') parser.add_argument('--snapshot', type=int, default=4000) parser.add_argument('--validation', type=int, default=30000) parser.add_argument('--resume', type=str) parser.add_argument('--iteration', '-i', type=int, default=180000) parser.add_argument('--roi_size', '-r', type=int, default=14, help='ROI size for mask head input') parser.add_argument('--gamma', type=float, default=1, help='mask loss weight') return parser.parse_args() class Transform(object): def __init__(self, net, labelids): self.net = net self.labelids = labelids def __call__(self, in_data): if len(in_data)==5: img, label, bbox, mask, i = in_data elif len(in_data)==4: img, bbox, label, i= in_data label = [self.labelids.index(l) for l in label] _, H, W = img.shape if chainer.config.train: img = self.net.prepare(img) _, o_H, o_W = img.shape scale = o_H / H if len(bbox)==0: return img, [],[],1 bbox = resize_bbox(bbox, (H, W), (o_H, o_W)) mask = resize(mask,(o_H, o_W)) if chainer.config.train: #horizontal flip img, params = transforms.random_flip( img, x_random=True, return_param=True) bbox = transforms.flip_bbox( bbox, (o_H, o_W), x_flip=params['x_flip']) mask = transforms.flip(mask, x_flip=params['x_flip']) return img, bbox, label, scale, mask, i def convert(batch, device): return chainer.dataset.convert.concat_examples(batch, device, padding=-1) def main(): args = parse() np.random.seed(args.seed) print('arguments: ', args) # Model setup if args.dataset == 'coco2017': train_data = COCODataset() test_data = COCODataset(json_file='instances_val2017.json', name='val2017', id_list_file='val2017.txt') train_class_ids =train_data.class_ids test_ids = test_data.ids cocoanns = test_data.coco if args.extractor=='vgg16': mask_rcnn = MaskRCNNVGG16(n_fg_class=80, pretrained_model=args.pretrained, roi_size=args.roi_size, roi_align = args.roialign) elif args.extractor=='resnet50': mask_rcnn = MaskRCNNResNet(n_fg_class=80, pretrained_model=args.pretrained,roi_size=args.roi_size, n_layers=50, roi_align = args.roialign, class_ids=train_class_ids) elif args.extractor=='resnet101': mask_rcnn = MaskRCNNResNet(n_fg_class=80, pretrained_model=args.pretrained,roi_size=args.roi_size, n_layers=101, roi_align = args.roialign, class_ids=train_class_ids) mask_rcnn.use_preset('evaluate') model = MaskRCNNTrainChain(mask_rcnn, gamma=args.gamma, roi_size=args.roi_size) # Trainer setup if args.gpu >= 0: chainer.cuda.get_device_from_id(args.gpu).use() model.to_gpu() optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9) #optimizer = chainer.optimizers.Adam()#alpha=0.001, beta1=0.9, beta2=0.999 , eps=0.00000001) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0001)) train_data=TransformDataset(train_data, Transform(mask_rcnn, train_class_ids)) test_data=TransformDataset(test_data, Transform(mask_rcnn, train_class_ids)) train_iter = chainer.iterators.SerialIterator( train_data, batch_size=args.batchsize) test_iter = chainer.iterators.SerialIterator( test_data, batch_size=1, repeat=False, shuffle=False) updater = SubDivisionUpdater(train_iter, optimizer, device=args.gpu, subdivisions=args.batchsize) #updater = ParallelUpdater(train_iter, optimizer, devices={"main": 0, "second": 1}, converter=convert ) #for training with multiple GPUs trainer = training.Trainer( updater, (args.iteration, 'iteration'), out=args.out) # Extensions trainer.extend( extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'), trigger=(args.snapshot, 'iteration')) trainer.extend(extensions.ExponentialShift('lr', 10), trigger=ManualScheduleTrigger( [args.lr_initialchange], 'iteration')) trainer.extend(extensions.ExponentialShift('lr', 0.1), trigger=(args.lr_step, 'iteration')) if args.resume is not None: chainer.serializers.load_npz(args.resume, model.mask_rcnn) if args.freeze_bn: freeze_bn(model.mask_rcnn) if args.bn2affine: bn_to_affine(model.mask_rcnn) log_interval = 40, 'iteration' plot_interval = 160, 'iteration' print_interval = 40, 'iteration' #trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu), trigger=(args.validation, 'iteration')) #trainer.extend(DetectionCOCOEvaluator(test_iter, model.mask_rcnn), trigger=(args.validation, 'iteration')) #COCO AP Evaluator with VOC metric trainer.extend(COCOAPIEvaluator(test_iter, model.mask_rcnn, test_ids, cocoanns), trigger=(args.validation, 'iteration')) #COCO AP Evaluator trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.PrintReport( ['iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'main/avg_loss', 'main/roi_loc_loss', 'main/roi_cls_loss', 'main/roi_mask_loss', 'main/rpn_loc_loss', 'main/rpn_cls_loss', 'validation/main/loss', 'validation/main/map', ]), trigger=print_interval) trainer.extend(extensions.ProgressBar(update_interval=1000)) #trainer.extend(extensions.dump_graph('main/loss')) try: trainer.run() except: traceback.print_exc() if __name__ == '__main__': main() ================================================ FILE: utils/__init__.py ================================================ ================================================ FILE: utils/bn_utils.py ================================================ import numpy as np import cupy def freeze_bn(model): # freeze batchnorm update def disableupdate(block): for name in block._forward: l = getattr(block, name) l.bn1.disable_update() l.bn2.disable_update() l.bn3.disable_update() if name=='a': l.bn4.disable_update() model.extractor.bn1.disable_update() disableupdate(model.extractor.res2) disableupdate(model.extractor.res3) disableupdate(model.extractor.res4) disableupdate(model.head.res5) print("batchnorm update disabled!") def bn_to_affine(model): # change batchnorm layers to affine layers (mean -> 0, var -> 1) def bn_to_affine_block(block): for name in block._forward: l = getattr(block, name) l.bn1.avg_mean = cupy.zeros(l.bn1.avg_mean.shape, dtype=np.float32) l.bn1.avg_var = cupy.ones(l.bn1.avg_var.shape, dtype=np.float32) - l.bn1.eps l.bn2.avg_mean = cupy.zeros(l.bn2.avg_mean.shape, dtype=np.float32) l.bn2.avg_var = cupy.ones(l.bn2.avg_var.shape, dtype=np.float32) - l.bn1.eps l.bn3.avg_mean = cupy.zeros(l.bn3.avg_mean.shape, dtype=np.float32) l.bn3.avg_var = cupy.ones(l.bn3.avg_var.shape, dtype=np.float32) - l.bn1.eps if name=='a': l.bn4.avg_mean = cupy.zeros(l.bn4.avg_mean.shape, dtype=np.float32) l.bn4.avg_var = cupy.ones(l.bn4.avg_var.shape, dtype=np.float32) - l.bn1.eps model.extractor.bn1.avg_mean = cupy.zeros(model.extractor.bn1.avg_mean.shape, dtype=np.float32) model.extractor.bn1.avg_var = cupy.ones(model.extractor.bn1.avg_var.shape, dtype=np.float32) - model.extractor.bn1.eps bn_to_affine_block(model.extractor.res2) bn_to_affine_block(model.extractor.res3) bn_to_affine_block(model.extractor.res4) bn_to_affine_block(model.head.res5) print("converted batchnorm to affine") ================================================ FILE: utils/box_utils.py ================================================ import numpy as np import cupy import cv2 def resize_bbox(bbox, in_size, out_size): bbox_o = bbox.copy() y_scale = float(out_size[0]) / in_size[0] x_scale = float(out_size[1]) / in_size[1] bbox_o[:, 0] = y_scale * bbox[:, 1] bbox_o[:, 2] = y_scale * (bbox[:, 1]+bbox[:, 3]) bbox_o[:, 1] = x_scale * bbox[:, 0] bbox_o[:, 3] = x_scale * (bbox[:, 0]+bbox[:, 2]) return bbox_o def bbox_yxyx2xywh(bbox): bbox_o = bbox.copy() bbox_o[:, 0] = bbox[:, 1] bbox_o[:, 2] = bbox[:, 3] - bbox[:, 1] bbox_o[:, 1] = bbox[:, 0] bbox_o[:, 3] = bbox[:, 2] - bbox[:, 0] return bbox_o def im_mask(mask, size, bbox): # bboxes are already clipped to [0, w], [0, h] masksize = mask.shape[0] # pad the mask to avoid cv2.resize artifacts pmask = np.zeros((masksize + 2, masksize + 2), dtype=np.float32) pmask[1:-1, 1:-1] = mask # extend the boxhead scale = (masksize + 2) / masksize ex_w = (bbox[3] - bbox[1]) * scale ex_h = (bbox[2] - bbox[0]) * scale ex_x0 = (bbox[3] + bbox[1] - ex_w) / 2 ex_y0 = (bbox[2] + bbox[0] - ex_h) / 2 ex_x1 = (bbox[3] + bbox[1] + ex_w) / 2 ex_y1 = (bbox[2] + bbox[0] + ex_h) / 2 ex_bbox = np.asarray([ex_y0, ex_x0, ex_y1, ex_x1], dtype=np.int32) # whole-image-sized mask immask = np.zeros((size[0],size[1]), dtype=np.uint8) x0, x1 = max(ex_bbox[1], 0), min(ex_bbox[3] + 1, size[1]) y0, y1= max(ex_bbox[0], 0), min(ex_bbox[2] + 1, size[0]) immask_roi = cv2.resize(pmask, (x1 - x0, y1 - y0)) immask[y0:y1, x0:x1] = np.round(immask_roi).astype(np.uint8) return immask ================================================ FILE: utils/cocoapi_evaluator.py ================================================ import copy import numpy as np from chainer import reporter import chainer.training.extensions from utils import eval_detection_coco from chainercv.utils import apply_prediction_to_iterator import pycocotools from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval class COCOAPIEvaluator(chainer.training.extensions.Evaluator): trigger = 1, 'epoch' default_name = 'validation' priority = chainer.training.PRIORITY_WRITER def __init__( self, iterator, target, ids, cocoanns, label_names=None): super(COCOAPIEvaluator, self).__init__( iterator, target) self.ids = ids self.cocoanns = cocoanns def evaluate(self): iterator = self._iterators['main'] target = self._targets['main'] annType = ['segm','bbox','keypoints'] if hasattr(iterator, 'reset'): iterator.reset() it = iterator else: it = copy.copy(iterator) in_values, out_values, rest_values = apply_prediction_to_iterator( target.predict, it) # delete unused iterators explicitly del in_values pred_bboxes, pred_labels, pred_scores, pred_masks = out_values if len(rest_values) == 3: gt_bboxes, gt_labels, gt_difficults = rest_values elif len(rest_values) == 2: gt_bboxes, gt_labels = rest_values gt_difficults = None elif len(rest_values) == 5: gt_bboxes, gt_labels, _, _, i = rest_values gt_difficults = None pred_bboxes = iter(list(pred_bboxes)) pred_labels = iter(list(pred_labels)) pred_scores = iter(list(pred_scores)) gt_bboxes = iter(list(gt_bboxes)) gt_labels = iter(list(gt_labels)) data_dict = [] for i, (pred_bbox, pred_label, pred_score, pred_mask) in \ enumerate(zip(pred_bboxes, pred_labels, pred_scores, pred_masks)): for bbox, label, score, mask in zip(pred_bbox, pred_label, pred_score, pred_mask): A={"image_id":int(self.ids[i]), "category_id":int(label), "bbox":bbox.tolist(), "score":float(score), "segmentation": mask} data_dict.append(A) if len(data_dict)>0: for i in range(2): # 'segm','bbox' cocoGt=self.cocoanns cocoDt=cocoGt.loadRes(data_dict) cocoEval = COCOeval(self.cocoanns, cocoDt, annType[i]) cocoEval.params.imgIds = [int(id_) for id_ in self.ids] cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() report = {'map': cocoEval.stats[0]} # report COCO AP (IoU=0.5:0:95) else: report = {'map': 0} observation = {} with reporter.report_scope(observation): reporter.report(report, target) return observation ================================================ FILE: utils/detection_coco_evaluator.py ================================================ import copy import numpy as np from chainer import reporter import chainer.training.extensions from utils import eval_detection_coco from chainercv.utils import apply_prediction_to_iterator class DetectionCOCOEvaluator(chainer.training.extensions.Evaluator): """An extension that evaluates a detection model by PASCAL VOC metric. This extension iterates over an iterator and evaluates the prediction results by average precisions (APs) and mean of them (mean Average Precision, mAP). This extension reports the following values with keys. Please note that :obj:`'ap/'` is reported only if :obj:`label_names` is specified. * :obj:`'map'`: Mean of average precisions (mAP). * :obj:`'ap/'`: Average precision for class \ :obj:`label_names[l]`, where :math:`l` is the index of the class. \ For example, this evaluator reports :obj:`'ap/aeroplane'`, \ :obj:`'ap/bicycle'`, etc. if :obj:`label_names` is \ :obj:`~chainercv.datasets.voc_bbox_label_names`. \ If there is no bounding box assigned to class :obj:`label_names[l]` \ in either ground truth or prediction, it reports :obj:`numpy.nan` as \ its average precision. \ In this case, mAP is computed without this class. Args: iterator (chainer.Iterator): An iterator. Each sample should be following tuple :obj:`img, bbox, label` or :obj:`img, bbox, label, difficult`. :obj:`img` is an image, :obj:`bbox` is coordinates of bounding boxes, :obj:`label` is labels of the bounding boxes and :obj:`difficult` is whether the bounding boxes are difficult or not. If :obj:`difficult` is returned, difficult ground truth will be ignored from evaluation. target (chainer.Link): A detection link. This link must have :meth:`predict` method that takes a list of images and returns :obj:`bboxes`, :obj:`labels` and :obj:`scores`. use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric for calculating average precision. The default value is :obj:`False`. label_names (iterable of strings): An iterable of names of classes. If this value is specified, average precision for each class is also reported with the key :obj:`'ap/'`. """ trigger = 1, 'epoch' default_name = 'validation' priority = chainer.training.PRIORITY_WRITER def __init__( self, iterator, target, use_07_metric=False, label_names=None): super(DetectionCOCOEvaluator, self).__init__( iterator, target) self.use_07_metric = use_07_metric self.label_names = ['background', # class zero 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk','toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] def evaluate(self): iterator = self._iterators['main'] target = self._targets['main'] if hasattr(iterator, 'reset'): iterator.reset() it = iterator else: it = copy.copy(iterator) in_values, out_values, rest_values = apply_prediction_to_iterator( target.predict, it) # delete unused iterators explicitly del in_values pred_bboxes, _, pred_labels, pred_scores, _ = out_values if len(rest_values) == 3: gt_bboxes, gt_labels, gt_difficults = rest_values elif len(rest_values) == 2: gt_bboxes, gt_labels = rest_values gt_difficults = None elif len(rest_values) == 5: gt_bboxes, gt_labels, _, _, i = rest_values gt_difficults = None result = eval_detection_coco.eval_detection_coco( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults, use_07_metric=self.use_07_metric) report = {'map': result['map']} if self.label_names is not None: for l, label_name in enumerate(self.label_names): try: report['ap/{:s}'.format(label_name)] = result['ap'][l] except IndexError: report['ap/{:s}'.format(label_name)] = np.nan if True: print(report) observation = {} with reporter.report_scope(observation): reporter.report(report, target) return observation ================================================ FILE: utils/detectron_parser.py ================================================ import numpy as np import os path = os.path.join(os.path.dirname(__file__), '../') import sys sys.path.append(path) from mask_rcnn_resnet import MaskRCNNResNet from chainer import serializers import pickle model = MaskRCNNResNet(n_fg_class=80, roi_size=14, pretrained_model='auto', anchor_scales=[2, 4, 8, 16, 32], n_layers=50, class_ids=[[1]]) modeldir = "modelfiles" if os.path.exists(modeldir)==False: os.mkdir(modeldir) # resnet50, end-to-end, C4 d_model_file = "modelfiles/model_final.pkl" c_model_file = "modelfiles/e2e_mask_rcnn_R-50-C4_1x_d2c.npz" with open(d_model_file, 'rb') as f: d = pickle.load(f, encoding='latin-1')['blobs'] d_key = sorted(d) parsecount = 0 for bl in d_key: if 'res' in bl: stage = bl[3] # resnet stage, 2, 3, 4, 5 block = bl[5] # resnet block, a or b if stage=='_': # non-resnet layers continue else: stage = int(stage) - 1 if stage == 4: netname='head' else: netname='extractor' if 'branch2a' in bl: c_nlayer = 1 elif 'branch2b' in bl: c_nlayer = 2 elif 'branch2c' in bl: c_nlayer = 3 elif 'branch1' in bl: c_nlayer = 4 else: c_nlayer = 0 # do not copy if bl.endswith('_b') and 'bn_b' not in bl: continue if 'momentum' in bl: continue # conv / bn gamma / bn beta if '_w' in bl: c_kind = 'conv%d.W' % c_nlayer elif 'bn_s' in bl: c_kind = 'bn%d.gamma' % c_nlayer elif 'bn_b' in bl: c_kind = 'bn%d.beta' % c_nlayer # chainer block kind if block == '0': c_block = 'a' else: c_block = 'b'+block # shape checker exec("c_shape = model.%s.res%d.%s.%s.data.shape" % (netname, stage + 1, c_block, c_kind)) exec("d_shape = d['%s'].shape" % bl) if c_shape == d_shape: # execute copy txt = "model.%s.res%d.%s.%s.data = d['%s']" % (netname, stage + 1, c_block, c_kind, bl ) print(txt) exec(txt) parsecount += 1 else: print("shape mismatch error!") # copy the other layers layer_pairs = \ [('extractor.conv1.W', 'conv1_w'), ('extractor.bn1.gamma', 'res_conv1_bn_s'), ('extractor.bn1.beta', 'res_conv1_bn_b'), ('rpn.conv1.W', 'conv_rpn_w'), ('rpn.conv1.b', 'conv_rpn_b'), ('rpn.loc.W', 'rpn_bbox_pred_w'), ('rpn.loc.b', 'rpn_bbox_pred_b'), ('rpn.score.W', 'rpn_cls_logits_w'), ('rpn.score.b', 'rpn_cls_logits_b'), ('head.score.W', 'cls_score_w'), ('head.score.b', 'cls_score_b'), ('head.cls_loc.W', 'bbox_pred_w'), ('head.cls_loc.b', 'bbox_pred_b'), ('head.deconvm1.W', 'conv5_mask_w'), ('head.deconvm1.b', 'conv5_mask_b'), ('head.convm2.W', 'mask_fcn_logits_w'), ('head.convm2.b', 'mask_fcn_logits_b'), ] def xytrans(src): sh = src.shape dst = src.reshape(sh[0]//4, 4, -1)[:,[1, 0, 3, 2]].reshape(sh) return dst for layer_pair in layer_pairs: exec("c_shape = model.%s.data.shape" % layer_pair[0]) exec("d_shape = d['%s'].shape" % layer_pair[1]) if 'bbox_pred' in layer_pair[1]: d[layer_pair[1]] = xytrans(d[layer_pair[1]]) if c_shape == d_shape: txt = "model.%s.data = d['%s']" % layer_pair print(txt) exec(txt) parsecount += 1 else: print("shape mismatch error!") print(parsecount, " layers copied") serializers.save_npz(c_model_file, model) print("save weights file to a chainer model", c_model_file) ================================================ FILE: utils/eval_detection_coco.py ================================================ from __future__ import division from collections import defaultdict import itertools import numpy as np import six from chainercv.utils.bbox.bbox_iou import bbox_iou def eval_detection_coco( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults=None, iou_thresh=0.5, use_07_metric=False): """Calculate average precisions based on evaluation code of PASCAL VOC. This function evaluates predicted bounding boxes obtained from a dataset which has :math:`N` images by using average precision for each class. The code is based on the evaluation code used in PASCAL VOC Challenge. Args: pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` sets of bounding boxes. Its index corresponds to an index for the base dataset. Each element of :obj:`pred_bboxes` is a set of coordinates of bounding boxes. This is an array whose shape is :math:`(R, 4)`, where :math:`R` corresponds to the number of bounding boxes, which may vary among boxes. The second axis corresponds to :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. pred_labels (iterable of numpy.ndarray): An iterable of labels. Similar to :obj:`pred_bboxes`, its index corresponds to an index for the base dataset. Its length is :math:`N`. pred_scores (iterable of numpy.ndarray): An iterable of confidence scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, its index corresponds to an index for the base dataset. Its length is :math:`N`. gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth bounding boxes whose length is :math:`N`. An element of :obj:`gt_bboxes` is a bounding box whose shape is :math:`(R, 4)`. Note that the number of bounding boxes in each image does not need to be same as the number of corresponding predicted boxes. gt_labels (iterable of numpy.ndarray): An iterable of ground truth labels which are organized similarly to :obj:`gt_bboxes`. gt_difficults (iterable of numpy.ndarray): An iterable of boolean arrays which is organized similarly to :obj:`gt_bboxes`. This tells whether the corresponding ground truth bounding box is difficult or not. By default, this is :obj:`None`. In that case, this function considers all bounding boxes to be not difficult. iou_thresh (float): A prediction is correct if its Intersection over Union with the ground truth is above this value. use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric for calculating average precision. The default value is :obj:`False`. Returns: dict: The keys, value-types and the description of the values are listed below. * **ap** (*numpy.ndarray*): An array of average precisions. \ The :math:`l`-th value corresponds to the average precision \ for class :math:`l`. If class :math:`l` does not exist in \ either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ value is set to :obj:`numpy.nan`. * **map** (*float*): The average of Average Precisions over classes. """ prec, rec = calc_detection_coco_prec_rec( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults, iou_thresh=iou_thresh) ap = calc_detection_coco_ap(prec, rec, use_07_metric=use_07_metric) #for name, ap0 in zip(coconames, ap): # if ~(ap0==ap0): # ap0 = -1 # apresults.append([name, ap0]) #print("average precision evaluation results: ", apresults) return {'ap': ap, 'map': np.nanmean(ap)} def calc_detection_coco_prec_rec( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults=None, iou_thresh=0.5): """Calculate precision and recall based on evaluation code of PASCAL VOC. This function calculates precision and recall of predicted bounding boxes obtained from a dataset which has :math:`N` images. The code is based on the evaluation code used in PASCAL VOC Challenge. Args: pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` sets of bounding boxes. Its index corresponds to an index for the base dataset. Each element of :obj:`pred_bboxes` is a set of coordinates of bounding boxes. This is an array whose shape is :math:`(R, 4)`, where :math:`R` corresponds to the number of bounding boxes, which may vary among boxes. The second axis corresponds to :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. pred_labels (iterable of numpy.ndarray): An iterable of labels. Similar to :obj:`pred_bboxes`, its index corresponds to an index for the base dataset. Its length is :math:`N`. pred_scores (iterable of numpy.ndarray): An iterable of confidence scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, its index corresponds to an index for the base dataset. Its length is :math:`N`. gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth bounding boxes whose length is :math:`N`. An element of :obj:`gt_bboxes` is a bounding box whose shape is :math:`(R, 4)`. Note that the number of bounding boxes in each image does not need to be same as the number of corresponding predicted boxes. gt_labels (iterable of numpy.ndarray): An iterable of ground truth labels which are organized similarly to :obj:`gt_bboxes`. gt_difficults (iterable of numpy.ndarray): An iterable of boolean arrays which is organized similarly to :obj:`gt_bboxes`. This tells whether the corresponding ground truth bounding box is difficult or not. By default, this is :obj:`None`. In that case, this function considers all bounding boxes to be not difficult. iou_thresh (float): A prediction is correct if its Intersection over Union with the ground truth is above this value.. Returns: tuple of two lists: This function returns two lists: :obj:`prec` and :obj:`rec`. * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ for class :math:`l`. If class :math:`l` does not exist in \ either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ set to :obj:`None`. * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ for class :math:`l`. If class :math:`l` that is not marked as \ difficult does not exist in \ :obj:`gt_labels`, :obj:`rec[l]` is \ set to :obj:`None`. """ pred_bboxes = iter(list(pred_bboxes)) pred_labels = iter(list(pred_labels)) pred_scores = iter(list(pred_scores)) gt_bboxes = iter(list(gt_bboxes)) gt_labels = iter(list(gt_labels)) if gt_difficults is None: gt_difficults = itertools.repeat(None) else: gt_difficults = iter(gt_difficults) n_pos = defaultdict(int) score = defaultdict(list) match = defaultdict(list) for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ six.moves.zip( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults): if gt_difficult is None: gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): pred_mask_l = pred_label == l pred_bbox_l = pred_bbox[pred_mask_l] pred_score_l = pred_score[pred_mask_l] # sort by score order = pred_score_l.argsort()[::-1] pred_bbox_l = pred_bbox_l[order] pred_score_l = pred_score_l[order] gt_mask_l = gt_label == l gt_bbox_l = gt_bbox[gt_mask_l] gt_difficult_l = gt_difficult[gt_mask_l] n_pos[l] += np.logical_not(gt_difficult_l).sum() score[l].extend(pred_score_l) if len(pred_bbox_l) == 0: continue if len(gt_bbox_l) == 0: match[l].extend((0,) * pred_bbox_l.shape[0]) continue # VOC evaluation follows integer typed bounding boxes. pred_bbox_l = pred_bbox_l.copy() pred_bbox_l[:, 2:] += 1 gt_bbox_l = gt_bbox_l.copy() gt_bbox_l[:, 2:] += 1 iou = bbox_iou(pred_bbox_l, gt_bbox_l) gt_index = iou.argmax(axis=1) # set -1 if there is no matching ground truth gt_index[iou.max(axis=1) < iou_thresh] = -1 del iou selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) for gt_idx in gt_index: if gt_idx >= 0: if gt_difficult_l[gt_idx]: match[l].append(-1) else: if not selec[gt_idx]: match[l].append(1) else: match[l].append(0) selec[gt_idx] = True else: match[l].append(0) for iter_ in ( pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, gt_difficults): if next(iter_, None) is not None: raise ValueError('Length of input iterables need to be same.') n_fg_class = max(n_pos.keys()) + 1 prec = [None] * n_fg_class rec = [None] * n_fg_class for l in n_pos.keys(): score_l = np.array(score[l]) match_l = np.array(match[l], dtype=np.int8) order = score_l.argsort()[::-1] match_l = match_l[order] tp = np.cumsum(match_l == 1) fp = np.cumsum(match_l == 0) # If an element of fp + tp is 0, # the corresponding element of prec[l] is nan. prec[l] = tp / (fp + tp) # If n_pos[l] is 0, rec[l] is None. if n_pos[l] > 0: rec[l] = tp / n_pos[l] return prec, rec def calc_detection_coco_ap(prec, rec, use_07_metric=False): """Calculate average precisions based on evaluation code of PASCAL VOC. This function calculates average precisions from given precisions and recalls. The code is based on the evaluation code used in PASCAL VOC Challenge. Args: prec (list of numpy.array): A list of arrays. :obj:`prec[l]` indicates precision for class :math:`l`. If :obj:`prec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. rec (list of numpy.array): A list of arrays. :obj:`rec[l]` indicates recall for class :math:`l`. If :obj:`rec[l]` is :obj:`None`, this function returns :obj:`numpy.nan` for class :math:`l`. use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric for calculating average precision. The default value is :obj:`False`. Returns: ~numpy.ndarray: This function returns an array of average precisions. The :math:`l`-th value corresponds to the average precision for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. """ n_fg_class = len(prec) ap = np.empty(n_fg_class) for l in six.moves.range(n_fg_class): if prec[l] is None or rec[l] is None: ap[l] = np.nan continue if use_07_metric: # 11 point metric ap[l] = 0 for t in np.arange(0., 1.1, 0.1): if np.sum(rec[l] >= t) == 0: p = 0 else: p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) ap[l] += p / 11 else: # correct AP calculation # first append sentinel values at the end mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) mrec = np.concatenate(([0], rec[l], [1])) mpre = np.maximum.accumulate(mpre[::-1])[::-1] # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap ================================================ FILE: utils/makecocolist.py ================================================ import glob fnames = glob.glob('COCO/train2017/*.jpg') with open("COCO/train2017.txt", "w") as f: for fname in fnames: f.write(fname.split('/')[-1].split('.')[0]+'\n') f.close() fnames = glob.glob('COCO/val2017/*.jpg') with open("COCO/val2017.txt", "w") as f: for i, fname in enumerate(fnames): f.write(fname.split('/')[-1].split('.')[0]+'\n') if i > 1000: break f.close() ================================================ FILE: utils/proposal_target_creator.py ================================================ import numpy as np from chainer import cuda from chainercv.links.model.faster_rcnn.utils.bbox2loc import bbox2loc from chainercv.utils.bbox.bbox_iou import bbox_iou import cv2 class ProposalTargetCreator(object): """Assign ground truth bounding boxes to given RoIs. The :meth:`__call__` of this class generates training targets for each object proposal. This is used to train Faster RCNN [#]_. .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ Faster R-CNN: Towards Real-Time Object Detection with \ Region Proposal Networks. NIPS 2015. Args: n_sample (int): The number of sampled regions. pos_ratio (float): Fraction of regions that is labeled as a foreground. pos_iou_thresh (float): IoU threshold for a RoI to be considered as a foreground. neg_iou_thresh_hi (float): RoI is considered to be the background if IoU is in [:obj:`neg_iou_thresh_hi`, :obj:`neg_iou_thresh_hi`). neg_iou_thresh_lo (float): See above. """ def __init__(self, n_sample=128, pos_ratio=0.25, pos_iou_thresh=0.5, neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0, roi_size=7 ): self.roi_size=roi_size self.n_sample = n_sample self.pos_ratio = pos_ratio self.pos_iou_thresh = pos_iou_thresh self.neg_iou_thresh_hi = neg_iou_thresh_hi self.neg_iou_thresh_lo = neg_iou_thresh_lo def __call__(self, roi, bbox, label, mask, loc_normalize_mean=(0., 0., 0., 0.), loc_normalize_std=(0.1, 0.1, 0.2, 0.2)): """Assigns ground truth to sampled proposals. This function samples total of :obj:`self.n_sample` RoIs from the combination of :obj:`roi` and :obj:`bbox`. The RoIs are assigned with the ground truth class labels as well as bounding box offsets and scales to match the ground truth bounding boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are sampled as foregrounds. Offsets and scales of bounding boxes are calculated using :func:`chainercv.links.model.faster_rcnn.bbox2loc`. Also, types of input arrays and output arrays are same. Here are notations. * :math:`S` is the total number of sampled RoIs, which equals \ :obj:`self.n_sample`. * :math:`L` is number of object classes possibly including the \ background. Args: roi (array): Region of Interests (RoIs) from which we sample. Its shape is :math:`(R, 4)` bbox (array): The coordinates of ground truth bounding boxes. Its shape is :math:`(R', 4)`. label (array): Ground truth bounding box labels. Its shape is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where :math:`L` is the number of foreground classes. loc_normalize_mean (tuple of four floats): Mean values to normalize coordinates of bouding boxes. loc_normalize_std (tupler of four floats): Standard deviation of the coordinates of bounding boxes. Returns: (array, array, array): * **sample_roi**: Regions of interests that are sampled. \ Its shape is :math:`(S, 4)`. * **gt_roi_loc**: Offsets and scales to match \ the sampled RoIs to the ground truth bounding boxes. \ Its shape is :math:`(S, 4)`. * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \ :math:`(S,)`. Its range is :math:`[0, L]`. The label with \ value 0 is the background. """ xp = cuda.get_array_module(roi) roi = cuda.to_cpu(roi) bbox = cuda.to_cpu(bbox) label = cuda.to_cpu(label) mask = cuda.to_cpu(mask) n_bbox, _ = bbox.shape roi = np.concatenate((roi, bbox), axis=0) pos_roi_per_image = np.round(self.n_sample * self.pos_ratio) iou = bbox_iou(roi, bbox) gt_assignment = iou.argmax(axis=1) max_iou = iou.max(axis=1) # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class]. # The label with value 0 is the background. gt_roi_label = label[gt_assignment] + 1 # Select foreground RoIs as those with >= pos_iou_thresh IoU. pos_index = np.where(max_iou >= self.pos_iou_thresh)[0] pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size)) if pos_index.size > 0: pos_index = np.random.choice( pos_index, size=pos_roi_per_this_image, replace=False) # Select background RoIs as those within # [neg_iou_thresh_lo, neg_iou_thresh_hi). neg_index = np.where((max_iou < self.neg_iou_thresh_hi) & (max_iou >= self.neg_iou_thresh_lo))[0] neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size)) if neg_index.size > 0: neg_index = np.random.choice( neg_index, size=neg_roi_per_this_image, replace=False) # The indices that we're selecting (both positive and negative). keep_index = np.append(pos_index, neg_index) gt_roi_label = gt_roi_label[keep_index] gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0 sample_roi = roi[keep_index]# sampled <- proposed # Compute offsets and scales to match sampled RoIs to the GTs. gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]]) gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32) ) / np.array(loc_normalize_std, np.float32)) # Prepare groundtruth masks gt_roi_mask=[] _, h, w = mask.shape for i , idx in enumerate(gt_assignment[pos_index]): A=mask[idx, np.max((int(sample_roi[i,0]),0)):np.min((int(sample_roi[i,2]),h)), np.max((int(sample_roi[i,1]),0)):np.min((int(sample_roi[i,3]),w))] gt_roi_mask.append(cv2.resize(A, (self.roi_size*2,self.roi_size*2))) #debug: visualize masks #cv2.imwrite("gt_assignment_mask.png",mask[0,np.max((int(sample_roi[0,0]),0)):np.min((int(sample_roi[0,2]),h)), np.max((int(sample_roi[0,1]),0)):np.min((int(sample_roi[0,3]),w))]*255) #cv2.imwrite("gt_roi_mask.png",gt_roi_mask[0]*244)# if xp != np: sample_roi = cuda.to_gpu(sample_roi) gt_roi_loc = cuda.to_gpu(gt_roi_loc) gt_roi_label = cuda.to_gpu(gt_roi_label) gt_roi_mask = cuda.to_gpu(np.stack(gt_roi_mask).astype(np.int32)) else: gt_roi_mask = np.stack(gt_roi_mask).astype(np.int32) return sample_roi, gt_roi_loc, gt_roi_label, gt_roi_mask ================================================ FILE: utils/region_proposal_network.py ================================================ import numpy as np import chainer from chainer import cuda import chainer.functions as F import chainer.links as L from chainercv.links.model.faster_rcnn.utils.generate_anchor_base import \ generate_anchor_base from chainercv.links.model.faster_rcnn.utils.proposal_creator import \ ProposalCreator class RegionProposalNetwork(chainer.Chain): """Region Proposal Network introduced in Faster R-CNN. This is Region Proposal Network introduced in Faster R-CNN [#]_. This takes features extracted from images and propose class agnostic bounding boxes around "objects". .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ Faster R-CNN: Towards Real-Time Object Detection with \ Region Proposal Networks. NIPS 2015. Args: in_channels (int): The channel size of input. mid_channels (int): The channel size of the intermediate tensor. ratios (list of floats): This is ratios of width to height of the anchors. anchor_scales (list of numbers): This is areas of anchors. Those areas will be the product of the square of an element in :obj:`anchor_scales` and the original area of the reference window. feat_stride (int): Stride size after extracting features from an image. initialW (callable): Initial weight value. If :obj:`None` then this function uses Gaussian distribution scaled by 0.1 to initialize weight. May also be a callable that takes an array and edits its values. proposal_creator_params (dict): Key valued paramters for :class:`~chainercv.links.model.faster_rcnn.ProposalCreator`. .. seealso:: :class:`~chainercv.links.model.faster_rcnn.ProposalCreator` """ def __init__( self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32], feat_stride=16, initialW=None, proposal_creator_params={}, ): self.anchor_base = generate_anchor_base( anchor_scales=anchor_scales, ratios=ratios) self.feat_stride = feat_stride self.proposal_layer = ProposalCreator(**proposal_creator_params) n_anchor = self.anchor_base.shape[0] super(RegionProposalNetwork, self).__init__() with self.init_scope(): self.conv1 = L.Convolution2D( in_channels, mid_channels, 3, 1, 1, initialW=initialW) self.score = L.Convolution2D( mid_channels, n_anchor * 1, 1, 1, 0, initialW=initialW) self.loc = L.Convolution2D( mid_channels, n_anchor * 4, 1, 1, 0, initialW=initialW) def __call__(self, x, img_size, scale=1.): """Forward Region Proposal Network. Here are notations. * :math:`N` is batch size. * :math:`C` channel size of the input. * :math:`H` and :math:`W` are height and witdh of the input feature. * :math:`A` is number of anchors assigned to each pixel. Args: x (~chainer.Variable): The Features extracted from images. Its shape is :math:`(N, C, H, W)`. img_size (tuple of ints): A tuple :obj:`height, width`, which contains image size after scaling. scale (float): The amount of scaling done to the input images after reading them from files. Returns: (~chainer.Variable, ~chainer.Variable, array, array, array): This is a tuple of five following values. * **rpn_locs**: Predicted bounding box offsets and scales for \ anchors. Its shape is :math:`(N, H W A, 4)`. * **rpn_scores**: Predicted foreground scores for \ anchors. Its shape is :math:`(N, H W A, 2)`. * **rois**: A bounding box array containing coordinates of \ proposal boxes. This is a concatenation of bounding box \ arrays from multiple images in the batch. \ Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \ bounding boxes from the :math:`i` th image, \ :math:`R' = \\sum _{i=1} ^ N R_i`. * **roi_indices**: An array containing indices of images to \ which RoIs correspond to. Its shape is :math:`(R',)`. * **anchor**: Coordinates of enumerated shifted anchors. \ Its shape is :math:`(H W A, 4)`. """ n, _, hh, ww = x.shape anchor = _enumerate_shifted_anchor( self.xp.array(self.anchor_base), self.feat_stride, hh, ww) n_anchor = anchor.shape[0] // (hh * ww) h = F.relu(self.conv1(x)) rpn_locs = self.loc(h) rpn_scores = self.score(h) rpn_locs = rpn_locs.transpose((0, 2, 3, 1)).reshape((n, -1, 4)) rpn_scores = rpn_scores.transpose((0, 2, 3, 1)) rpn_fg_scores =\ rpn_scores.reshape((n, hh, ww, n_anchor))[:, :, :, :] # modified from chainercv rpn_fg_scores = rpn_fg_scores.reshape((n, -1)) rpn_scores = rpn_scores.reshape((n, -1)) # modified from chainercv rois = [] roi_indices = [] for i in range(n): roi = self.proposal_layer( rpn_locs[i].array, rpn_fg_scores[i].array, anchor, img_size, scale=scale) batch_index = i * self.xp.ones((len(roi),), dtype=np.int32) rois.append(roi) roi_indices.append(batch_index) rois = self.xp.concatenate(rois, axis=0) roi_indices = self.xp.concatenate(roi_indices, axis=0) return rpn_locs, rpn_scores, rois, roi_indices, anchor def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width): # Enumerate all shifted anchors: # # add A anchors (1, A, 4) to # cell K shifts (K, 1, 4) to get # shift anchors (K, A, 4) # reshape to (K*A, 4) shifted anchors xp = cuda.get_array_module(anchor_base) shift_y = xp.arange(0, height * feat_stride, feat_stride) shift_x = xp.arange(0, width * feat_stride, feat_stride) shift_x, shift_y = xp.meshgrid(shift_x, shift_y) shift = xp.stack((shift_y.ravel(), shift_x.ravel(), shift_y.ravel(), shift_x.ravel()), axis=1) A = anchor_base.shape[0] K = shift.shape[0] anchor = anchor_base.reshape((1, A, 4)) + \ shift.reshape((1, K, 4)).transpose((1, 0, 2)) anchor = anchor.reshape((K * A, 4)).astype(np.float32) return anchor ================================================ FILE: utils/roi_align_2d.py ================================================ # Modified work as ROIAlign: # ----------------------------------------------------------------------------- # Copyright (c) 2018 DeNA # ----------------------------------------------------------------------------- # Modified work: # ----------------------------------------------------------------------------- # Copyright (c) 2015 Preferred Infrastructure, Inc. # Copyright (c) 2015 Preferred Networks, Inc. # ----------------------------------------------------------------------------- # Original work of forward_gpu and backward_gpu: # ----------------------------------------------------------------------------- # Fast R-CNN # Copyright (c) 2015 Microsoft # Licensed under The MIT License [see fast-rcnn/LICENSE for details] # Written by Ross Girshick # ----------------------------------------------------------------------------- import numpy import six from chainer import cuda from chainer import function from chainer.utils import type_check class ROIAlign2D(function.Function): """RoI align over a set of 2d planes.""" def __init__(self, outh, outw, spatial_scale): self.outh, self.outw = outh, outw self.spatial_scale = spatial_scale def check_type_forward(self, in_types): type_check.expect(in_types.size() == 2) x_type, roi_type = in_types type_check.expect( x_type.dtype == numpy.float32, x_type.ndim == 4, roi_type.dtype == numpy.float32, roi_type.ndim == 2, roi_type.shape[1] == 5, ) def forward_gpu(self, inputs): self.retain_inputs((1,)) self._bottom_data_shape = inputs[0].shape bottom_data, bottom_rois = inputs #e.g. (batch, channel, h, w)=(1, 512, 38, 53) (n_rois, )=(128, 5) channels, height, width = bottom_data.shape[1:] n_rois = bottom_rois.shape[0] top_data = cuda.cupy.empty((n_rois, channels, self.outh, self.outw), dtype=numpy.float32) cuda.cupy.ElementwiseKernel( ''' raw float32 bottom_data, float32 spatial_scale, int32 channels, int32 height, int32 width, int32 pooled_height, int32 pooled_width, raw float32 bottom_rois ''', 'float32 top_data', ''' // pos in output filter int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / pooled_width / pooled_height) % channels; int num = i / pooled_width / pooled_height / channels; // scale the ROI coordinates (1/16) float roi_batch_ind = bottom_rois[num * 5 + 0]; float roi_start_w = bottom_rois[num * 5 + 1] * spatial_scale; float roi_start_h = bottom_rois[num * 5 + 2] * spatial_scale; float roi_end_w = bottom_rois[num * 5 + 3] * spatial_scale; float roi_end_h = bottom_rois[num * 5 + 4] * spatial_scale; // Force malformed ROIs to be 1x1 float roi_width = max(roi_end_w - roi_start_w, 1.0); float roi_height = max(roi_end_h - roi_start_h, 1.0); // float bin size float bin_size_h = roi_height / static_cast(pooled_height); float bin_size_w = roi_width / static_cast(pooled_width); float maxval = 0; int maxidx = -1; for (int j = 0; j < 4; j++) { int ih = j / 2; int iw = j % 2; float val = 0; // ROIAlign using the center of the bin float fh = roi_start_h + (static_cast(ph) + 0.25 + static_cast(ih) * 0.5f) * bin_size_h; float fw = roi_start_w + (static_cast(pw) + 0.25 + static_cast(iw) * 0.5f) * bin_size_w; if (fh < -1.0 || fh > height || fw < -1.0 || fw > width) { continue; } int hstart = static_cast(floor(fh)); int wstart = static_cast(floor(fw)); int hend = hstart + 1; int wend = wstart + 1; if (hstart >= height - 1) { hend = hstart = height - 1; fh = static_cast(hstart); } else { hend = hstart + 1; } if (wstart >= width - 1) { wend = wstart = width - 1; fw = static_cast(wstart); } else { wend = wstart + 1; } float dh = fh - static_cast(hstart); float dw = fw - static_cast(wstart); //compute the max value in the bin int data_offset = (roi_batch_ind * channels + c) * height * width; val += (1.0 - dh) * (1.0 - dw) * bottom_data[data_offset + hstart * width + wstart]; val += (1.0 - dh) * dw * bottom_data[data_offset + hstart * width + wend]; val += dh * (1.0 - dw) * bottom_data[data_offset + hend * width + wstart]; val += dh * dw * bottom_data[data_offset + hend * width + wend]; maxval += val; } top_data = maxval / 4; ''', 'roi_pooling_2d_fwd' )(bottom_data, self.spatial_scale, channels, height, width, self.outh, self.outw, bottom_rois, top_data) return top_data, def backward_gpu(self, inputs, gy): bottom_rois = inputs[1] channels, height, width = self._bottom_data_shape[1:] bottom_diff = cuda.cupy.zeros(self._bottom_data_shape, numpy.float32) cuda.cupy.ElementwiseKernel( ''' raw float32 top_diff, int32 num_rois, float32 spatial_scale, int32 channels, int32 height, int32 width, int32 pooled_height, int32 pooled_width, raw float32 bottom_rois ''', 'raw float32 bottom_diff', ''' // pos in output filter int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; int c = (i / pooled_width / pooled_height) % channels; int num = i / pooled_width / pooled_height / channels; // scale the ROI coordinates (1/16) float roi_batch_ind = bottom_rois[num * 5 + 0]; float roi_start_w = bottom_rois[num * 5 + 1] * spatial_scale; float roi_start_h = bottom_rois[num * 5 + 2] * spatial_scale; float roi_end_w = bottom_rois[num * 5 + 3] * spatial_scale; float roi_end_h = bottom_rois[num * 5 + 4] * spatial_scale; // Force malformed ROIs to be 1x1 float roi_width = max(roi_end_w - roi_start_w, 1.0); float roi_height = max(roi_end_h - roi_start_h, 1.0); // float bin size float bin_size_h = roi_height / static_cast(pooled_height); float bin_size_w = roi_width / static_cast(pooled_width); int data_offset = (roi_batch_ind * channels + c) * height * width; for (int j = 0; j < 4; j++) { int ih = j / 2; int iw = j % 2; // ROIAlign using the center of the bin float fh = roi_start_h + (static_cast(ph) + 0.25 + static_cast(ih) * 0.5f) * bin_size_h; float fw = roi_start_w + (static_cast(pw) + 0.25 + static_cast(iw) * 0.5f) * bin_size_w; if (fh < -1.0 || fh > height || fw < -1.0 || fw > width) { continue; } int hstart = static_cast(floor(fh)); int wstart = static_cast(floor(fw)); int hend = hstart + 1; int wend = wstart + 1; if (hstart >= height - 1) { hend = hstart = height - 1; fh = static_cast(hstart); } else { hend = hstart + 1; } if (wstart >= width - 1) { wend = wstart = width - 1; fw = static_cast(wstart); } else { wend = wstart + 1; } float dh = fh - static_cast(hstart); float dw = fw - static_cast(wstart); //atomic add: pointer, value atomicAdd(&bottom_diff[data_offset + hstart * width + wstart], top_diff[i] * (1.0 - dh) * (1.0 - dw) / 4); atomicAdd(&bottom_diff[data_offset + hstart * width + wend], top_diff[i] * (1.0 - dh) * dw / 4); atomicAdd(&bottom_diff[data_offset + hend * width + wstart], top_diff[i] * dh * (1.0 - dw) / 4); atomicAdd(&bottom_diff[data_offset + hend * width + wend], top_diff[i] * dh * dw / 4); } ''', 'roi_pooling_2d_bwd' )(gy[0], bottom_rois.shape[0], self.spatial_scale, channels, height, width, self.outh, self.outw, bottom_rois, bottom_diff, size=gy[0].size) return bottom_diff, None def roi_align_2d(x, rois, outh, outw, spatial_scale): """Spatial Region of Interest (ROI) align function. This function acts similarly to :class:`~functions.MaxPooling2D`, but it computes the maximum of input spatial patch for each channel with the region of interest. Args: x (~chainer.Variable): Input variable. The shape is expected to be 4 dimentional: (n: batch, c: channel, h, height, w: width). rois (~chainer.Variable): Input roi variable. The shape is expected to be (n: data size, 5), and each datum is set as below: (batch_index, x_min, y_min, x_max, y_max). outh (int): Height of output image after pooled. outw (int): Width of output image after pooled. spatial_scale (float): Scale of the roi is resized. Returns: ~chainer.Variable: Output variable. See the original paper proposing ROIPooling: `Fast R-CNN `_. """ return ROIAlign2D(outh, outw, spatial_scale)(x, rois) ================================================ FILE: utils/updater.py ================================================ import copy import six from chainer.dataset import convert from chainer.dataset import iterator as iterator_module from chainer import function, variable from chainer.training.updater import StandardUpdater from chainer import reporter from chainer import cuda class SubDivisionUpdater(StandardUpdater): def __init__(self, iterator, optimizer, converter=convert.concat_examples, subdivisions=1, device=None, loss_func=None): super(SubDivisionUpdater, self).__init__( iterator=iterator, optimizer=optimizer, converter=converter, device=device, loss_func=loss_func, ) self._batchsize = self._iterators['main'].batch_size self._subdivisions = subdivisions self._n = int(self._batchsize / self._subdivisions) assert self._batchsize % self._subdivisions == 0, (self._batchsize, self._subdivisions) def update_core(self): batch = self._iterators['main'].next() #print(self._n) in_arrays_list = [] for i in range(self._subdivisions): in_arrays_list.append(self.converter(batch[i::self._subdivisions], self.device)) #in_arrays_list.append(self.converter(batch, self.device)) optimizer = self._optimizers['main'] loss_func = self.loss_func or optimizer.target loss_func.cleargrads() losses=[] for i, in_arrays in enumerate(in_arrays_list): if isinstance(in_arrays, tuple): in_vars = list(variable.Variable(x) for x in in_arrays) loss = loss_func(*in_vars) elif isinstance(in_arrays, dict): in_vars = {key: variable.Variable(x) for key, x in six.iteritems(in_arrays)} loss = loss_func(in_vars) else: print(type(in_arrays)) loss.backward() #loss = {k: cuda.to_cpu(v.data) for k, v in loss.items()} # for logging loss = cuda.to_cpu(loss.data) losses.append(loss) optimizer.update() # minibatch average if isinstance(loss, dict): avg_loss = {k: 0. for k in losses[0].keys()} for loss in losses: for k, v in loss.items(): avg_loss[k] += v #avg_loss = {k: v / float(self._batchsize) for k, v in avg_loss.items()} avg_loss = {k: v / float(len(losses)) for k, v in avg_loss.items()} #avg_loss = {k: v for k, v in avg_loss.items()} # report all the loss values for k, v in avg_loss.items(): reporter.report({k: v}, loss_func) reporter.report({'loss': sum(list(avg_loss.values()))}, loss_func) else: avg_loss = 0. for loss in losses: avg_loss += loss avg_loss /= float(self._subdivisions) reporter.report({'loss': avg_loss}, loss_func) ================================================ FILE: utils/vis_bbox.py ================================================ from chainercv.visualizations.vis_image import vis_image import numpy as np from skimage.measure import find_contours from matplotlib.patches import Polygon import cv2 def vis_bbox(img, bbox, label=None, score=None, mask=None, label_names=None, ax=None, contour=False, labeldisplay=True): """Visualize bounding boxes inside image. Example: >>> from chainercv.datasets import VOCDetectionDataset >>> from chainercv.datasets import voc_bbox_label_names >>> from chainercv.visualizations import vis_bbox >>> import matplotlib.pyplot as plot >>> dataset = VOCDetectionDataset() >>> img, bbox, label = dataset[60] >>> vis_bbox(img, bbox, label, ... label_names=voc_bbox_label_names) >>> plot.show() Args: img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. This is in RGB format and the range of its value is :math:`[0, 255]`. bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in the image. Each element is organized by :obj:`(y_min, x_min, y_max, x_max)` in the second axis. label (~numpy.ndarray): An integer array of shape :math:`(R,)`. The values correspond to id for label names stored in :obj:`label_names`. This is optional. score (~numpy.ndarray): A float array of shape :math:`(R,)`. Each value indicates how confident the prediction is. This is optional. label_names (iterable of strings): Name of labels ordered according to label ids. If this is :obj:`None`, labels will be skipped. ax (matplotlib.axes.Axis): The visualization is displayed on this axis. If this is :obj:`None` (default), a new axis is created. Returns: ~matploblib.axes.Axes: Returns the Axes object with the plot for further tweaking. """ from matplotlib import pyplot as plot if label is not None and not len(bbox) == len(label): raise ValueError('The length of label must be same as that of bbox') if score is not None and not len(bbox) == len(score): raise ValueError('The length of score must be same as that of bbox') # alpha-blend the masks COLOR=[(1,1,0), (1,0,1),(0,1,1),(0,0,1),(0,1,0), (1,0,0),(0.1,1,0.2)] dst = img.astype(float) for i, m in enumerate(mask): alpha = np.tile(np.round(m), (3, 1, 1)).astype(float) * 0.4 src1 = np.ones(dst.shape).astype(float) for j, col in enumerate(COLOR[i%len(COLOR)]): src1[j] *= col * 255 dst = cv2.multiply(src1, alpha) + cv2.multiply(dst, 1 - alpha) # Returns newly instantiated matplotlib.axes.Axes object if ax is None ax = vis_image(dst, ax=ax) # If there is no bounding box to display, visualize the image and exit. if len(bbox) == 0: return ax # add boxes, contours and labels for i, bb in enumerate(bbox): # boxes xy = (bb[1], bb[0]) height = int(bb[2]) - int(bb[0]) width = int(bb[3]) - int(bb[1]) ax.add_patch(plot.Rectangle( xy, width, height, fill=False, edgecolor='red', linewidth=1)) # contours if contour: Mcontours = find_contours(mask[i].T, 0.5) for verts in Mcontours: p = Polygon(verts, facecolor="none", edgecolor=[0.5,0.5,0.5]) ax.add_patch(p) #labels caption = list() if label is not None and label_names is not None: lb = label[i] print(lb) if not (0 <= lb < len(label_names)): raise ValueError('No corresponding name is given') caption.append(label_names[lb]) if score is not None: sc = score[i] caption.append('{:.2f}'.format(sc)) if len(caption) > 0 and labeldisplay: ax.text(bb[1], bb[0], ': '.join(caption), style='italic', fontsize=8, color='white' )#'facecolor': 'white', 'alpha': 0.7, 'pad': 10}) return ax